This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2003, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg) STMT_START {                                          \
260     char *ellipses = "";                                                \
261     IV len = RExC_end - RExC_precomp;                                   \
262                                                                         \
263     if (!SIZE_ONLY)                                                     \
264         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
265     if (len > RegexLengthToShowInErrorMessages) {                       \
266         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
267         len = RegexLengthToShowInErrorMessages - 10;                    \
268         ellipses = "...";                                               \
269     }                                                                   \
270     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
271             msg, (int)len, RExC_precomp, ellipses);                     \
272 } STMT_END
273
274 /*
275  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276  * args. Show regex, up to a maximum length. If it's too long, chop and add
277  * "...".
278  */
279 #define FAIL2(pat,msg) STMT_START {                                     \
280     char *ellipses = "";                                                \
281     IV len = RExC_end - RExC_precomp;                                   \
282                                                                         \
283     if (!SIZE_ONLY)                                                     \
284         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
285     if (len > RegexLengthToShowInErrorMessages) {                       \
286         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
287         len = RegexLengthToShowInErrorMessages - 10;                    \
288         ellipses = "...";                                               \
289     }                                                                   \
290     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
291             msg, (int)len, RExC_precomp, ellipses);                     \
292 } STMT_END
293
294
295 /*
296  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297  */
298 #define Simple_vFAIL(m) STMT_START {                                    \
299     IV offset = RExC_parse - RExC_precomp;                              \
300     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
301             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
302 } STMT_END
303
304 /*
305  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306  */
307 #define vFAIL(m) STMT_START {                           \
308     if (!SIZE_ONLY)                                     \
309         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
310     Simple_vFAIL(m);                                    \
311 } STMT_END
312
313 /*
314  * Like Simple_vFAIL(), but accepts two arguments.
315  */
316 #define Simple_vFAIL2(m,a1) STMT_START {                        \
317     IV offset = RExC_parse - RExC_precomp;                      \
318     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
319             (int)offset, RExC_precomp, RExC_precomp + offset);  \
320 } STMT_END
321
322 /*
323  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324  */
325 #define vFAIL2(m,a1) STMT_START {                       \
326     if (!SIZE_ONLY)                                     \
327         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
328     Simple_vFAIL2(m, a1);                               \
329 } STMT_END
330
331
332 /*
333  * Like Simple_vFAIL(), but accepts three arguments.
334  */
335 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
336     IV offset = RExC_parse - RExC_precomp;                      \
337     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
338             (int)offset, RExC_precomp, RExC_precomp + offset);  \
339 } STMT_END
340
341 /*
342  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343  */
344 #define vFAIL3(m,a1,a2) STMT_START {                    \
345     if (!SIZE_ONLY)                                     \
346         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
347     Simple_vFAIL3(m, a1, a2);                           \
348 } STMT_END
349
350 /*
351  * Like Simple_vFAIL(), but accepts four arguments.
352  */
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
354     IV offset = RExC_parse - RExC_precomp;                      \
355     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
356             (int)offset, RExC_precomp, RExC_precomp + offset);  \
357 } STMT_END
358
359 /*
360  * Like Simple_vFAIL(), but accepts five arguments.
361  */
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
363     IV offset = RExC_parse - RExC_precomp;                      \
364     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
365             (int)offset, RExC_precomp, RExC_precomp + offset);  \
366 } STMT_END
367
368
369 #define vWARN(loc,m) STMT_START {                                       \
370     IV offset = loc - RExC_precomp;                                     \
371     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
372             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
373 } STMT_END
374
375 #define vWARNdep(loc,m) STMT_START {                                    \
376     IV offset = loc - RExC_precomp;                                     \
377     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
378             "%s" REPORT_LOCATION,                                       \
379             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
380 } STMT_END
381
382
383 #define vWARN2(loc, m, a1) STMT_START {                                 \
384     IV offset = loc - RExC_precomp;                                     \
385     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
386             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
387 } STMT_END
388
389 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
390     IV offset = loc - RExC_precomp;                                     \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
393 } STMT_END
394
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
396     IV offset = loc - RExC_precomp;                                     \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399 } STMT_END
400
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
402     IV offset = loc - RExC_precomp;                                     \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START {                  \
410     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411 } STMT_END
412
413 /* Macros for recording node offsets.   20001227 mjd@plover.com 
414  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
415  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
416  * Element 0 holds the number n.
417  */
418
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
424     if (! SIZE_ONLY) {                                                  \
425         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
426                 __LINE__, (node), (byte)));                             \
427         if((node) < 0) {                                                \
428             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429         } else {                                                        \
430             RExC_offsets[2*(node)-1] = (byte);                          \
431         }                                                               \
432     }                                                                   \
433 } STMT_END
434
435 #define Set_Node_Offset(node,byte) \
436     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
440     if (! SIZE_ONLY) {                                                  \
441         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
442                 __LINE__, (node), (len)));                              \
443         if((node) < 0) {                                                \
444             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445         } else {                                                        \
446             RExC_offsets[2*(node)] = (len);                             \
447         }                                                               \
448     }                                                                   \
449 } STMT_END
450
451 #define Set_Node_Length(node,len) \
452     Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455     Set_Node_Length(node, RExC_parse - parse_start)
456
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461 static void clear_re(pTHX_ void *r);
462
463 /* Mark that we cannot extend a found fixed substring at this point.
464    Updata the longest found anchored substring and the longest found
465    floating substrings if needed. */
466
467 STATIC void
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469 {
470     STRLEN l = CHR_SVLEN(data->last_found);
471     STRLEN old_l = CHR_SVLEN(*data->longest);
472
473     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474         sv_setsv(*data->longest, data->last_found);
475         if (*data->longest == data->longest_fixed) {
476             data->offset_fixed = l ? data->last_start_min : data->pos_min;
477             if (data->flags & SF_BEFORE_EOL)
478                 data->flags
479                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480             else
481                 data->flags &= ~SF_FIX_BEFORE_EOL;
482         }
483         else {
484             data->offset_float_min = l ? data->last_start_min : data->pos_min;
485             data->offset_float_max = (l
486                                       ? data->last_start_max
487                                       : data->pos_min + data->pos_delta);
488             if ((U32)data->offset_float_max > (U32)I32_MAX)
489                 data->offset_float_max = I32_MAX;
490             if (data->flags & SF_BEFORE_EOL)
491                 data->flags
492                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493             else
494                 data->flags &= ~SF_FL_BEFORE_EOL;
495         }
496     }
497     SvCUR_set(data->last_found, 0);
498     {
499         SV * sv = data->last_found;
500         MAGIC *mg =
501             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502         if (mg && mg->mg_len > 0)
503             mg->mg_len = 0;
504     }
505     data->last_end = -1;
506     data->flags &= ~SF_BEFORE_EOL;
507 }
508
509 /* Can match anything (initialization) */
510 STATIC void
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
512 {
513     ANYOF_CLASS_ZERO(cl);
514     ANYOF_BITMAP_SETALL(cl);
515     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
516     if (LOC)
517         cl->flags |= ANYOF_LOCALE;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC int
522 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
523 {
524     int value;
525
526     for (value = 0; value <= ANYOF_MAX; value += 2)
527         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528             return 1;
529     if (!(cl->flags & ANYOF_UNICODE_ALL))
530         return 0;
531     if (!ANYOF_BITMAP_TESTALLSET(cl))
532         return 0;
533     return 1;
534 }
535
536 /* Can match anything (initialization) */
537 STATIC void
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
539 {
540     Zero(cl, 1, struct regnode_charclass_class);
541     cl->type = ANYOF;
542     cl_anything(pRExC_state, cl);
543 }
544
545 STATIC void
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
547 {
548     Zero(cl, 1, struct regnode_charclass_class);
549     cl->type = ANYOF;
550     cl_anything(pRExC_state, cl);
551     if (LOC)
552         cl->flags |= ANYOF_LOCALE;
553 }
554
555 /* 'And' a given class with another one.  Can create false positives */
556 /* We assume that cl is not inverted */
557 STATIC void
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559          struct regnode_charclass_class *and_with)
560 {
561     if (!(and_with->flags & ANYOF_CLASS)
562         && !(cl->flags & ANYOF_CLASS)
563         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564         && !(and_with->flags & ANYOF_FOLD)
565         && !(cl->flags & ANYOF_FOLD)) {
566         int i;
567
568         if (and_with->flags & ANYOF_INVERT)
569             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570                 cl->bitmap[i] &= ~and_with->bitmap[i];
571         else
572             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573                 cl->bitmap[i] &= and_with->bitmap[i];
574     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575     if (!(and_with->flags & ANYOF_EOS))
576         cl->flags &= ~ANYOF_EOS;
577
578     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
579         cl->flags &= ~ANYOF_UNICODE_ALL;
580         cl->flags |= ANYOF_UNICODE;
581         ARG_SET(cl, ARG(and_with));
582     }
583     if (!(and_with->flags & ANYOF_UNICODE_ALL))
584         cl->flags &= ~ANYOF_UNICODE_ALL;
585     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
586         cl->flags &= ~ANYOF_UNICODE;
587 }
588
589 /* 'OR' a given class with another one.  Can create false positives */
590 /* We assume that cl is not inverted */
591 STATIC void
592 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
593 {
594     if (or_with->flags & ANYOF_INVERT) {
595         /* We do not use
596          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
597          *   <= (B1 | !B2) | (CL1 | !CL2)
598          * which is wasteful if CL2 is small, but we ignore CL2:
599          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
600          * XXXX Can we handle case-fold?  Unclear:
601          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
602          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
603          */
604         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605              && !(or_with->flags & ANYOF_FOLD)
606              && !(cl->flags & ANYOF_FOLD) ) {
607             int i;
608
609             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
610                 cl->bitmap[i] |= ~or_with->bitmap[i];
611         } /* XXXX: logic is complicated otherwise */
612         else {
613             cl_anything(pRExC_state, cl);
614         }
615     } else {
616         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
617         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618              && (!(or_with->flags & ANYOF_FOLD)
619                  || (cl->flags & ANYOF_FOLD)) ) {
620             int i;
621
622             /* OR char bitmap and class bitmap separately */
623             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624                 cl->bitmap[i] |= or_with->bitmap[i];
625             if (or_with->flags & ANYOF_CLASS) {
626                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
627                     cl->classflags[i] |= or_with->classflags[i];
628                 cl->flags |= ANYOF_CLASS;
629             }
630         }
631         else { /* XXXX: logic is complicated, leave it along for a moment. */
632             cl_anything(pRExC_state, cl);
633         }
634     }
635     if (or_with->flags & ANYOF_EOS)
636         cl->flags |= ANYOF_EOS;
637
638     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
639         ARG(cl) != ARG(or_with)) {
640         cl->flags |= ANYOF_UNICODE_ALL;
641         cl->flags &= ~ANYOF_UNICODE;
642     }
643     if (or_with->flags & ANYOF_UNICODE_ALL) {
644         cl->flags |= ANYOF_UNICODE_ALL;
645         cl->flags &= ~ANYOF_UNICODE;
646     }
647 }
648
649 /*
650  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
651  * These need to be revisited when a newer toolchain becomes available.
652  */
653 #if defined(__sparc64__) && defined(__GNUC__)
654 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
655 #       undef  SPARC64_GCC_WORKAROUND
656 #       define SPARC64_GCC_WORKAROUND 1
657 #   endif
658 #endif
659
660 /* REx optimizer.  Converts nodes into quickier variants "in place".
661    Finds fixed substrings.  */
662
663 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
664    to the position after last scanned or to NULL. */
665
666 STATIC I32
667 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
668                         /* scanp: Start here (read-write). */
669                         /* deltap: Write maxlen-minlen here. */
670                         /* last: Stop before this one. */
671 {
672     I32 min = 0, pars = 0, code;
673     regnode *scan = *scanp, *next;
674     I32 delta = 0;
675     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
676     int is_inf_internal = 0;            /* The studied chunk is infinite */
677     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
678     scan_data_t data_fake;
679     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
680
681     while (scan && OP(scan) != END && scan < last) {
682         /* Peephole optimizer: */
683
684         if (PL_regkind[(U8)OP(scan)] == EXACT) {
685             /* Merge several consecutive EXACTish nodes into one. */
686             regnode *n = regnext(scan);
687             U32 stringok = 1;
688 #ifdef DEBUGGING
689             regnode *stop = scan;
690 #endif
691
692             next = scan + NODE_SZ_STR(scan);
693             /* Skip NOTHING, merge EXACT*. */
694             while (n &&
695                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
696                      (stringok && (OP(n) == OP(scan))))
697                    && NEXT_OFF(n)
698                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
699                 if (OP(n) == TAIL || n > next)
700                     stringok = 0;
701                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
702                     NEXT_OFF(scan) += NEXT_OFF(n);
703                     next = n + NODE_STEP_REGNODE;
704 #ifdef DEBUGGING
705                     if (stringok)
706                         stop = n;
707 #endif
708                     n = regnext(n);
709                 }
710                 else if (stringok) {
711                     int oldl = STR_LEN(scan);
712                     regnode *nnext = regnext(n);
713
714                     if (oldl + STR_LEN(n) > U8_MAX)
715                         break;
716                     NEXT_OFF(scan) += NEXT_OFF(n);
717                     STR_LEN(scan) += STR_LEN(n);
718                     next = n + NODE_SZ_STR(n);
719                     /* Now we can overwrite *n : */
720                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
721 #ifdef DEBUGGING
722                     stop = next - 1;
723 #endif
724                     n = nnext;
725                 }
726             }
727
728             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
729 /*
730   Two problematic code points in Unicode casefolding of EXACT nodes:
731
732    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
733    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
734
735    which casefold to
736
737    Unicode                      UTF-8
738
739    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
740    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
741
742    This means that in case-insensitive matching (or "loose matching",
743    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
744    length of the above casefolded versions) can match a target string
745    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
746    This would rather mess up the minimum length computation.
747
748    What we'll do is to look for the tail four bytes, and then peek
749    at the preceding two bytes to see whether we need to decrease
750    the minimum length by four (six minus two).
751
752    Thanks to the design of UTF-8, there cannot be false matches:
753    A sequence of valid UTF-8 bytes cannot be a subsequence of
754    another valid sequence of UTF-8 bytes.
755
756 */
757                  char *s0 = STRING(scan), *s, *t;
758                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
759                  char *t0 = "\xcc\x88\xcc\x81";
760                  char *t1 = t0 + 3;
761                  
762                  for (s = s0 + 2;
763                       s < s2 && (t = ninstr(s, s1, t0, t1));
764                       s = t + 4) {
765                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
766                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
767                            min -= 4;
768                  }
769             }
770
771 #ifdef DEBUGGING
772             /* Allow dumping */
773             n = scan + NODE_SZ_STR(scan);
774             while (n <= stop) {
775                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
776                     OP(n) = OPTIMIZED;
777                     NEXT_OFF(n) = 0;
778                 }
779                 n++;
780             }
781 #endif
782         }
783         /* Follow the next-chain of the current node and optimize
784            away all the NOTHINGs from it.  */
785         if (OP(scan) != CURLYX) {
786             int max = (reg_off_by_arg[OP(scan)]
787                        ? I32_MAX
788                        /* I32 may be smaller than U16 on CRAYs! */
789                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
790             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
791             int noff;
792             regnode *n = scan;
793         
794             /* Skip NOTHING and LONGJMP. */
795             while ((n = regnext(n))
796                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
797                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
798                    && off + noff < max)
799                 off += noff;
800             if (reg_off_by_arg[OP(scan)])
801                 ARG(scan) = off;
802             else
803                 NEXT_OFF(scan) = off;
804         }
805         /* The principal pseudo-switch.  Cannot be a switch, since we
806            look into several different things.  */
807         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
808                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
809             next = regnext(scan);
810             code = OP(scan);
811         
812             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
813                 I32 max1 = 0, min1 = I32_MAX, num = 0;
814                 struct regnode_charclass_class accum;
815                 
816                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
817                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
818                 if (flags & SCF_DO_STCLASS)
819                     cl_init_zero(pRExC_state, &accum);
820                 while (OP(scan) == code) {
821                     I32 deltanext, minnext, f = 0, fake;
822                     struct regnode_charclass_class this_class;
823
824                     num++;
825                     data_fake.flags = 0;
826                     if (data) {         
827                         data_fake.whilem_c = data->whilem_c;
828                         data_fake.last_closep = data->last_closep;
829                     }
830                     else
831                         data_fake.last_closep = &fake;
832                     next = regnext(scan);
833                     scan = NEXTOPER(scan);
834                     if (code != BRANCH)
835                         scan = NEXTOPER(scan);
836                     if (flags & SCF_DO_STCLASS) {
837                         cl_init(pRExC_state, &this_class);
838                         data_fake.start_class = &this_class;
839                         f = SCF_DO_STCLASS_AND;
840                     }           
841                     if (flags & SCF_WHILEM_VISITED_POS)
842                         f |= SCF_WHILEM_VISITED_POS;
843                     /* we suppose the run is continuous, last=next...*/
844                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
845                                           next, &data_fake, f);
846                     if (min1 > minnext)
847                         min1 = minnext;
848                     if (max1 < minnext + deltanext)
849                         max1 = minnext + deltanext;
850                     if (deltanext == I32_MAX)
851                         is_inf = is_inf_internal = 1;
852                     scan = next;
853                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
854                         pars++;
855                     if (data && (data_fake.flags & SF_HAS_EVAL))
856                         data->flags |= SF_HAS_EVAL;
857                     if (data)
858                         data->whilem_c = data_fake.whilem_c;
859                     if (flags & SCF_DO_STCLASS)
860                         cl_or(pRExC_state, &accum, &this_class);
861                     if (code == SUSPEND)
862                         break;
863                 }
864                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
865                     min1 = 0;
866                 if (flags & SCF_DO_SUBSTR) {
867                     data->pos_min += min1;
868                     data->pos_delta += max1 - min1;
869                     if (max1 != min1 || is_inf)
870                         data->longest = &(data->longest_float);
871                 }
872                 min += min1;
873                 delta += max1 - min1;
874                 if (flags & SCF_DO_STCLASS_OR) {
875                     cl_or(pRExC_state, data->start_class, &accum);
876                     if (min1) {
877                         cl_and(data->start_class, &and_with);
878                         flags &= ~SCF_DO_STCLASS;
879                     }
880                 }
881                 else if (flags & SCF_DO_STCLASS_AND) {
882                     if (min1) {
883                         cl_and(data->start_class, &accum);
884                         flags &= ~SCF_DO_STCLASS;
885                     }
886                     else {
887                         /* Switch to OR mode: cache the old value of
888                          * data->start_class */
889                         StructCopy(data->start_class, &and_with,
890                                    struct regnode_charclass_class);
891                         flags &= ~SCF_DO_STCLASS_AND;
892                         StructCopy(&accum, data->start_class,
893                                    struct regnode_charclass_class);
894                         flags |= SCF_DO_STCLASS_OR;
895                         data->start_class->flags |= ANYOF_EOS;
896                     }
897                 }
898             }
899             else if (code == BRANCHJ)   /* single branch is optimized. */
900                 scan = NEXTOPER(NEXTOPER(scan));
901             else                        /* single branch is optimized. */
902                 scan = NEXTOPER(scan);
903             continue;
904         }
905         else if (OP(scan) == EXACT) {
906             I32 l = STR_LEN(scan);
907             UV uc = *((U8*)STRING(scan));
908             if (UTF) {
909                 U8 *s = (U8*)STRING(scan);
910                 l = utf8_length(s, s + l);
911                 uc = utf8_to_uvchr(s, NULL);
912             }
913             min += l;
914             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
915                 /* The code below prefers earlier match for fixed
916                    offset, later match for variable offset.  */
917                 if (data->last_end == -1) { /* Update the start info. */
918                     data->last_start_min = data->pos_min;
919                     data->last_start_max = is_inf
920                         ? I32_MAX : data->pos_min + data->pos_delta;
921                 }
922                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
923                 {
924                     SV * sv = data->last_found;
925                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
926                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
927                     if (mg && mg->mg_len >= 0)
928                         mg->mg_len += utf8_length(STRING(scan),
929                                                   STRING(scan)+STR_LEN(scan));
930                 }
931                 if (UTF)
932                     SvUTF8_on(data->last_found);
933                 data->last_end = data->pos_min + l;
934                 data->pos_min += l; /* As in the first entry. */
935                 data->flags &= ~SF_BEFORE_EOL;
936             }
937             if (flags & SCF_DO_STCLASS_AND) {
938                 /* Check whether it is compatible with what we know already! */
939                 int compat = 1;
940
941                 if (uc >= 0x100 ||
942                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
943                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
944                     && (!(data->start_class->flags & ANYOF_FOLD)
945                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
946                     )
947                     compat = 0;
948                 ANYOF_CLASS_ZERO(data->start_class);
949                 ANYOF_BITMAP_ZERO(data->start_class);
950                 if (compat)
951                     ANYOF_BITMAP_SET(data->start_class, uc);
952                 data->start_class->flags &= ~ANYOF_EOS;
953                 if (uc < 0x100)
954                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
955             }
956             else if (flags & SCF_DO_STCLASS_OR) {
957                 /* false positive possible if the class is case-folded */
958                 if (uc < 0x100)
959                     ANYOF_BITMAP_SET(data->start_class, uc);
960                 else
961                     data->start_class->flags |= ANYOF_UNICODE_ALL;
962                 data->start_class->flags &= ~ANYOF_EOS;
963                 cl_and(data->start_class, &and_with);
964             }
965             flags &= ~SCF_DO_STCLASS;
966         }
967         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
968             I32 l = STR_LEN(scan);
969             UV uc = *((U8*)STRING(scan));
970
971             /* Search for fixed substrings supports EXACT only. */
972             if (flags & SCF_DO_SUBSTR)
973                 scan_commit(pRExC_state, data);
974             if (UTF) {
975                 U8 *s = (U8 *)STRING(scan);
976                 l = utf8_length(s, s + l);
977                 uc = utf8_to_uvchr(s, NULL);
978             }
979             min += l;
980             if (data && (flags & SCF_DO_SUBSTR))
981                 data->pos_min += l;
982             if (flags & SCF_DO_STCLASS_AND) {
983                 /* Check whether it is compatible with what we know already! */
984                 int compat = 1;
985
986                 if (uc >= 0x100 ||
987                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
988                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
989                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
990                     compat = 0;
991                 ANYOF_CLASS_ZERO(data->start_class);
992                 ANYOF_BITMAP_ZERO(data->start_class);
993                 if (compat) {
994                     ANYOF_BITMAP_SET(data->start_class, uc);
995                     data->start_class->flags &= ~ANYOF_EOS;
996                     data->start_class->flags |= ANYOF_FOLD;
997                     if (OP(scan) == EXACTFL)
998                         data->start_class->flags |= ANYOF_LOCALE;
999                 }
1000             }
1001             else if (flags & SCF_DO_STCLASS_OR) {
1002                 if (data->start_class->flags & ANYOF_FOLD) {
1003                     /* false positive possible if the class is case-folded.
1004                        Assume that the locale settings are the same... */
1005                     if (uc < 0x100)
1006                         ANYOF_BITMAP_SET(data->start_class, uc);
1007                     data->start_class->flags &= ~ANYOF_EOS;
1008                 }
1009                 cl_and(data->start_class, &and_with);
1010             }
1011             flags &= ~SCF_DO_STCLASS;
1012         }
1013         else if (strchr((char*)PL_varies,OP(scan))) {
1014             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1015             I32 f = flags, pos_before = 0;
1016             regnode *oscan = scan;
1017             struct regnode_charclass_class this_class;
1018             struct regnode_charclass_class *oclass = NULL;
1019             I32 next_is_eval = 0;
1020
1021             switch (PL_regkind[(U8)OP(scan)]) {
1022             case WHILEM:                /* End of (?:...)* . */
1023                 scan = NEXTOPER(scan);
1024                 goto finish;
1025             case PLUS:
1026                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1027                     next = NEXTOPER(scan);
1028                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1029                         mincount = 1;
1030                         maxcount = REG_INFTY;
1031                         next = regnext(scan);
1032                         scan = NEXTOPER(scan);
1033                         goto do_curly;
1034                     }
1035                 }
1036                 if (flags & SCF_DO_SUBSTR)
1037                     data->pos_min++;
1038                 min++;
1039                 /* Fall through. */
1040             case STAR:
1041                 if (flags & SCF_DO_STCLASS) {
1042                     mincount = 0;
1043                     maxcount = REG_INFTY;
1044                     next = regnext(scan);
1045                     scan = NEXTOPER(scan);
1046                     goto do_curly;
1047                 }
1048                 is_inf = is_inf_internal = 1;
1049                 scan = regnext(scan);
1050                 if (flags & SCF_DO_SUBSTR) {
1051                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1052                     data->longest = &(data->longest_float);
1053                 }
1054                 goto optimize_curly_tail;
1055             case CURLY:
1056                 mincount = ARG1(scan);
1057                 maxcount = ARG2(scan);
1058                 next = regnext(scan);
1059                 if (OP(scan) == CURLYX) {
1060                     I32 lp = (data ? *(data->last_closep) : 0);
1061
1062                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1063                 }
1064                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1065                 next_is_eval = (OP(scan) == EVAL);
1066               do_curly:
1067                 if (flags & SCF_DO_SUBSTR) {
1068                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1069                     pos_before = data->pos_min;
1070                 }
1071                 if (data) {
1072                     fl = data->flags;
1073                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1074                     if (is_inf)
1075                         data->flags |= SF_IS_INF;
1076                 }
1077                 if (flags & SCF_DO_STCLASS) {
1078                     cl_init(pRExC_state, &this_class);
1079                     oclass = data->start_class;
1080                     data->start_class = &this_class;
1081                     f |= SCF_DO_STCLASS_AND;
1082                     f &= ~SCF_DO_STCLASS_OR;
1083                 }
1084                 /* These are the cases when once a subexpression
1085                    fails at a particular position, it cannot succeed
1086                    even after backtracking at the enclosing scope.
1087                 
1088                    XXXX what if minimal match and we are at the
1089                         initial run of {n,m}? */
1090                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1091                     f &= ~SCF_WHILEM_VISITED_POS;
1092
1093                 /* This will finish on WHILEM, setting scan, or on NULL: */
1094                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1095                                       mincount == 0
1096                                         ? (f & ~SCF_DO_SUBSTR) : f);
1097
1098                 if (flags & SCF_DO_STCLASS)
1099                     data->start_class = oclass;
1100                 if (mincount == 0 || minnext == 0) {
1101                     if (flags & SCF_DO_STCLASS_OR) {
1102                         cl_or(pRExC_state, data->start_class, &this_class);
1103                     }
1104                     else if (flags & SCF_DO_STCLASS_AND) {
1105                         /* Switch to OR mode: cache the old value of
1106                          * data->start_class */
1107                         StructCopy(data->start_class, &and_with,
1108                                    struct regnode_charclass_class);
1109                         flags &= ~SCF_DO_STCLASS_AND;
1110                         StructCopy(&this_class, data->start_class,
1111                                    struct regnode_charclass_class);
1112                         flags |= SCF_DO_STCLASS_OR;
1113                         data->start_class->flags |= ANYOF_EOS;
1114                     }
1115                 } else {                /* Non-zero len */
1116                     if (flags & SCF_DO_STCLASS_OR) {
1117                         cl_or(pRExC_state, data->start_class, &this_class);
1118                         cl_and(data->start_class, &and_with);
1119                     }
1120                     else if (flags & SCF_DO_STCLASS_AND)
1121                         cl_and(data->start_class, &this_class);
1122                     flags &= ~SCF_DO_STCLASS;
1123                 }
1124                 if (!scan)              /* It was not CURLYX, but CURLY. */
1125                     scan = next;
1126                 if (ckWARN(WARN_REGEXP)
1127                        /* ? quantifier ok, except for (?{ ... }) */
1128                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1129                     && (minnext == 0) && (deltanext == 0)
1130                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1131                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1132                 {
1133                     vWARN(RExC_parse,
1134                           "Quantifier unexpected on zero-length expression");
1135                 }
1136
1137                 min += minnext * mincount;
1138                 is_inf_internal |= ((maxcount == REG_INFTY
1139                                      && (minnext + deltanext) > 0)
1140                                     || deltanext == I32_MAX);
1141                 is_inf |= is_inf_internal;
1142                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1143
1144                 /* Try powerful optimization CURLYX => CURLYN. */
1145                 if (  OP(oscan) == CURLYX && data
1146                       && data->flags & SF_IN_PAR
1147                       && !(data->flags & SF_HAS_EVAL)
1148                       && !deltanext && minnext == 1 ) {
1149                     /* Try to optimize to CURLYN.  */
1150                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1151                     regnode *nxt1 = nxt;
1152 #ifdef DEBUGGING
1153                     regnode *nxt2;
1154 #endif
1155
1156                     /* Skip open. */
1157                     nxt = regnext(nxt);
1158                     if (!strchr((char*)PL_simple,OP(nxt))
1159                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1160                              && STR_LEN(nxt) == 1))
1161                         goto nogo;
1162 #ifdef DEBUGGING
1163                     nxt2 = nxt;
1164 #endif
1165                     nxt = regnext(nxt);
1166                     if (OP(nxt) != CLOSE)
1167                         goto nogo;
1168                     /* Now we know that nxt2 is the only contents: */
1169                     oscan->flags = (U8)ARG(nxt);
1170                     OP(oscan) = CURLYN;
1171                     OP(nxt1) = NOTHING; /* was OPEN. */
1172 #ifdef DEBUGGING
1173                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1174                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1175                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1176                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1177                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1178                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1179 #endif
1180                 }
1181               nogo:
1182
1183                 /* Try optimization CURLYX => CURLYM. */
1184                 if (  OP(oscan) == CURLYX && data
1185                       && !(data->flags & SF_HAS_PAR)
1186                       && !(data->flags & SF_HAS_EVAL)
1187                       && !deltanext  ) {
1188                     /* XXXX How to optimize if data == 0? */
1189                     /* Optimize to a simpler form.  */
1190                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1191                     regnode *nxt2;
1192
1193                     OP(oscan) = CURLYM;
1194                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1195                             && (OP(nxt2) != WHILEM))
1196                         nxt = nxt2;
1197                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1198                     /* Need to optimize away parenths. */
1199                     if (data->flags & SF_IN_PAR) {
1200                         /* Set the parenth number.  */
1201                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1202
1203                         if (OP(nxt) != CLOSE)
1204                             FAIL("Panic opt close");
1205                         oscan->flags = (U8)ARG(nxt);
1206                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1207                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1208 #ifdef DEBUGGING
1209                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1210                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1211                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1212                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1213 #endif
1214 #if 0
1215                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1216                             regnode *nnxt = regnext(nxt1);
1217                         
1218                             if (nnxt == nxt) {
1219                                 if (reg_off_by_arg[OP(nxt1)])
1220                                     ARG_SET(nxt1, nxt2 - nxt1);
1221                                 else if (nxt2 - nxt1 < U16_MAX)
1222                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1223                                 else
1224                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1225                             }
1226                             nxt1 = nnxt;
1227                         }
1228 #endif
1229                         /* Optimize again: */
1230                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1231                                     NULL, 0);
1232                     }
1233                     else
1234                         oscan->flags = 0;
1235                 }
1236                 else if ((OP(oscan) == CURLYX)
1237                          && (flags & SCF_WHILEM_VISITED_POS)
1238                          /* See the comment on a similar expression above.
1239                             However, this time it not a subexpression
1240                             we care about, but the expression itself. */
1241                          && (maxcount == REG_INFTY)
1242                          && data && ++data->whilem_c < 16) {
1243                     /* This stays as CURLYX, we can put the count/of pair. */
1244                     /* Find WHILEM (as in regexec.c) */
1245                     regnode *nxt = oscan + NEXT_OFF(oscan);
1246
1247                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1248                         nxt += ARG(nxt);
1249                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1250                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1251                 }
1252                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1253                     pars++;
1254                 if (flags & SCF_DO_SUBSTR) {
1255                     SV *last_str = Nullsv;
1256                     int counted = mincount != 0;
1257
1258                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1259 #if defined(SPARC64_GCC_WORKAROUND)
1260                         I32 b = 0;
1261                         STRLEN l = 0;
1262                         char *s = NULL;
1263                         I32 old = 0;
1264
1265                         if (pos_before >= data->last_start_min)
1266                             b = pos_before;
1267                         else
1268                             b = data->last_start_min;
1269
1270                         l = 0;
1271                         s = SvPV(data->last_found, l);
1272                         old = b - data->last_start_min;
1273
1274 #else
1275                         I32 b = pos_before >= data->last_start_min
1276                             ? pos_before : data->last_start_min;
1277                         STRLEN l;
1278                         char *s = SvPV(data->last_found, l);
1279                         I32 old = b - data->last_start_min;
1280 #endif
1281
1282                         if (UTF)
1283                             old = utf8_hop((U8*)s, old) - (U8*)s;
1284                         
1285                         l -= old;
1286                         /* Get the added string: */
1287                         last_str = newSVpvn(s  + old, l);
1288                         if (UTF)
1289                             SvUTF8_on(last_str);
1290                         if (deltanext == 0 && pos_before == b) {
1291                             /* What was added is a constant string */
1292                             if (mincount > 1) {
1293                                 SvGROW(last_str, (mincount * l) + 1);
1294                                 repeatcpy(SvPVX(last_str) + l,
1295                                           SvPVX(last_str), l, mincount - 1);
1296                                 SvCUR(last_str) *= mincount;
1297                                 /* Add additional parts. */
1298                                 SvCUR_set(data->last_found,
1299                                           SvCUR(data->last_found) - l);
1300                                 sv_catsv(data->last_found, last_str);
1301                                 {
1302                                     SV * sv = data->last_found;
1303                                     MAGIC *mg =
1304                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1305                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1306                                     if (mg && mg->mg_len >= 0)
1307                                         mg->mg_len += CHR_SVLEN(last_str);
1308                                 }
1309                                 data->last_end += l * (mincount - 1);
1310                             }
1311                         } else {
1312                             /* start offset must point into the last copy */
1313                             data->last_start_min += minnext * (mincount - 1);
1314                             data->last_start_max += is_inf ? I32_MAX
1315                                 : (maxcount - 1) * (minnext + data->pos_delta);
1316                         }
1317                     }
1318                     /* It is counted once already... */
1319                     data->pos_min += minnext * (mincount - counted);
1320                     data->pos_delta += - counted * deltanext +
1321                         (minnext + deltanext) * maxcount - minnext * mincount;
1322                     if (mincount != maxcount) {
1323                          /* Cannot extend fixed substrings found inside
1324                             the group.  */
1325                         scan_commit(pRExC_state,data);
1326                         if (mincount && last_str) {
1327                             sv_setsv(data->last_found, last_str);
1328                             data->last_end = data->pos_min;
1329                             data->last_start_min =
1330                                 data->pos_min - CHR_SVLEN(last_str);
1331                             data->last_start_max = is_inf
1332                                 ? I32_MAX
1333                                 : data->pos_min + data->pos_delta
1334                                 - CHR_SVLEN(last_str);
1335                         }
1336                         data->longest = &(data->longest_float);
1337                     }
1338                     SvREFCNT_dec(last_str);
1339                 }
1340                 if (data && (fl & SF_HAS_EVAL))
1341                     data->flags |= SF_HAS_EVAL;
1342               optimize_curly_tail:
1343                 if (OP(oscan) != CURLYX) {
1344                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1345                            && NEXT_OFF(next))
1346                         NEXT_OFF(oscan) += NEXT_OFF(next);
1347                 }
1348                 continue;
1349             default:                    /* REF and CLUMP only? */
1350                 if (flags & SCF_DO_SUBSTR) {
1351                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1352                     data->longest = &(data->longest_float);
1353                 }
1354                 is_inf = is_inf_internal = 1;
1355                 if (flags & SCF_DO_STCLASS_OR)
1356                     cl_anything(pRExC_state, data->start_class);
1357                 flags &= ~SCF_DO_STCLASS;
1358                 break;
1359             }
1360         }
1361         else if (strchr((char*)PL_simple,OP(scan))) {
1362             int value = 0;
1363
1364             if (flags & SCF_DO_SUBSTR) {
1365                 scan_commit(pRExC_state,data);
1366                 data->pos_min++;
1367             }
1368             min++;
1369             if (flags & SCF_DO_STCLASS) {
1370                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1371
1372                 /* Some of the logic below assumes that switching
1373                    locale on will only add false positives. */
1374                 switch (PL_regkind[(U8)OP(scan)]) {
1375                 case SANY:
1376                 default:
1377                   do_default:
1378                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1379                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1380                         cl_anything(pRExC_state, data->start_class);
1381                     break;
1382                 case REG_ANY:
1383                     if (OP(scan) == SANY)
1384                         goto do_default;
1385                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1386                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1387                                  || (data->start_class->flags & ANYOF_CLASS));
1388                         cl_anything(pRExC_state, data->start_class);
1389                     }
1390                     if (flags & SCF_DO_STCLASS_AND || !value)
1391                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1392                     break;
1393                 case ANYOF:
1394                     if (flags & SCF_DO_STCLASS_AND)
1395                         cl_and(data->start_class,
1396                                (struct regnode_charclass_class*)scan);
1397                     else
1398                         cl_or(pRExC_state, data->start_class,
1399                               (struct regnode_charclass_class*)scan);
1400                     break;
1401                 case ALNUM:
1402                     if (flags & SCF_DO_STCLASS_AND) {
1403                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1404                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1405                             for (value = 0; value < 256; value++)
1406                                 if (!isALNUM(value))
1407                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1408                         }
1409                     }
1410                     else {
1411                         if (data->start_class->flags & ANYOF_LOCALE)
1412                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1413                         else {
1414                             for (value = 0; value < 256; value++)
1415                                 if (isALNUM(value))
1416                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1417                         }
1418                     }
1419                     break;
1420                 case ALNUML:
1421                     if (flags & SCF_DO_STCLASS_AND) {
1422                         if (data->start_class->flags & ANYOF_LOCALE)
1423                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1424                     }
1425                     else {
1426                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1427                         data->start_class->flags |= ANYOF_LOCALE;
1428                     }
1429                     break;
1430                 case NALNUM:
1431                     if (flags & SCF_DO_STCLASS_AND) {
1432                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1433                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1434                             for (value = 0; value < 256; value++)
1435                                 if (isALNUM(value))
1436                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1437                         }
1438                     }
1439                     else {
1440                         if (data->start_class->flags & ANYOF_LOCALE)
1441                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1442                         else {
1443                             for (value = 0; value < 256; value++)
1444                                 if (!isALNUM(value))
1445                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1446                         }
1447                     }
1448                     break;
1449                 case NALNUML:
1450                     if (flags & SCF_DO_STCLASS_AND) {
1451                         if (data->start_class->flags & ANYOF_LOCALE)
1452                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1453                     }
1454                     else {
1455                         data->start_class->flags |= ANYOF_LOCALE;
1456                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1457                     }
1458                     break;
1459                 case SPACE:
1460                     if (flags & SCF_DO_STCLASS_AND) {
1461                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1462                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1463                             for (value = 0; value < 256; value++)
1464                                 if (!isSPACE(value))
1465                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1466                         }
1467                     }
1468                     else {
1469                         if (data->start_class->flags & ANYOF_LOCALE)
1470                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1471                         else {
1472                             for (value = 0; value < 256; value++)
1473                                 if (isSPACE(value))
1474                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1475                         }
1476                     }
1477                     break;
1478                 case SPACEL:
1479                     if (flags & SCF_DO_STCLASS_AND) {
1480                         if (data->start_class->flags & ANYOF_LOCALE)
1481                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1482                     }
1483                     else {
1484                         data->start_class->flags |= ANYOF_LOCALE;
1485                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1486                     }
1487                     break;
1488                 case NSPACE:
1489                     if (flags & SCF_DO_STCLASS_AND) {
1490                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1491                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1492                             for (value = 0; value < 256; value++)
1493                                 if (isSPACE(value))
1494                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1495                         }
1496                     }
1497                     else {
1498                         if (data->start_class->flags & ANYOF_LOCALE)
1499                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1500                         else {
1501                             for (value = 0; value < 256; value++)
1502                                 if (!isSPACE(value))
1503                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1504                         }
1505                     }
1506                     break;
1507                 case NSPACEL:
1508                     if (flags & SCF_DO_STCLASS_AND) {
1509                         if (data->start_class->flags & ANYOF_LOCALE) {
1510                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1511                             for (value = 0; value < 256; value++)
1512                                 if (!isSPACE(value))
1513                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1514                         }
1515                     }
1516                     else {
1517                         data->start_class->flags |= ANYOF_LOCALE;
1518                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1519                     }
1520                     break;
1521                 case DIGIT:
1522                     if (flags & SCF_DO_STCLASS_AND) {
1523                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1524                         for (value = 0; value < 256; value++)
1525                             if (!isDIGIT(value))
1526                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1527                     }
1528                     else {
1529                         if (data->start_class->flags & ANYOF_LOCALE)
1530                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1531                         else {
1532                             for (value = 0; value < 256; value++)
1533                                 if (isDIGIT(value))
1534                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1535                         }
1536                     }
1537                     break;
1538                 case NDIGIT:
1539                     if (flags & SCF_DO_STCLASS_AND) {
1540                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1541                         for (value = 0; value < 256; value++)
1542                             if (isDIGIT(value))
1543                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1544                     }
1545                     else {
1546                         if (data->start_class->flags & ANYOF_LOCALE)
1547                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1548                         else {
1549                             for (value = 0; value < 256; value++)
1550                                 if (!isDIGIT(value))
1551                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1552                         }
1553                     }
1554                     break;
1555                 }
1556                 if (flags & SCF_DO_STCLASS_OR)
1557                     cl_and(data->start_class, &and_with);
1558                 flags &= ~SCF_DO_STCLASS;
1559             }
1560         }
1561         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1562             data->flags |= (OP(scan) == MEOL
1563                             ? SF_BEFORE_MEOL
1564                             : SF_BEFORE_SEOL);
1565         }
1566         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1567                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1568                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1569                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1570             /* Lookahead/lookbehind */
1571             I32 deltanext, minnext, fake = 0;
1572             regnode *nscan;
1573             struct regnode_charclass_class intrnl;
1574             int f = 0;
1575
1576             data_fake.flags = 0;
1577             if (data) {         
1578                 data_fake.whilem_c = data->whilem_c;
1579                 data_fake.last_closep = data->last_closep;
1580             }
1581             else
1582                 data_fake.last_closep = &fake;
1583             if ( flags & SCF_DO_STCLASS && !scan->flags
1584                  && OP(scan) == IFMATCH ) { /* Lookahead */
1585                 cl_init(pRExC_state, &intrnl);
1586                 data_fake.start_class = &intrnl;
1587                 f |= SCF_DO_STCLASS_AND;
1588             }
1589             if (flags & SCF_WHILEM_VISITED_POS)
1590                 f |= SCF_WHILEM_VISITED_POS;
1591             next = regnext(scan);
1592             nscan = NEXTOPER(NEXTOPER(scan));
1593             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1594             if (scan->flags) {
1595                 if (deltanext) {
1596                     vFAIL("Variable length lookbehind not implemented");
1597                 }
1598                 else if (minnext > U8_MAX) {
1599                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1600                 }
1601                 scan->flags = (U8)minnext;
1602             }
1603             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1604                 pars++;
1605             if (data && (data_fake.flags & SF_HAS_EVAL))
1606                 data->flags |= SF_HAS_EVAL;
1607             if (data)
1608                 data->whilem_c = data_fake.whilem_c;
1609             if (f & SCF_DO_STCLASS_AND) {
1610                 int was = (data->start_class->flags & ANYOF_EOS);
1611
1612                 cl_and(data->start_class, &intrnl);
1613                 if (was)
1614                     data->start_class->flags |= ANYOF_EOS;
1615             }
1616         }
1617         else if (OP(scan) == OPEN) {
1618             pars++;
1619         }
1620         else if (OP(scan) == CLOSE) {
1621             if ((I32)ARG(scan) == is_par) {
1622                 next = regnext(scan);
1623
1624                 if ( next && (OP(next) != WHILEM) && next < last)
1625                     is_par = 0;         /* Disable optimization */
1626             }
1627             if (data)
1628                 *(data->last_closep) = ARG(scan);
1629         }
1630         else if (OP(scan) == EVAL) {
1631                 if (data)
1632                     data->flags |= SF_HAS_EVAL;
1633         }
1634         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1635                 if (flags & SCF_DO_SUBSTR) {
1636                     scan_commit(pRExC_state,data);
1637                     data->longest = &(data->longest_float);
1638                 }
1639                 is_inf = is_inf_internal = 1;
1640                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1641                     cl_anything(pRExC_state, data->start_class);
1642                 flags &= ~SCF_DO_STCLASS;
1643         }
1644         /* Else: zero-length, ignore. */
1645         scan = regnext(scan);
1646     }
1647
1648   finish:
1649     *scanp = scan;
1650     *deltap = is_inf_internal ? I32_MAX : delta;
1651     if (flags & SCF_DO_SUBSTR && is_inf)
1652         data->pos_delta = I32_MAX - data->pos_min;
1653     if (is_par > U8_MAX)
1654         is_par = 0;
1655     if (is_par && pars==1 && data) {
1656         data->flags |= SF_IN_PAR;
1657         data->flags &= ~SF_HAS_PAR;
1658     }
1659     else if (pars && data) {
1660         data->flags |= SF_HAS_PAR;
1661         data->flags &= ~SF_IN_PAR;
1662     }
1663     if (flags & SCF_DO_STCLASS_OR)
1664         cl_and(data->start_class, &and_with);
1665     return min;
1666 }
1667
1668 STATIC I32
1669 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1670 {
1671     if (RExC_rx->data) {
1672         Renewc(RExC_rx->data,
1673                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1674                char, struct reg_data);
1675         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1676         RExC_rx->data->count += n;
1677     }
1678     else {
1679         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1680              char, struct reg_data);
1681         New(1208, RExC_rx->data->what, n, U8);
1682         RExC_rx->data->count = n;
1683     }
1684     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1685     return RExC_rx->data->count - n;
1686 }
1687
1688 void
1689 Perl_reginitcolors(pTHX)
1690 {
1691     int i = 0;
1692     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1693         
1694     if (s) {
1695         PL_colors[0] = s = savepv(s);
1696         while (++i < 6) {
1697             s = strchr(s, '\t');
1698             if (s) {
1699                 *s = '\0';
1700                 PL_colors[i] = ++s;
1701             }
1702             else
1703                 PL_colors[i] = s = "";
1704         }
1705     } else {
1706         while (i < 6)
1707             PL_colors[i++] = "";
1708     }
1709     PL_colorset = 1;
1710 }
1711
1712
1713 /*
1714  - pregcomp - compile a regular expression into internal code
1715  *
1716  * We can't allocate space until we know how big the compiled form will be,
1717  * but we can't compile it (and thus know how big it is) until we've got a
1718  * place to put the code.  So we cheat:  we compile it twice, once with code
1719  * generation turned off and size counting turned on, and once "for real".
1720  * This also means that we don't allocate space until we are sure that the
1721  * thing really will compile successfully, and we never have to move the
1722  * code and thus invalidate pointers into it.  (Note that it has to be in
1723  * one piece because free() must be able to free it all.) [NB: not true in perl]
1724  *
1725  * Beware that the optimization-preparation code in here knows about some
1726  * of the structure of the compiled regexp.  [I'll say.]
1727  */
1728 regexp *
1729 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1730 {
1731     register regexp *r;
1732     regnode *scan;
1733     regnode *first;
1734     I32 flags;
1735     I32 minlen = 0;
1736     I32 sawplus = 0;
1737     I32 sawopen = 0;
1738     scan_data_t data;
1739     RExC_state_t RExC_state;
1740     RExC_state_t *pRExC_state = &RExC_state;
1741
1742     if (exp == NULL)
1743         FAIL("NULL regexp argument");
1744
1745     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1746
1747     RExC_precomp = exp;
1748     DEBUG_r({
1749          if (!PL_colorset) reginitcolors();
1750          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1751                        PL_colors[4],PL_colors[5],PL_colors[0],
1752                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1753     });
1754     RExC_flags = pm->op_pmflags;
1755     RExC_sawback = 0;
1756
1757     RExC_seen = 0;
1758     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1759     RExC_seen_evals = 0;
1760     RExC_extralen = 0;
1761
1762     /* First pass: determine size, legality. */
1763     RExC_parse = exp;
1764     RExC_start = exp;
1765     RExC_end = xend;
1766     RExC_naughty = 0;
1767     RExC_npar = 1;
1768     RExC_size = 0L;
1769     RExC_emit = &PL_regdummy;
1770     RExC_whilem_seen = 0;
1771 #if 0 /* REGC() is (currently) a NOP at the first pass.
1772        * Clever compilers notice this and complain. --jhi */
1773     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1774 #endif
1775     if (reg(pRExC_state, 0, &flags) == NULL) {
1776         RExC_precomp = Nullch;
1777         return(NULL);
1778     }
1779     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1780
1781     /* Small enough for pointer-storage convention?
1782        If extralen==0, this means that we will not need long jumps. */
1783     if (RExC_size >= 0x10000L && RExC_extralen)
1784         RExC_size += RExC_extralen;
1785     else
1786         RExC_extralen = 0;
1787     if (RExC_whilem_seen > 15)
1788         RExC_whilem_seen = 15;
1789
1790     /* Allocate space and initialize. */
1791     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1792          char, regexp);
1793     if (r == NULL)
1794         FAIL("Regexp out of space");
1795
1796 #ifdef DEBUGGING
1797     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1798     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1799 #endif
1800     r->refcnt = 1;
1801     r->prelen = xend - exp;
1802     r->precomp = savepvn(RExC_precomp, r->prelen);
1803     r->subbeg = NULL;
1804     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1805     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1806
1807     r->substrs = 0;                     /* Useful during FAIL. */
1808     r->startp = 0;                      /* Useful during FAIL. */
1809     r->endp = 0;                        /* Useful during FAIL. */
1810
1811     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1812     if (r->offsets) {
1813       r->offsets[0] = RExC_size; 
1814     }
1815     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1816                           "%s %"UVuf" bytes for offset annotations.\n", 
1817                           r->offsets ? "Got" : "Couldn't get", 
1818                           (UV)((2*RExC_size+1) * sizeof(U32))));
1819
1820     RExC_rx = r;
1821
1822     /* Second pass: emit code. */
1823     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1824     RExC_parse = exp;
1825     RExC_end = xend;
1826     RExC_naughty = 0;
1827     RExC_npar = 1;
1828     RExC_emit_start = r->program;
1829     RExC_emit = r->program;
1830     /* Store the count of eval-groups for security checks: */
1831     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1832     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1833     r->data = 0;
1834     if (reg(pRExC_state, 0, &flags) == NULL)
1835         return(NULL);
1836
1837     /* Dig out information for optimizations. */
1838     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1839     pm->op_pmflags = RExC_flags;
1840     if (UTF)
1841         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1842     r->regstclass = NULL;
1843     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1844         r->reganch |= ROPT_NAUGHTY;
1845     scan = r->program + 1;              /* First BRANCH. */
1846
1847     /* XXXX To minimize changes to RE engine we always allocate
1848        3-units-long substrs field. */
1849     Newz(1004, r->substrs, 1, struct reg_substr_data);
1850
1851     StructCopy(&zero_scan_data, &data, scan_data_t);
1852     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1853     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1854         I32 fake;
1855         STRLEN longest_float_length, longest_fixed_length;
1856         struct regnode_charclass_class ch_class;
1857         int stclass_flag;
1858         I32 last_close = 0;
1859
1860         first = scan;
1861         /* Skip introductions and multiplicators >= 1. */
1862         while ((OP(first) == OPEN && (sawopen = 1)) ||
1863                /* An OR of *one* alternative - should not happen now. */
1864             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1865             (OP(first) == PLUS) ||
1866             (OP(first) == MINMOD) ||
1867                /* An {n,m} with n>0 */
1868             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1869                 if (OP(first) == PLUS)
1870                     sawplus = 1;
1871                 else
1872                     first += regarglen[(U8)OP(first)];
1873                 first = NEXTOPER(first);
1874         }
1875
1876         /* Starting-point info. */
1877       again:
1878         if (PL_regkind[(U8)OP(first)] == EXACT) {
1879             if (OP(first) == EXACT)
1880                 ;       /* Empty, get anchored substr later. */
1881             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1882                 r->regstclass = first;
1883         }
1884         else if (strchr((char*)PL_simple,OP(first)))
1885             r->regstclass = first;
1886         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1887                  PL_regkind[(U8)OP(first)] == NBOUND)
1888             r->regstclass = first;
1889         else if (PL_regkind[(U8)OP(first)] == BOL) {
1890             r->reganch |= (OP(first) == MBOL
1891                            ? ROPT_ANCH_MBOL
1892                            : (OP(first) == SBOL
1893                               ? ROPT_ANCH_SBOL
1894                               : ROPT_ANCH_BOL));
1895             first = NEXTOPER(first);
1896             goto again;
1897         }
1898         else if (OP(first) == GPOS) {
1899             r->reganch |= ROPT_ANCH_GPOS;
1900             first = NEXTOPER(first);
1901             goto again;
1902         }
1903         else if (!sawopen && (OP(first) == STAR &&
1904             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1905             !(r->reganch & ROPT_ANCH) )
1906         {
1907             /* turn .* into ^.* with an implied $*=1 */
1908             int type = OP(NEXTOPER(first));
1909
1910             if (type == REG_ANY)
1911                 type = ROPT_ANCH_MBOL;
1912             else
1913                 type = ROPT_ANCH_SBOL;
1914
1915             r->reganch |= type | ROPT_IMPLICIT;
1916             first = NEXTOPER(first);
1917             goto again;
1918         }
1919         if (sawplus && (!sawopen || !RExC_sawback)
1920             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1921             /* x+ must match at the 1st pos of run of x's */
1922             r->reganch |= ROPT_SKIP;
1923
1924         /* Scan is after the zeroth branch, first is atomic matcher. */
1925         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1926                               (IV)(first - scan + 1)));
1927         /*
1928         * If there's something expensive in the r.e., find the
1929         * longest literal string that must appear and make it the
1930         * regmust.  Resolve ties in favor of later strings, since
1931         * the regstart check works with the beginning of the r.e.
1932         * and avoiding duplication strengthens checking.  Not a
1933         * strong reason, but sufficient in the absence of others.
1934         * [Now we resolve ties in favor of the earlier string if
1935         * it happens that c_offset_min has been invalidated, since the
1936         * earlier string may buy us something the later one won't.]
1937         */
1938         minlen = 0;
1939
1940         data.longest_fixed = newSVpvn("",0);
1941         data.longest_float = newSVpvn("",0);
1942         data.last_found = newSVpvn("",0);
1943         data.longest = &(data.longest_fixed);
1944         first = scan;
1945         if (!r->regstclass) {
1946             cl_init(pRExC_state, &ch_class);
1947             data.start_class = &ch_class;
1948             stclass_flag = SCF_DO_STCLASS_AND;
1949         } else                          /* XXXX Check for BOUND? */
1950             stclass_flag = 0;
1951         data.last_closep = &last_close;
1952
1953         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1954                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1955         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1956              && data.last_start_min == 0 && data.last_end > 0
1957              && !RExC_seen_zerolen
1958              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1959             r->reganch |= ROPT_CHECK_ALL;
1960         scan_commit(pRExC_state, &data);
1961         SvREFCNT_dec(data.last_found);
1962
1963         longest_float_length = CHR_SVLEN(data.longest_float);
1964         if (longest_float_length
1965             || (data.flags & SF_FL_BEFORE_EOL
1966                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1967                     || (RExC_flags & PMf_MULTILINE)))) {
1968             int t;
1969
1970             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1971                 && data.offset_fixed == data.offset_float_min
1972                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1973                     goto remove_float;          /* As in (a)+. */
1974
1975             if (SvUTF8(data.longest_float)) {
1976                 r->float_utf8 = data.longest_float;
1977                 r->float_substr = Nullsv;
1978             } else {
1979                 r->float_substr = data.longest_float;
1980                 r->float_utf8 = Nullsv;
1981             }
1982             r->float_min_offset = data.offset_float_min;
1983             r->float_max_offset = data.offset_float_max;
1984             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1985                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1986                            || (RExC_flags & PMf_MULTILINE)));
1987             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1988         }
1989         else {
1990           remove_float:
1991             r->float_substr = r->float_utf8 = Nullsv;
1992             SvREFCNT_dec(data.longest_float);
1993             longest_float_length = 0;
1994         }
1995
1996         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1997         if (longest_fixed_length
1998             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1999                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2000                     || (RExC_flags & PMf_MULTILINE)))) {
2001             int t;
2002
2003             if (SvUTF8(data.longest_fixed)) {
2004                 r->anchored_utf8 = data.longest_fixed;
2005                 r->anchored_substr = Nullsv;
2006             } else {
2007                 r->anchored_substr = data.longest_fixed;
2008                 r->anchored_utf8 = Nullsv;
2009             }
2010             r->anchored_offset = data.offset_fixed;
2011             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2012                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2013                      || (RExC_flags & PMf_MULTILINE)));
2014             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2015         }
2016         else {
2017             r->anchored_substr = r->anchored_utf8 = Nullsv;
2018             SvREFCNT_dec(data.longest_fixed);
2019             longest_fixed_length = 0;
2020         }
2021         if (r->regstclass
2022             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2023             r->regstclass = NULL;
2024         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2025             && stclass_flag
2026             && !(data.start_class->flags & ANYOF_EOS)
2027             && !cl_is_anything(data.start_class))
2028         {
2029             I32 n = add_data(pRExC_state, 1, "f");
2030
2031             New(1006, RExC_rx->data->data[n], 1,
2032                 struct regnode_charclass_class);
2033             StructCopy(data.start_class,
2034                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2035                        struct regnode_charclass_class);
2036             r->regstclass = (regnode*)RExC_rx->data->data[n];
2037             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2038             PL_regdata = r->data; /* for regprop() */
2039             DEBUG_r({ SV *sv = sv_newmortal();
2040                       regprop(sv, (regnode*)data.start_class);
2041                       PerlIO_printf(Perl_debug_log,
2042                                     "synthetic stclass `%s'.\n",
2043                                     SvPVX(sv));});
2044         }
2045
2046         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2047         if (longest_fixed_length > longest_float_length) {
2048             r->check_substr = r->anchored_substr;
2049             r->check_utf8 = r->anchored_utf8;
2050             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2051             if (r->reganch & ROPT_ANCH_SINGLE)
2052                 r->reganch |= ROPT_NOSCAN;
2053         }
2054         else {
2055             r->check_substr = r->float_substr;
2056             r->check_utf8 = r->float_utf8;
2057             r->check_offset_min = data.offset_float_min;
2058             r->check_offset_max = data.offset_float_max;
2059         }
2060         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2061            This should be changed ASAP!  */
2062         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2063             r->reganch |= RE_USE_INTUIT;
2064             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2065                 r->reganch |= RE_INTUIT_TAIL;
2066         }
2067     }
2068     else {
2069         /* Several toplevels. Best we can is to set minlen. */
2070         I32 fake;
2071         struct regnode_charclass_class ch_class;
2072         I32 last_close = 0;
2073         
2074         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2075         scan = r->program + 1;
2076         cl_init(pRExC_state, &ch_class);
2077         data.start_class = &ch_class;
2078         data.last_closep = &last_close;
2079         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2080         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2081                 = r->float_substr = r->float_utf8 = Nullsv;
2082         if (!(data.start_class->flags & ANYOF_EOS)
2083             && !cl_is_anything(data.start_class))
2084         {
2085             I32 n = add_data(pRExC_state, 1, "f");
2086
2087             New(1006, RExC_rx->data->data[n], 1,
2088                 struct regnode_charclass_class);
2089             StructCopy(data.start_class,
2090                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2091                        struct regnode_charclass_class);
2092             r->regstclass = (regnode*)RExC_rx->data->data[n];
2093             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2094             DEBUG_r({ SV* sv = sv_newmortal();
2095                       regprop(sv, (regnode*)data.start_class);
2096                       PerlIO_printf(Perl_debug_log,
2097                                     "synthetic stclass `%s'.\n",
2098                                     SvPVX(sv));});
2099         }
2100     }
2101
2102     r->minlen = minlen;
2103     if (RExC_seen & REG_SEEN_GPOS)
2104         r->reganch |= ROPT_GPOS_SEEN;
2105     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2106         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2107     if (RExC_seen & REG_SEEN_EVAL)
2108         r->reganch |= ROPT_EVAL_SEEN;
2109     if (RExC_seen & REG_SEEN_CANY)
2110         r->reganch |= ROPT_CANY_SEEN;
2111     Newz(1002, r->startp, RExC_npar, I32);
2112     Newz(1002, r->endp, RExC_npar, I32);
2113     PL_regdata = r->data; /* for regprop() */
2114     DEBUG_r(regdump(r));
2115     return(r);
2116 }
2117
2118 /*
2119  - reg - regular expression, i.e. main body or parenthesized thing
2120  *
2121  * Caller must absorb opening parenthesis.
2122  *
2123  * Combining parenthesis handling with the base level of regular expression
2124  * is a trifle forced, but the need to tie the tails of the branches to what
2125  * follows makes it hard to avoid.
2126  */
2127 STATIC regnode *
2128 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2129     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2130 {
2131     register regnode *ret;              /* Will be the head of the group. */
2132     register regnode *br;
2133     register regnode *lastbr;
2134     register regnode *ender = 0;
2135     register I32 parno = 0;
2136     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2137
2138     /* for (?g), (?gc), and (?o) warnings; warning
2139        about (?c) will warn about (?g) -- japhy    */
2140
2141     I32 wastedflags = 0x00,
2142         wasted_o    = 0x01,
2143         wasted_g    = 0x02,
2144         wasted_gc   = 0x02 | 0x04,
2145         wasted_c    = 0x04;
2146
2147     char * parse_start = RExC_parse; /* MJD */
2148     char *oregcomp_parse = RExC_parse;
2149     char c;
2150
2151     *flagp = 0;                         /* Tentatively. */
2152
2153
2154     /* Make an OPEN node, if parenthesized. */
2155     if (paren) {
2156         if (*RExC_parse == '?') { /* (?...) */
2157             U32 posflags = 0, negflags = 0;
2158             U32 *flagsp = &posflags;
2159             int logical = 0;
2160             char *seqstart = RExC_parse;
2161
2162             RExC_parse++;
2163             paren = *RExC_parse++;
2164             ret = NULL;                 /* For look-ahead/behind. */
2165             switch (paren) {
2166             case '<':           /* (?<...) */
2167                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2168                 if (*RExC_parse == '!')
2169                     paren = ',';
2170                 if (*RExC_parse != '=' && *RExC_parse != '!')
2171                     goto unknown;
2172                 RExC_parse++;
2173             case '=':           /* (?=...) */
2174             case '!':           /* (?!...) */
2175                 RExC_seen_zerolen++;
2176             case ':':           /* (?:...) */
2177             case '>':           /* (?>...) */
2178                 break;
2179             case '$':           /* (?$...) */
2180             case '@':           /* (?@...) */
2181                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2182                 break;
2183             case '#':           /* (?#...) */
2184                 while (*RExC_parse && *RExC_parse != ')')
2185                     RExC_parse++;
2186                 if (*RExC_parse != ')')
2187                     FAIL("Sequence (?#... not terminated");
2188                 nextchar(pRExC_state);
2189                 *flagp = TRYAGAIN;
2190                 return NULL;
2191             case 'p':           /* (?p...) */
2192                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2193                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2194                 /* FALL THROUGH*/
2195             case '?':           /* (??...) */
2196                 logical = 1;
2197                 if (*RExC_parse != '{')
2198                     goto unknown;
2199                 paren = *RExC_parse++;
2200                 /* FALL THROUGH */
2201             case '{':           /* (?{...}) */
2202             {
2203                 I32 count = 1, n = 0;
2204                 char c;
2205                 char *s = RExC_parse;
2206                 SV *sv;
2207                 OP_4tree *sop, *rop;
2208
2209                 RExC_seen_zerolen++;
2210                 RExC_seen |= REG_SEEN_EVAL;
2211                 while (count && (c = *RExC_parse)) {
2212                     if (c == '\\' && RExC_parse[1])
2213                         RExC_parse++;
2214                     else if (c == '{')
2215                         count++;
2216                     else if (c == '}')
2217                         count--;
2218                     RExC_parse++;
2219                 }
2220                 if (*RExC_parse != ')')
2221                 {
2222                     RExC_parse = s;             
2223                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2224                 }
2225                 if (!SIZE_ONLY) {
2226                     PAD *pad;
2227                 
2228                     if (RExC_parse - 1 - s)
2229                         sv = newSVpvn(s, RExC_parse - 1 - s);
2230                     else
2231                         sv = newSVpvn("", 0);
2232
2233                     ENTER;
2234                     Perl_save_re_context(aTHX);
2235                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2236                     sop->op_private |= OPpREFCOUNTED;
2237                     /* re_dup will OpREFCNT_inc */
2238                     OpREFCNT_set(sop, 1);
2239                     LEAVE;
2240
2241                     n = add_data(pRExC_state, 3, "nop");
2242                     RExC_rx->data->data[n] = (void*)rop;
2243                     RExC_rx->data->data[n+1] = (void*)sop;
2244                     RExC_rx->data->data[n+2] = (void*)pad;
2245                     SvREFCNT_dec(sv);
2246                 }
2247                 else {                                          /* First pass */
2248                     if (PL_reginterp_cnt < ++RExC_seen_evals
2249                         && PL_curcop != &PL_compiling)
2250                         /* No compiled RE interpolated, has runtime
2251                            components ===> unsafe.  */
2252                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2253                     if (PL_tainting && PL_tainted)
2254                         FAIL("Eval-group in insecure regular expression");
2255                 }
2256                 
2257                 nextchar(pRExC_state);
2258                 if (logical) {
2259                     ret = reg_node(pRExC_state, LOGICAL);
2260                     if (!SIZE_ONLY)
2261                         ret->flags = 2;
2262                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2263                     /* deal with the length of this later - MJD */
2264                     return ret;
2265                 }
2266                 ret = reganode(pRExC_state, EVAL, n);
2267                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2268                 Set_Node_Offset(ret, parse_start);
2269                 return ret;
2270             }
2271             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2272             {
2273                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2274                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2275                         || RExC_parse[1] == '<'
2276                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2277                         I32 flag;
2278                         
2279                         ret = reg_node(pRExC_state, LOGICAL);
2280                         if (!SIZE_ONLY)
2281                             ret->flags = 1;
2282                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2283                         goto insert_if;
2284                     }
2285                 }
2286                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2287                     /* (?(1)...) */
2288                     parno = atoi(RExC_parse++);
2289
2290                     while (isDIGIT(*RExC_parse))
2291                         RExC_parse++;
2292                     ret = reganode(pRExC_state, GROUPP, parno);
2293                     
2294                     if ((c = *nextchar(pRExC_state)) != ')')
2295                         vFAIL("Switch condition not recognized");
2296                   insert_if:
2297                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2298                     br = regbranch(pRExC_state, &flags, 1);
2299                     if (br == NULL)
2300                         br = reganode(pRExC_state, LONGJMP, 0);
2301                     else
2302                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2303                     c = *nextchar(pRExC_state);
2304                     if (flags&HASWIDTH)
2305                         *flagp |= HASWIDTH;
2306                     if (c == '|') {
2307                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2308                         regbranch(pRExC_state, &flags, 1);
2309                         regtail(pRExC_state, ret, lastbr);
2310                         if (flags&HASWIDTH)
2311                             *flagp |= HASWIDTH;
2312                         c = *nextchar(pRExC_state);
2313                     }
2314                     else
2315                         lastbr = NULL;
2316                     if (c != ')')
2317                         vFAIL("Switch (?(condition)... contains too many branches");
2318                     ender = reg_node(pRExC_state, TAIL);
2319                     regtail(pRExC_state, br, ender);
2320                     if (lastbr) {
2321                         regtail(pRExC_state, lastbr, ender);
2322                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2323                     }
2324                     else
2325                         regtail(pRExC_state, ret, ender);
2326                     return ret;
2327                 }
2328                 else {
2329                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2330                 }
2331             }
2332             case 0:
2333                 RExC_parse--; /* for vFAIL to print correctly */
2334                 vFAIL("Sequence (? incomplete");
2335                 break;
2336             default:
2337                 --RExC_parse;
2338               parse_flags:      /* (?i) */
2339                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2340                     /* (?g), (?gc) and (?o) are useless here
2341                        and must be globally applied -- japhy */
2342
2343                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2344                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2345                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2346                             if (! (wastedflags & wflagbit) ) {
2347                                 wastedflags |= wflagbit;
2348                                 vWARN5(
2349                                     RExC_parse + 1,
2350                                     "Useless (%s%c) - %suse /%c modifier",
2351                                     flagsp == &negflags ? "?-" : "?",
2352                                     *RExC_parse,
2353                                     flagsp == &negflags ? "don't " : "",
2354                                     *RExC_parse
2355                                 );
2356                             }
2357                         }
2358                     }
2359                     else if (*RExC_parse == 'c') {
2360                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361                             if (! (wastedflags & wasted_c) ) {
2362                                 wastedflags |= wasted_gc;
2363                                 vWARN3(
2364                                     RExC_parse + 1,
2365                                     "Useless (%sc) - %suse /gc modifier",
2366                                     flagsp == &negflags ? "?-" : "?",
2367                                     flagsp == &negflags ? "don't " : ""
2368                                 );
2369                             }
2370                         }
2371                     }
2372                     else { pmflag(flagsp, *RExC_parse); }
2373
2374                     ++RExC_parse;
2375                 }
2376                 if (*RExC_parse == '-') {
2377                     flagsp = &negflags;
2378                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2379                     ++RExC_parse;
2380                     goto parse_flags;
2381                 }
2382                 RExC_flags |= posflags;
2383                 RExC_flags &= ~negflags;
2384                 if (*RExC_parse == ':') {
2385                     RExC_parse++;
2386                     paren = ':';
2387                     break;
2388                 }               
2389               unknown:
2390                 if (*RExC_parse != ')') {
2391                     RExC_parse++;
2392                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2393                 }
2394                 nextchar(pRExC_state);
2395                 *flagp = TRYAGAIN;
2396                 return NULL;
2397             }
2398         }
2399         else {                  /* (...) */
2400             parno = RExC_npar;
2401             RExC_npar++;
2402             ret = reganode(pRExC_state, OPEN, parno);
2403             Set_Node_Length(ret, 1); /* MJD */
2404             Set_Node_Offset(ret, RExC_parse); /* MJD */
2405             open = 1;
2406         }
2407     }
2408     else                        /* ! paren */
2409         ret = NULL;
2410
2411     /* Pick up the branches, linking them together. */
2412     parse_start = RExC_parse;   /* MJD */
2413     br = regbranch(pRExC_state, &flags, 1);
2414     /*     branch_len = (paren != 0); */
2415     
2416     if (br == NULL)
2417         return(NULL);
2418     if (*RExC_parse == '|') {
2419         if (!SIZE_ONLY && RExC_extralen) {
2420             reginsert(pRExC_state, BRANCHJ, br);
2421         }
2422         else {                  /* MJD */
2423             reginsert(pRExC_state, BRANCH, br);
2424             Set_Node_Length(br, paren != 0);
2425             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2426         }
2427         have_branch = 1;
2428         if (SIZE_ONLY)
2429             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2430     }
2431     else if (paren == ':') {
2432         *flagp |= flags&SIMPLE;
2433     }
2434     if (open) {                         /* Starts with OPEN. */
2435         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2436     }
2437     else if (paren != '?')              /* Not Conditional */
2438         ret = br;
2439     *flagp |= flags & (SPSTART | HASWIDTH);
2440     lastbr = br;
2441     while (*RExC_parse == '|') {
2442         if (!SIZE_ONLY && RExC_extralen) {
2443             ender = reganode(pRExC_state, LONGJMP,0);
2444             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2445         }
2446         if (SIZE_ONLY)
2447             RExC_extralen += 2;         /* Account for LONGJMP. */
2448         nextchar(pRExC_state);
2449         br = regbranch(pRExC_state, &flags, 0);
2450         
2451         if (br == NULL)
2452             return(NULL);
2453         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2454         lastbr = br;
2455         if (flags&HASWIDTH)
2456             *flagp |= HASWIDTH;
2457         *flagp |= flags&SPSTART;
2458     }
2459
2460     if (have_branch || paren != ':') {
2461         /* Make a closing node, and hook it on the end. */
2462         switch (paren) {
2463         case ':':
2464             ender = reg_node(pRExC_state, TAIL);
2465             break;
2466         case 1:
2467             ender = reganode(pRExC_state, CLOSE, parno);
2468             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2469             Set_Node_Length(ender,1); /* MJD */
2470             break;
2471         case '<':
2472         case ',':
2473         case '=':
2474         case '!':
2475             *flagp &= ~HASWIDTH;
2476             /* FALL THROUGH */
2477         case '>':
2478             ender = reg_node(pRExC_state, SUCCEED);
2479             break;
2480         case 0:
2481             ender = reg_node(pRExC_state, END);
2482             break;
2483         }
2484         regtail(pRExC_state, lastbr, ender);
2485
2486         if (have_branch) {
2487             /* Hook the tails of the branches to the closing node. */
2488             for (br = ret; br != NULL; br = regnext(br)) {
2489                 regoptail(pRExC_state, br, ender);
2490             }
2491         }
2492     }
2493
2494     {
2495         char *p;
2496         static char parens[] = "=!<,>";
2497
2498         if (paren && (p = strchr(parens, paren))) {
2499             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2500             int flag = (p - parens) > 1;
2501
2502             if (paren == '>')
2503                 node = SUSPEND, flag = 0;
2504             reginsert(pRExC_state, node,ret);
2505             Set_Node_Offset(ret, oregcomp_parse);
2506             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2507             ret->flags = flag;
2508             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2509         }
2510     }
2511
2512     /* Check for proper termination. */
2513     if (paren) {
2514         RExC_flags = oregflags;
2515         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2516             RExC_parse = oregcomp_parse;
2517             vFAIL("Unmatched (");
2518         }
2519     }
2520     else if (!paren && RExC_parse < RExC_end) {
2521         if (*RExC_parse == ')') {
2522             RExC_parse++;
2523             vFAIL("Unmatched )");
2524         }
2525         else
2526             FAIL("Junk on end of regexp");      /* "Can't happen". */
2527         /* NOTREACHED */
2528     }
2529
2530     return(ret);
2531 }
2532
2533 /*
2534  - regbranch - one alternative of an | operator
2535  *
2536  * Implements the concatenation operator.
2537  */
2538 STATIC regnode *
2539 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2540 {
2541     register regnode *ret;
2542     register regnode *chain = NULL;
2543     register regnode *latest;
2544     I32 flags = 0, c = 0;
2545
2546     if (first)
2547         ret = NULL;
2548     else {
2549         if (!SIZE_ONLY && RExC_extralen)
2550             ret = reganode(pRExC_state, BRANCHJ,0);
2551         else {
2552             ret = reg_node(pRExC_state, BRANCH);
2553             Set_Node_Length(ret, 1);
2554         }
2555     }
2556         
2557     if (!first && SIZE_ONLY)
2558         RExC_extralen += 1;                     /* BRANCHJ */
2559
2560     *flagp = WORST;                     /* Tentatively. */
2561
2562     RExC_parse--;
2563     nextchar(pRExC_state);
2564     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2565         flags &= ~TRYAGAIN;
2566         latest = regpiece(pRExC_state, &flags);
2567         if (latest == NULL) {
2568             if (flags & TRYAGAIN)
2569                 continue;
2570             return(NULL);
2571         }
2572         else if (ret == NULL)
2573             ret = latest;
2574         *flagp |= flags&HASWIDTH;
2575         if (chain == NULL)      /* First piece. */
2576             *flagp |= flags&SPSTART;
2577         else {
2578             RExC_naughty++;
2579             regtail(pRExC_state, chain, latest);
2580         }
2581         chain = latest;
2582         c++;
2583     }
2584     if (chain == NULL) {        /* Loop ran zero times. */
2585         chain = reg_node(pRExC_state, NOTHING);
2586         if (ret == NULL)
2587             ret = chain;
2588     }
2589     if (c == 1) {
2590         *flagp |= flags&SIMPLE;
2591     }
2592
2593     return(ret);
2594 }
2595
2596 /*
2597  - regpiece - something followed by possible [*+?]
2598  *
2599  * Note that the branching code sequences used for ? and the general cases
2600  * of * and + are somewhat optimized:  they use the same NOTHING node as
2601  * both the endmarker for their branch list and the body of the last branch.
2602  * It might seem that this node could be dispensed with entirely, but the
2603  * endmarker role is not redundant.
2604  */
2605 STATIC regnode *
2606 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2607 {
2608     register regnode *ret;
2609     register char op;
2610     register char *next;
2611     I32 flags;
2612     char *origparse = RExC_parse;
2613     char *maxpos;
2614     I32 min;
2615     I32 max = REG_INFTY;
2616     char *parse_start;
2617
2618     ret = regatom(pRExC_state, &flags);
2619     if (ret == NULL) {
2620         if (flags & TRYAGAIN)
2621             *flagp |= TRYAGAIN;
2622         return(NULL);
2623     }
2624
2625     op = *RExC_parse;
2626
2627     if (op == '{' && regcurly(RExC_parse)) {
2628         parse_start = RExC_parse; /* MJD */
2629         next = RExC_parse + 1;
2630         maxpos = Nullch;
2631         while (isDIGIT(*next) || *next == ',') {
2632             if (*next == ',') {
2633                 if (maxpos)
2634                     break;
2635                 else
2636                     maxpos = next;
2637             }
2638             next++;
2639         }
2640         if (*next == '}') {             /* got one */
2641             if (!maxpos)
2642                 maxpos = next;
2643             RExC_parse++;
2644             min = atoi(RExC_parse);
2645             if (*maxpos == ',')
2646                 maxpos++;
2647             else
2648                 maxpos = RExC_parse;
2649             max = atoi(maxpos);
2650             if (!max && *maxpos != '0')
2651                 max = REG_INFTY;                /* meaning "infinity" */
2652             else if (max >= REG_INFTY)
2653                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2654             RExC_parse = next;
2655             nextchar(pRExC_state);
2656
2657         do_curly:
2658             if ((flags&SIMPLE)) {
2659                 RExC_naughty += 2 + RExC_naughty / 2;
2660                 reginsert(pRExC_state, CURLY, ret);
2661                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2662                 Set_Node_Cur_Length(ret);
2663             }
2664             else {
2665                 regnode *w = reg_node(pRExC_state, WHILEM);
2666
2667                 w->flags = 0;
2668                 regtail(pRExC_state, ret, w);
2669                 if (!SIZE_ONLY && RExC_extralen) {
2670                     reginsert(pRExC_state, LONGJMP,ret);
2671                     reginsert(pRExC_state, NOTHING,ret);
2672                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2673                 }
2674                 reginsert(pRExC_state, CURLYX,ret);
2675                                 /* MJD hk */
2676                 Set_Node_Offset(ret, parse_start+1);
2677                 Set_Node_Length(ret, 
2678                                 op == '{' ? (RExC_parse - parse_start) : 1);
2679                 
2680                 if (!SIZE_ONLY && RExC_extralen)
2681                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2682                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2683                 if (SIZE_ONLY)
2684                     RExC_whilem_seen++, RExC_extralen += 3;
2685                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2686             }
2687             ret->flags = 0;
2688
2689             if (min > 0)
2690                 *flagp = WORST;
2691             if (max > 0)
2692                 *flagp |= HASWIDTH;
2693             if (max && max < min)
2694                 vFAIL("Can't do {n,m} with n > m");
2695             if (!SIZE_ONLY) {
2696                 ARG1_SET(ret, (U16)min);
2697                 ARG2_SET(ret, (U16)max);
2698             }
2699
2700             goto nest_check;
2701         }
2702     }
2703
2704     if (!ISMULT1(op)) {
2705         *flagp = flags;
2706         return(ret);
2707     }
2708
2709 #if 0                           /* Now runtime fix should be reliable. */
2710
2711     /* if this is reinstated, don't forget to put this back into perldiag:
2712
2713             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2714
2715            (F) The part of the regexp subject to either the * or + quantifier
2716            could match an empty string. The {#} shows in the regular
2717            expression about where the problem was discovered.
2718
2719     */
2720
2721     if (!(flags&HASWIDTH) && op != '?')
2722       vFAIL("Regexp *+ operand could be empty");
2723 #endif
2724
2725     parse_start = RExC_parse;
2726     nextchar(pRExC_state);
2727
2728     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2729
2730     if (op == '*' && (flags&SIMPLE)) {
2731         reginsert(pRExC_state, STAR, ret);
2732         ret->flags = 0;
2733         RExC_naughty += 4;
2734     }
2735     else if (op == '*') {
2736         min = 0;
2737         goto do_curly;
2738     }
2739     else if (op == '+' && (flags&SIMPLE)) {
2740         reginsert(pRExC_state, PLUS, ret);
2741         ret->flags = 0;
2742         RExC_naughty += 3;
2743     }
2744     else if (op == '+') {
2745         min = 1;
2746         goto do_curly;
2747     }
2748     else if (op == '?') {
2749         min = 0; max = 1;
2750         goto do_curly;
2751     }
2752   nest_check:
2753     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2754         vWARN3(RExC_parse,
2755                "%.*s matches null string many times",
2756                RExC_parse - origparse,
2757                origparse);
2758     }
2759
2760     if (*RExC_parse == '?') {
2761         nextchar(pRExC_state);
2762         reginsert(pRExC_state, MINMOD, ret);
2763         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2764     }
2765     if (ISMULT2(RExC_parse)) {
2766         RExC_parse++;
2767         vFAIL("Nested quantifiers");
2768     }
2769
2770     return(ret);
2771 }
2772
2773 /*
2774  - regatom - the lowest level
2775  *
2776  * Optimization:  gobbles an entire sequence of ordinary characters so that
2777  * it can turn them into a single node, which is smaller to store and
2778  * faster to run.  Backslashed characters are exceptions, each becoming a
2779  * separate node; the code is simpler that way and it's not worth fixing.
2780  *
2781  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2782 STATIC regnode *
2783 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2784 {
2785     register regnode *ret = 0;
2786     I32 flags;
2787     char *parse_start = 0;
2788
2789     *flagp = WORST;             /* Tentatively. */
2790
2791 tryagain:
2792     switch (*RExC_parse) {
2793     case '^':
2794         RExC_seen_zerolen++;
2795         nextchar(pRExC_state);
2796         if (RExC_flags & PMf_MULTILINE)
2797             ret = reg_node(pRExC_state, MBOL);
2798         else if (RExC_flags & PMf_SINGLELINE)
2799             ret = reg_node(pRExC_state, SBOL);
2800         else
2801             ret = reg_node(pRExC_state, BOL);
2802         Set_Node_Length(ret, 1); /* MJD */
2803         break;
2804     case '$':
2805         nextchar(pRExC_state);
2806         if (*RExC_parse)
2807             RExC_seen_zerolen++;
2808         if (RExC_flags & PMf_MULTILINE)
2809             ret = reg_node(pRExC_state, MEOL);
2810         else if (RExC_flags & PMf_SINGLELINE)
2811             ret = reg_node(pRExC_state, SEOL);
2812         else
2813             ret = reg_node(pRExC_state, EOL);
2814         Set_Node_Length(ret, 1); /* MJD */
2815         break;
2816     case '.':
2817         nextchar(pRExC_state);
2818         if (RExC_flags & PMf_SINGLELINE)
2819             ret = reg_node(pRExC_state, SANY);
2820         else
2821             ret = reg_node(pRExC_state, REG_ANY);
2822         *flagp |= HASWIDTH|SIMPLE;
2823         RExC_naughty++;
2824         Set_Node_Length(ret, 1); /* MJD */
2825         break;
2826     case '[':
2827     {
2828         char *oregcomp_parse = ++RExC_parse;
2829         ret = regclass(pRExC_state);
2830         if (*RExC_parse != ']') {
2831             RExC_parse = oregcomp_parse;
2832             vFAIL("Unmatched [");
2833         }
2834         nextchar(pRExC_state);
2835         *flagp |= HASWIDTH|SIMPLE;
2836         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2837         break;
2838     }
2839     case '(':
2840         nextchar(pRExC_state);
2841         ret = reg(pRExC_state, 1, &flags);
2842         if (ret == NULL) {
2843                 if (flags & TRYAGAIN) {
2844                     if (RExC_parse == RExC_end) {
2845                          /* Make parent create an empty node if needed. */
2846                         *flagp |= TRYAGAIN;
2847                         return(NULL);
2848                     }
2849                     goto tryagain;
2850                 }
2851                 return(NULL);
2852         }
2853         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2854         break;
2855     case '|':
2856     case ')':
2857         if (flags & TRYAGAIN) {
2858             *flagp |= TRYAGAIN;
2859             return NULL;
2860         }
2861         vFAIL("Internal urp");
2862                                 /* Supposed to be caught earlier. */
2863         break;
2864     case '{':
2865         if (!regcurly(RExC_parse)) {
2866             RExC_parse++;
2867             goto defchar;
2868         }
2869         /* FALL THROUGH */
2870     case '?':
2871     case '+':
2872     case '*':
2873         RExC_parse++;
2874         vFAIL("Quantifier follows nothing");
2875         break;
2876     case '\\':
2877         switch (*++RExC_parse) {
2878         case 'A':
2879             RExC_seen_zerolen++;
2880             ret = reg_node(pRExC_state, SBOL);
2881             *flagp |= SIMPLE;
2882             nextchar(pRExC_state);
2883             Set_Node_Length(ret, 2); /* MJD */
2884             break;
2885         case 'G':
2886             ret = reg_node(pRExC_state, GPOS);
2887             RExC_seen |= REG_SEEN_GPOS;
2888             *flagp |= SIMPLE;
2889             nextchar(pRExC_state);
2890             Set_Node_Length(ret, 2); /* MJD */
2891             break;
2892         case 'Z':
2893             ret = reg_node(pRExC_state, SEOL);
2894             *flagp |= SIMPLE;
2895             RExC_seen_zerolen++;                /* Do not optimize RE away */
2896             nextchar(pRExC_state);
2897             break;
2898         case 'z':
2899             ret = reg_node(pRExC_state, EOS);
2900             *flagp |= SIMPLE;
2901             RExC_seen_zerolen++;                /* Do not optimize RE away */
2902             nextchar(pRExC_state);
2903             Set_Node_Length(ret, 2); /* MJD */
2904             break;
2905         case 'C':
2906             ret = reg_node(pRExC_state, CANY);
2907             RExC_seen |= REG_SEEN_CANY;
2908             *flagp |= HASWIDTH|SIMPLE;
2909             nextchar(pRExC_state);
2910             Set_Node_Length(ret, 2); /* MJD */
2911             break;
2912         case 'X':
2913             ret = reg_node(pRExC_state, CLUMP);
2914             *flagp |= HASWIDTH;
2915             nextchar(pRExC_state);
2916             Set_Node_Length(ret, 2); /* MJD */
2917             break;
2918         case 'w':
2919             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2920             *flagp |= HASWIDTH|SIMPLE;
2921             nextchar(pRExC_state);
2922             Set_Node_Length(ret, 2); /* MJD */
2923             break;
2924         case 'W':
2925             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2926             *flagp |= HASWIDTH|SIMPLE;
2927             nextchar(pRExC_state);
2928             Set_Node_Length(ret, 2); /* MJD */
2929             break;
2930         case 'b':
2931             RExC_seen_zerolen++;
2932             RExC_seen |= REG_SEEN_LOOKBEHIND;
2933             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2934             *flagp |= SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'B':
2939             RExC_seen_zerolen++;
2940             RExC_seen |= REG_SEEN_LOOKBEHIND;
2941             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2942             *flagp |= SIMPLE;
2943             nextchar(pRExC_state);
2944             Set_Node_Length(ret, 2); /* MJD */
2945             break;
2946         case 's':
2947             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2948             *flagp |= HASWIDTH|SIMPLE;
2949             nextchar(pRExC_state);
2950             Set_Node_Length(ret, 2); /* MJD */
2951             break;
2952         case 'S':
2953             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2954             *flagp |= HASWIDTH|SIMPLE;
2955             nextchar(pRExC_state);
2956             Set_Node_Length(ret, 2); /* MJD */
2957             break;
2958         case 'd':
2959             ret = reg_node(pRExC_state, DIGIT);
2960             *flagp |= HASWIDTH|SIMPLE;
2961             nextchar(pRExC_state);
2962             Set_Node_Length(ret, 2); /* MJD */
2963             break;
2964         case 'D':
2965             ret = reg_node(pRExC_state, NDIGIT);
2966             *flagp |= HASWIDTH|SIMPLE;
2967             nextchar(pRExC_state);
2968             Set_Node_Length(ret, 2); /* MJD */
2969             break;
2970         case 'p':
2971         case 'P':
2972             {   
2973                 char* oldregxend = RExC_end;
2974                 char* parse_start = RExC_parse - 2;
2975
2976                 if (RExC_parse[1] == '{') {
2977                   /* a lovely hack--pretend we saw [\pX] instead */
2978                     RExC_end = strchr(RExC_parse, '}');
2979                     if (!RExC_end) {
2980                         U8 c = (U8)*RExC_parse;
2981                         RExC_parse += 2;
2982                         RExC_end = oldregxend;
2983                         vFAIL2("Missing right brace on \\%c{}", c);
2984                     }
2985                     RExC_end++;
2986                 }
2987                 else {
2988                     RExC_end = RExC_parse + 2;
2989                     if (RExC_end > oldregxend)
2990                         RExC_end = oldregxend;
2991                 }
2992                 RExC_parse--;
2993
2994                 ret = regclass(pRExC_state);
2995
2996                 RExC_end = oldregxend;
2997                 RExC_parse--;
2998
2999                 Set_Node_Offset(ret, parse_start + 2);
3000                 Set_Node_Cur_Length(ret);
3001                 nextchar(pRExC_state);
3002                 *flagp |= HASWIDTH|SIMPLE;
3003             }
3004             break;
3005         case 'n':
3006         case 'r':
3007         case 't':
3008         case 'f':
3009         case 'e':
3010         case 'a':
3011         case 'x':
3012         case 'c':
3013         case '0':
3014             goto defchar;
3015         case '1': case '2': case '3': case '4':
3016         case '5': case '6': case '7': case '8': case '9':
3017             {
3018                 I32 num = atoi(RExC_parse);
3019
3020                 if (num > 9 && num >= RExC_npar)
3021                     goto defchar;
3022                 else {
3023                     char * parse_start = RExC_parse - 1; /* MJD */
3024                     while (isDIGIT(*RExC_parse))
3025                         RExC_parse++;
3026
3027                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3028                         vFAIL("Reference to nonexistent group");
3029                     RExC_sawback = 1;
3030                     ret = reganode(pRExC_state,
3031                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3032                                    num);
3033                     *flagp |= HASWIDTH;
3034                     
3035                     /* override incorrect value set in reganode MJD */
3036                     Set_Node_Offset(ret, parse_start+1); 
3037                     Set_Node_Cur_Length(ret); /* MJD */
3038                     RExC_parse--;
3039                     nextchar(pRExC_state);
3040                 }
3041             }
3042             break;
3043         case '\0':
3044             if (RExC_parse >= RExC_end)
3045                 FAIL("Trailing \\");
3046             /* FALL THROUGH */
3047         default:
3048             /* Do not generate `unrecognized' warnings here, we fall
3049                back into the quick-grab loop below */
3050             goto defchar;
3051         }
3052         break;
3053
3054     case '#':
3055         if (RExC_flags & PMf_EXTENDED) {
3056             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3057             if (RExC_parse < RExC_end)
3058                 goto tryagain;
3059         }
3060         /* FALL THROUGH */
3061
3062     default: {
3063             register STRLEN len;
3064             register UV ender;
3065             register char *p;
3066             char *oldp, *s;
3067             STRLEN numlen;
3068             STRLEN foldlen;
3069             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3070
3071             parse_start = RExC_parse - 1;
3072
3073             RExC_parse++;
3074
3075         defchar:
3076             ender = 0;
3077             ret = reg_node(pRExC_state,
3078                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3079             s = STRING(ret);
3080             for (len = 0, p = RExC_parse - 1;
3081               len < 127 && p < RExC_end;
3082               len++)
3083             {
3084                 oldp = p;
3085
3086                 if (RExC_flags & PMf_EXTENDED)
3087                     p = regwhite(p, RExC_end);
3088                 switch (*p) {
3089                 case '^':
3090                 case '$':
3091                 case '.':
3092                 case '[':
3093                 case '(':
3094                 case ')':
3095                 case '|':
3096                     goto loopdone;
3097                 case '\\':
3098                     switch (*++p) {
3099                     case 'A':
3100                     case 'C':
3101                     case 'X':
3102                     case 'G':
3103                     case 'Z':
3104                     case 'z':
3105                     case 'w':
3106                     case 'W':
3107                     case 'b':
3108                     case 'B':
3109                     case 's':
3110                     case 'S':
3111                     case 'd':
3112                     case 'D':
3113                     case 'p':
3114                     case 'P':
3115                         --p;
3116                         goto loopdone;
3117                     case 'n':
3118                         ender = '\n';
3119                         p++;
3120                         break;
3121                     case 'r':
3122                         ender = '\r';
3123                         p++;
3124                         break;
3125                     case 't':
3126                         ender = '\t';
3127                         p++;
3128                         break;
3129                     case 'f':
3130                         ender = '\f';
3131                         p++;
3132                         break;
3133                     case 'e':
3134                           ender = ASCII_TO_NATIVE('\033');
3135                         p++;
3136                         break;
3137                     case 'a':
3138                           ender = ASCII_TO_NATIVE('\007');
3139                         p++;
3140                         break;
3141                     case 'x':
3142                         if (*++p == '{') {
3143                             char* e = strchr(p, '}');
3144         
3145                             if (!e) {
3146                                 RExC_parse = p + 1;
3147                                 vFAIL("Missing right brace on \\x{}");
3148                             }
3149                             else {
3150                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3151                                     | PERL_SCAN_DISALLOW_PREFIX;
3152                                 numlen = e - p - 1;
3153                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3154                                 if (ender > 0xff)
3155                                     RExC_utf8 = 1;
3156                                 /* numlen is generous */
3157                                 if (numlen + len >= 127) {
3158                                     p--;
3159                                     goto loopdone;
3160                                 }
3161                                 p = e + 1;
3162                             }
3163                         }
3164                         else {
3165                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3166                             numlen = 2;
3167                             ender = grok_hex(p, &numlen, &flags, NULL);
3168                             p += numlen;
3169                         }
3170                         break;
3171                     case 'c':
3172                         p++;
3173                         ender = UCHARAT(p++);
3174                         ender = toCTRL(ender);
3175                         break;
3176                     case '0': case '1': case '2': case '3':case '4':
3177                     case '5': case '6': case '7': case '8':case '9':
3178                         if (*p == '0' ||
3179                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3180                             I32 flags = 0;
3181                             numlen = 3;
3182                             ender = grok_oct(p, &numlen, &flags, NULL);
3183                             p += numlen;
3184                         }
3185                         else {
3186                             --p;
3187                             goto loopdone;
3188                         }
3189                         break;
3190                     case '\0':
3191                         if (p >= RExC_end)
3192                             FAIL("Trailing \\");
3193                         /* FALL THROUGH */
3194                     default:
3195                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3196                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3197                         goto normal_default;
3198                     }
3199                     break;
3200                 default:
3201                   normal_default:
3202                     if (UTF8_IS_START(*p) && UTF) {
3203                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3204                                                &numlen, 0);
3205                         p += numlen;
3206                     }
3207                     else
3208                         ender = *p++;
3209                     break;
3210                 }
3211                 if (RExC_flags & PMf_EXTENDED)
3212                     p = regwhite(p, RExC_end);
3213                 if (UTF && FOLD) {
3214                     /* Prime the casefolded buffer. */
3215                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3216                 }
3217                 if (ISMULT2(p)) { /* Back off on ?+*. */
3218                     if (len)
3219                         p = oldp;
3220                     else if (UTF) {
3221                          STRLEN unilen;
3222
3223                          if (FOLD) {
3224                               /* Emit all the Unicode characters. */
3225                               for (foldbuf = tmpbuf;
3226                                    foldlen;
3227                                    foldlen -= numlen) {
3228                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3229                                    if (numlen > 0) {
3230                                         reguni(pRExC_state, ender, s, &unilen);
3231                                         s       += unilen;
3232                                         len     += unilen;
3233                                         /* In EBCDIC the numlen
3234                                          * and unilen can differ. */
3235                                         foldbuf += numlen;
3236                                         if (numlen >= foldlen)
3237                                              break;
3238                                    }
3239                                    else
3240                                         break; /* "Can't happen." */
3241                               }
3242                          }
3243                          else {
3244                               reguni(pRExC_state, ender, s, &unilen);
3245                               if (unilen > 0) {
3246                                    s   += unilen;
3247                                    len += unilen;
3248                               }
3249                          }
3250                     }
3251                     else {
3252                         len++;
3253                         REGC((char)ender, s++);
3254                     }
3255                     break;
3256                 }
3257                 if (UTF) {
3258                      STRLEN unilen;
3259
3260                      if (FOLD) {
3261                           /* Emit all the Unicode characters. */
3262                           for (foldbuf = tmpbuf;
3263                                foldlen;
3264                                foldlen -= numlen) {
3265                                ender = utf8_to_uvchr(foldbuf, &numlen);
3266                                if (numlen > 0) {
3267                                     reguni(pRExC_state, ender, s, &unilen);
3268                                     len     += unilen;
3269                                     s       += unilen;
3270                                     /* In EBCDIC the numlen
3271                                      * and unilen can differ. */
3272                                     foldbuf += numlen;
3273                                     if (numlen >= foldlen)
3274                                          break;
3275                                }
3276                                else
3277                                     break;
3278                           }
3279                      }
3280                      else {
3281                           reguni(pRExC_state, ender, s, &unilen);
3282                           if (unilen > 0) {
3283                                s   += unilen;
3284                                len += unilen;
3285                           }
3286                      }
3287                      len--;
3288                 }
3289                 else
3290                     REGC((char)ender, s++);
3291             }
3292         loopdone:
3293             RExC_parse = p - 1;
3294             Set_Node_Cur_Length(ret); /* MJD */
3295             nextchar(pRExC_state);
3296             {
3297                 /* len is STRLEN which is unsigned, need to copy to signed */
3298                 IV iv = len;
3299                 if (iv < 0)
3300                     vFAIL("Internal disaster");
3301             }
3302             if (len > 0)
3303                 *flagp |= HASWIDTH;
3304             if (len == 1)
3305                 *flagp |= SIMPLE;
3306             if (!SIZE_ONLY)
3307                 STR_LEN(ret) = len;
3308             if (SIZE_ONLY)
3309                 RExC_size += STR_SZ(len);
3310             else
3311                 RExC_emit += STR_SZ(len);
3312         }
3313         break;
3314     }
3315
3316     /* If the encoding pragma is in effect recode the text of
3317      * any EXACT-kind nodes. */
3318     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3319         STRLEN oldlen = STR_LEN(ret);
3320         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3321
3322         if (RExC_utf8)
3323             SvUTF8_on(sv);
3324         if (sv_utf8_downgrade(sv, TRUE)) {
3325             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3326             STRLEN newlen = SvCUR(sv);
3327
3328             if (SvUTF8(sv))
3329                 RExC_utf8 = 1;
3330             if (!SIZE_ONLY) {
3331                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3332                                       (int)oldlen, STRING(ret),
3333                                       (int)newlen, s));
3334                 Copy(s, STRING(ret), newlen, char);
3335                 STR_LEN(ret) += newlen - oldlen;
3336                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3337             } else
3338                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3339         }
3340     }
3341
3342     return(ret);
3343 }
3344
3345 STATIC char *
3346 S_regwhite(pTHX_ char *p, char *e)
3347 {
3348     while (p < e) {
3349         if (isSPACE(*p))
3350             ++p;
3351         else if (*p == '#') {
3352             do {
3353                 p++;
3354             } while (p < e && *p != '\n');
3355         }
3356         else
3357             break;
3358     }
3359     return p;
3360 }
3361
3362 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3363    Character classes ([:foo:]) can also be negated ([:^foo:]).
3364    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3365    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3366    but trigger failures because they are currently unimplemented. */
3367
3368 #define POSIXCC_DONE(c)   ((c) == ':')
3369 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3370 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3371
3372 STATIC I32
3373 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3374 {
3375     char *posixcc = 0;
3376     I32 namedclass = OOB_NAMEDCLASS;
3377
3378     if (value == '[' && RExC_parse + 1 < RExC_end &&
3379         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3380         POSIXCC(UCHARAT(RExC_parse))) {
3381         char  c = UCHARAT(RExC_parse);
3382         char* s = RExC_parse++;
3383         
3384         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3385             RExC_parse++;
3386         if (RExC_parse == RExC_end)
3387             /* Grandfather lone [:, [=, [. */
3388             RExC_parse = s;
3389         else {
3390             char* t = RExC_parse++; /* skip over the c */
3391
3392             if (UCHARAT(RExC_parse) == ']') {
3393                 RExC_parse++; /* skip over the ending ] */
3394                 posixcc = s + 1;
3395                 if (*s == ':') {
3396                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3397                     I32 skip = 5; /* the most common skip */
3398
3399                     switch (*posixcc) {
3400                     case 'a':
3401                         if (strnEQ(posixcc, "alnum", 5))
3402                             namedclass =
3403                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3404                         else if (strnEQ(posixcc, "alpha", 5))
3405                             namedclass =
3406                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3407                         else if (strnEQ(posixcc, "ascii", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3410                         break;
3411                     case 'b':
3412                         if (strnEQ(posixcc, "blank", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3415                         break;
3416                     case 'c':
3417                         if (strnEQ(posixcc, "cntrl", 5))
3418                             namedclass =
3419                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3420                         break;
3421                     case 'd':
3422                         if (strnEQ(posixcc, "digit", 5))
3423                             namedclass =
3424                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3425                         break;
3426                     case 'g':
3427                         if (strnEQ(posixcc, "graph", 5))
3428                             namedclass =
3429                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3430                         break;
3431                     case 'l':
3432                         if (strnEQ(posixcc, "lower", 5))
3433                             namedclass =
3434                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3435                         break;
3436                     case 'p':
3437                         if (strnEQ(posixcc, "print", 5))
3438                             namedclass =
3439                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3440                         else if (strnEQ(posixcc, "punct", 5))
3441                             namedclass =
3442                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3443                         break;
3444                     case 's':
3445                         if (strnEQ(posixcc, "space", 5))
3446                             namedclass =
3447                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3448                         break;
3449                     case 'u':
3450                         if (strnEQ(posixcc, "upper", 5))
3451                             namedclass =
3452                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3453                         break;
3454                     case 'w': /* this is not POSIX, this is the Perl \w */
3455                         if (strnEQ(posixcc, "word", 4)) {
3456                             namedclass =
3457                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3458                             skip = 4;
3459                         }
3460                         break;
3461                     case 'x':
3462                         if (strnEQ(posixcc, "xdigit", 6)) {
3463                             namedclass =
3464                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3465                             skip = 6;
3466                         }
3467                         break;
3468                     }
3469                     if (namedclass == OOB_NAMEDCLASS ||
3470                         posixcc[skip] != ':' ||
3471                         posixcc[skip+1] != ']')
3472                     {
3473                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3474                                       t - s - 1, s + 1);
3475                     }
3476                 } else if (!SIZE_ONLY) {
3477                     /* [[=foo=]] and [[.foo.]] are still future. */
3478
3479                     /* adjust RExC_parse so the warning shows after
3480                        the class closes */
3481                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3482                         RExC_parse++;
3483                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3484                 }
3485             } else {
3486                 /* Maternal grandfather:
3487                  * "[:" ending in ":" but not in ":]" */
3488                 RExC_parse = s;
3489             }
3490         }
3491     }
3492
3493     return namedclass;
3494 }
3495
3496 STATIC void
3497 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3498 {
3499     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3500         char *s = RExC_parse;
3501         char  c = *s++;
3502
3503         while(*s && isALNUM(*s))
3504             s++;
3505         if (*s && c == *s && s[1] == ']') {
3506             if (ckWARN(WARN_REGEXP))
3507                 vWARN3(s+2,
3508                         "POSIX syntax [%c %c] belongs inside character classes",
3509                         c, c);
3510
3511             /* [[=foo=]] and [[.foo.]] are still future. */
3512             if (POSIXCC_NOTYET(c)) {
3513                 /* adjust RExC_parse so the error shows after
3514                    the class closes */
3515                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3516                     ;
3517                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3518             }
3519         }
3520     }
3521 }
3522
3523 STATIC regnode *
3524 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3525 {
3526     register UV value;
3527     register UV nextvalue;
3528     register IV prevvalue = OOB_UNICODE;
3529     register IV range = 0;
3530     register regnode *ret;
3531     STRLEN numlen;
3532     IV namedclass;
3533     char *rangebegin = 0;
3534     bool need_class = 0;
3535     SV *listsv = Nullsv;
3536     register char *e;
3537     UV n;
3538     bool optimize_invert   = TRUE;
3539     AV* unicode_alternate  = 0;
3540 #ifdef EBCDIC
3541     UV literal_endpoint = 0;
3542 #endif
3543
3544     ret = reganode(pRExC_state, ANYOF, 0);
3545
3546     if (!SIZE_ONLY)
3547         ANYOF_FLAGS(ret) = 0;
3548
3549     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3550         RExC_naughty++;
3551         RExC_parse++;
3552         if (!SIZE_ONLY)
3553             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3554     }
3555
3556     if (SIZE_ONLY)
3557         RExC_size += ANYOF_SKIP;
3558     else {
3559         RExC_emit += ANYOF_SKIP;
3560         if (FOLD)
3561             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3562         if (LOC)
3563             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3564         ANYOF_BITMAP_ZERO(ret);
3565         listsv = newSVpvn("# comment\n", 10);
3566     }
3567
3568     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3569
3570     if (!SIZE_ONLY && POSIXCC(nextvalue))
3571         checkposixcc(pRExC_state);
3572
3573     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3574     if (UCHARAT(RExC_parse) == ']')
3575         goto charclassloop;
3576
3577     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3578
3579     charclassloop:
3580
3581         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3582
3583         if (!range)
3584             rangebegin = RExC_parse;
3585         if (UTF) {
3586             value = utf8n_to_uvchr((U8*)RExC_parse,
3587                                    RExC_end - RExC_parse,
3588                                    &numlen, 0);
3589             RExC_parse += numlen;
3590         }
3591         else
3592             value = UCHARAT(RExC_parse++);
3593         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3594         if (value == '[' && POSIXCC(nextvalue))
3595             namedclass = regpposixcc(pRExC_state, value);
3596         else if (value == '\\') {
3597             if (UTF) {
3598                 value = utf8n_to_uvchr((U8*)RExC_parse,
3599                                    RExC_end - RExC_parse,
3600                                    &numlen, 0);
3601                 RExC_parse += numlen;
3602             }
3603             else
3604                 value = UCHARAT(RExC_parse++);
3605             /* Some compilers cannot handle switching on 64-bit integer
3606              * values, therefore value cannot be an UV.  Yes, this will
3607              * be a problem later if we want switch on Unicode.
3608              * A similar issue a little bit later when switching on
3609              * namedclass. --jhi */
3610             switch ((I32)value) {
3611             case 'w':   namedclass = ANYOF_ALNUM;       break;
3612             case 'W':   namedclass = ANYOF_NALNUM;      break;
3613             case 's':   namedclass = ANYOF_SPACE;       break;
3614             case 'S':   namedclass = ANYOF_NSPACE;      break;
3615             case 'd':   namedclass = ANYOF_DIGIT;       break;
3616             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3617             case 'p':
3618             case 'P':
3619                 if (RExC_parse >= RExC_end)
3620                     vFAIL2("Empty \\%c{}", (U8)value);
3621                 if (*RExC_parse == '{') {
3622                     U8 c = (U8)value;
3623                     e = strchr(RExC_parse++, '}');
3624                     if (!e)
3625                         vFAIL2("Missing right brace on \\%c{}", c);
3626                     while (isSPACE(UCHARAT(RExC_parse)))
3627                         RExC_parse++;
3628                     if (e == RExC_parse)
3629                         vFAIL2("Empty \\%c{}", c);
3630                     n = e - RExC_parse;
3631                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3632                         n--;
3633                 }
3634                 else {
3635                     e = RExC_parse;
3636                     n = 1;
3637                 }
3638                 if (!SIZE_ONLY) {
3639                     if (UCHARAT(RExC_parse) == '^') {
3640                          RExC_parse++;
3641                          n--;
3642                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3643                          while (isSPACE(UCHARAT(RExC_parse))) {
3644                               RExC_parse++;
3645                               n--;
3646                          }
3647                     }
3648                     if (value == 'p')
3649                          Perl_sv_catpvf(aTHX_ listsv,
3650                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3651                     else
3652                          Perl_sv_catpvf(aTHX_ listsv,
3653                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3654                 }
3655                 RExC_parse = e + 1;
3656                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3657                 continue;
3658             case 'n':   value = '\n';                   break;
3659             case 'r':   value = '\r';                   break;
3660             case 't':   value = '\t';                   break;
3661             case 'f':   value = '\f';                   break;
3662             case 'b':   value = '\b';                   break;
3663             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3664             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3665             case 'x':
3666                 if (*RExC_parse == '{') {
3667                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3668                         | PERL_SCAN_DISALLOW_PREFIX;
3669                     e = strchr(RExC_parse++, '}');
3670                     if (!e)
3671                         vFAIL("Missing right brace on \\x{}");
3672
3673                     numlen = e - RExC_parse;
3674                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3675                     RExC_parse = e + 1;
3676                 }
3677                 else {
3678                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3679                     numlen = 2;
3680                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3681                     RExC_parse += numlen;
3682                 }
3683                 break;
3684             case 'c':
3685                 value = UCHARAT(RExC_parse++);
3686                 value = toCTRL(value);
3687                 break;
3688             case '0': case '1': case '2': case '3': case '4':
3689             case '5': case '6': case '7': case '8': case '9':
3690             {
3691                 I32 flags = 0;
3692                 numlen = 3;
3693                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3694                 RExC_parse += numlen;
3695                 break;
3696             }
3697             default:
3698                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3699                     vWARN2(RExC_parse,
3700                            "Unrecognized escape \\%c in character class passed through",
3701                            (int)value);
3702                 break;
3703             }
3704         } /* end of \blah */
3705 #ifdef EBCDIC
3706         else
3707             literal_endpoint++;
3708 #endif
3709
3710         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3711
3712             if (!SIZE_ONLY && !need_class)
3713                 ANYOF_CLASS_ZERO(ret);
3714
3715             need_class = 1;
3716
3717             /* a bad range like a-\d, a-[:digit:] ? */
3718             if (range) {
3719                 if (!SIZE_ONLY) {
3720                     if (ckWARN(WARN_REGEXP))
3721                         vWARN4(RExC_parse,
3722                                "False [] range \"%*.*s\"",
3723                                RExC_parse - rangebegin,
3724                                RExC_parse - rangebegin,
3725                                rangebegin);
3726                     if (prevvalue < 256) {
3727                         ANYOF_BITMAP_SET(ret, prevvalue);
3728                         ANYOF_BITMAP_SET(ret, '-');
3729                     }
3730                     else {
3731                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3732                         Perl_sv_catpvf(aTHX_ listsv,
3733                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3734                     }
3735                 }
3736
3737                 range = 0; /* this was not a true range */
3738             }
3739
3740             if (!SIZE_ONLY) {
3741                 if (namedclass > OOB_NAMEDCLASS)
3742                     optimize_invert = FALSE;
3743                 /* Possible truncation here but in some 64-bit environments
3744                  * the compiler gets heartburn about switch on 64-bit values.
3745                  * A similar issue a little earlier when switching on value.
3746                  * --jhi */
3747                 switch ((I32)namedclass) {
3748                 case ANYOF_ALNUM:
3749                     if (LOC)
3750                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3751                     else {
3752                         for (value = 0; value < 256; value++)
3753                             if (isALNUM(value))
3754                                 ANYOF_BITMAP_SET(ret, value);
3755                     }
3756                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3757                     break;
3758                 case ANYOF_NALNUM:
3759                     if (LOC)
3760                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3761                     else {
3762                         for (value = 0; value < 256; value++)
3763                             if (!isALNUM(value))
3764                                 ANYOF_BITMAP_SET(ret, value);
3765                     }
3766                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3767                     break;
3768                 case ANYOF_ALNUMC:
3769                     if (LOC)
3770                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3771                     else {
3772                         for (value = 0; value < 256; value++)
3773                             if (isALNUMC(value))
3774                                 ANYOF_BITMAP_SET(ret, value);
3775                     }
3776                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3777                     break;
3778                 case ANYOF_NALNUMC:
3779                     if (LOC)
3780                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3781                     else {
3782                         for (value = 0; value < 256; value++)
3783                             if (!isALNUMC(value))
3784                                 ANYOF_BITMAP_SET(ret, value);
3785                     }
3786                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3787                     break;
3788                 case ANYOF_ALPHA:
3789                     if (LOC)
3790                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3791                     else {
3792                         for (value = 0; value < 256; value++)
3793                             if (isALPHA(value))
3794                                 ANYOF_BITMAP_SET(ret, value);
3795                     }
3796                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3797                     break;
3798                 case ANYOF_NALPHA:
3799                     if (LOC)
3800                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3801                     else {
3802                         for (value = 0; value < 256; value++)
3803                             if (!isALPHA(value))
3804                                 ANYOF_BITMAP_SET(ret, value);
3805                     }
3806                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3807                     break;
3808                 case ANYOF_ASCII:
3809                     if (LOC)
3810                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3811                     else {
3812 #ifndef EBCDIC
3813                         for (value = 0; value < 128; value++)
3814                             ANYOF_BITMAP_SET(ret, value);
3815 #else  /* EBCDIC */
3816                         for (value = 0; value < 256; value++) {
3817                             if (isASCII(value))
3818                                 ANYOF_BITMAP_SET(ret, value);
3819                         }
3820 #endif /* EBCDIC */
3821                     }
3822                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3823                     break;
3824                 case ANYOF_NASCII:
3825                     if (LOC)
3826                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3827                     else {
3828 #ifndef EBCDIC
3829                         for (value = 128; value < 256; value++)
3830                             ANYOF_BITMAP_SET(ret, value);
3831 #else  /* EBCDIC */
3832                         for (value = 0; value < 256; value++) {
3833                             if (!isASCII(value))
3834                                 ANYOF_BITMAP_SET(ret, value);
3835                         }
3836 #endif /* EBCDIC */
3837                     }
3838                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3839                     break;
3840                 case ANYOF_BLANK:
3841                     if (LOC)
3842                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3843                     else {
3844                         for (value = 0; value < 256; value++)
3845                             if (isBLANK(value))
3846                                 ANYOF_BITMAP_SET(ret, value);
3847                     }
3848                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3849                     break;
3850                 case ANYOF_NBLANK:
3851                     if (LOC)
3852                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3853                     else {
3854                         for (value = 0; value < 256; value++)
3855                             if (!isBLANK(value))
3856                                 ANYOF_BITMAP_SET(ret, value);
3857                     }
3858                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3859                     break;
3860                 case ANYOF_CNTRL:
3861                     if (LOC)
3862                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3863                     else {
3864                         for (value = 0; value < 256; value++)
3865                             if (isCNTRL(value))
3866                                 ANYOF_BITMAP_SET(ret, value);
3867                     }
3868                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3869                     break;
3870                 case ANYOF_NCNTRL:
3871                     if (LOC)
3872                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3873                     else {
3874                         for (value = 0; value < 256; value++)
3875                             if (!isCNTRL(value))
3876                                 ANYOF_BITMAP_SET(ret, value);
3877                     }
3878                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3879                     break;
3880                 case ANYOF_DIGIT:
3881                     if (LOC)
3882                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3883                     else {
3884                         /* consecutive digits assumed */
3885                         for (value = '0'; value <= '9'; value++)
3886                             ANYOF_BITMAP_SET(ret, value);
3887                     }
3888                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3889                     break;
3890                 case ANYOF_NDIGIT:
3891                     if (LOC)
3892                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3893                     else {
3894                         /* consecutive digits assumed */
3895                         for (value = 0; value < '0'; value++)
3896                             ANYOF_BITMAP_SET(ret, value);
3897                         for (value = '9' + 1; value < 256; value++)
3898                             ANYOF_BITMAP_SET(ret, value);
3899                     }
3900                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3901                     break;
3902                 case ANYOF_GRAPH:
3903                     if (LOC)
3904                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3905                     else {
3906                         for (value = 0; value < 256; value++)
3907                             if (isGRAPH(value))
3908                                 ANYOF_BITMAP_SET(ret, value);
3909                     }
3910                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3911                     break;
3912                 case ANYOF_NGRAPH:
3913                     if (LOC)
3914                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3915                     else {
3916                         for (value = 0; value < 256; value++)
3917                             if (!isGRAPH(value))
3918                                 ANYOF_BITMAP_SET(ret, value);
3919                     }
3920                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3921                     break;
3922                 case ANYOF_LOWER:
3923                     if (LOC)
3924                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3925                     else {
3926                         for (value = 0; value < 256; value++)
3927                             if (isLOWER(value))
3928                                 ANYOF_BITMAP_SET(ret, value);
3929                     }
3930                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3931                     break;
3932                 case ANYOF_NLOWER:
3933                     if (LOC)
3934                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3935                     else {
3936                         for (value = 0; value < 256; value++)
3937                             if (!isLOWER(value))
3938                                 ANYOF_BITMAP_SET(ret, value);
3939                     }
3940                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3941                     break;
3942                 case ANYOF_PRINT:
3943                     if (LOC)
3944                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3945                     else {
3946                         for (value = 0; value < 256; value++)
3947                             if (isPRINT(value))
3948                                 ANYOF_BITMAP_SET(ret, value);
3949                     }
3950                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3951                     break;
3952                 case ANYOF_NPRINT:
3953                     if (LOC)
3954                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3955                     else {
3956                         for (value = 0; value < 256; value++)
3957                             if (!isPRINT(value))
3958                                 ANYOF_BITMAP_SET(ret, value);
3959                     }
3960                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3961                     break;
3962                 case ANYOF_PSXSPC:
3963                     if (LOC)
3964                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3965                     else {
3966                         for (value = 0; value < 256; value++)
3967                             if (isPSXSPC(value))
3968                                 ANYOF_BITMAP_SET(ret, value);
3969                     }
3970                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3971                     break;
3972                 case ANYOF_NPSXSPC:
3973                     if (LOC)
3974                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3975                     else {
3976                         for (value = 0; value < 256; value++)
3977                             if (!isPSXSPC(value))
3978                                 ANYOF_BITMAP_SET(ret, value);
3979                     }
3980                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3981                     break;
3982                 case ANYOF_PUNCT:
3983                     if (LOC)
3984                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3985                     else {
3986                         for (value = 0; value < 256; value++)
3987                             if (isPUNCT(value))
3988                                 ANYOF_BITMAP_SET(ret, value);
3989                     }
3990                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3991                     break;
3992                 case ANYOF_NPUNCT:
3993                     if (LOC)
3994                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3995                     else {
3996                         for (value = 0; value < 256; value++)
3997                             if (!isPUNCT(value))
3998                                 ANYOF_BITMAP_SET(ret, value);
3999                     }
4000                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4001                     break;
4002                 case ANYOF_SPACE:
4003                     if (LOC)
4004                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4005                     else {
4006                         for (value = 0; value < 256; value++)
4007                             if (isSPACE(value))
4008                                 ANYOF_BITMAP_SET(ret, value);
4009                     }
4010                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4011                     break;
4012                 case ANYOF_NSPACE:
4013                     if (LOC)
4014                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4015                     else {
4016                         for (value = 0; value < 256; value++)
4017                             if (!isSPACE(value))
4018                                 ANYOF_BITMAP_SET(ret, value);
4019                     }
4020                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4021                     break;
4022                 case ANYOF_UPPER:
4023                     if (LOC)
4024                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4025                     else {
4026                         for (value = 0; value < 256; value++)
4027                             if (isUPPER(value))
4028                                 ANYOF_BITMAP_SET(ret, value);
4029                     }
4030                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4031                     break;
4032                 case ANYOF_NUPPER:
4033                     if (LOC)
4034                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4035                     else {
4036                         for (value = 0; value < 256; value++)
4037                             if (!isUPPER(value))
4038                                 ANYOF_BITMAP_SET(ret, value);
4039                     }
4040                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4041                     break;
4042                 case ANYOF_XDIGIT:
4043                     if (LOC)
4044                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4045                     else {
4046                         for (value = 0; value < 256; value++)
4047                             if (isXDIGIT(value))
4048                                 ANYOF_BITMAP_SET(ret, value);
4049                     }
4050                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4051                     break;
4052                 case ANYOF_NXDIGIT:
4053                     if (LOC)
4054                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4055                     else {
4056                         for (value = 0; value < 256; value++)
4057                             if (!isXDIGIT(value))
4058                                 ANYOF_BITMAP_SET(ret, value);
4059                     }
4060                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4061                     break;
4062                 default:
4063                     vFAIL("Invalid [::] class");
4064                     break;
4065                 }
4066                 if (LOC)
4067                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4068                 continue;
4069             }
4070         } /* end of namedclass \blah */
4071
4072         if (range) {
4073             if (prevvalue > (IV)value) /* b-a */ {
4074                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4075                               RExC_parse - rangebegin,
4076                               RExC_parse - rangebegin,
4077                               rangebegin);
4078                 range = 0; /* not a valid range */
4079             }
4080         }
4081         else {
4082             prevvalue = value; /* save the beginning of the range */
4083             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4084                 RExC_parse[1] != ']') {
4085                 RExC_parse++;
4086
4087                 /* a bad range like \w-, [:word:]- ? */
4088                 if (namedclass > OOB_NAMEDCLASS) {
4089                     if (ckWARN(WARN_REGEXP))
4090                         vWARN4(RExC_parse,
4091                                "False [] range \"%*.*s\"",
4092                                RExC_parse - rangebegin,
4093                                RExC_parse - rangebegin,
4094                                rangebegin);
4095                     if (!SIZE_ONLY)
4096                         ANYOF_BITMAP_SET(ret, '-');
4097                 } else
4098                     range = 1;  /* yeah, it's a range! */
4099                 continue;       /* but do it the next time */
4100             }
4101         }
4102
4103         /* now is the next time */
4104         if (!SIZE_ONLY) {
4105             IV i;
4106
4107             if (prevvalue < 256) {
4108                 IV ceilvalue = value < 256 ? value : 255;
4109
4110 #ifdef EBCDIC
4111                 /* In EBCDIC [\x89-\x91] should include
4112                  * the \x8e but [i-j] should not. */
4113                 if (literal_endpoint == 2 &&
4114                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4115                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4116                 {
4117                     if (isLOWER(prevvalue)) {
4118                         for (i = prevvalue; i <= ceilvalue; i++)
4119                             if (isLOWER(i))
4120                                 ANYOF_BITMAP_SET(ret, i);
4121                     } else {
4122                         for (i = prevvalue; i <= ceilvalue; i++)
4123                             if (isUPPER(i))
4124                                 ANYOF_BITMAP_SET(ret, i);
4125                     }
4126                 }
4127                 else
4128 #endif
4129                       for (i = prevvalue; i <= ceilvalue; i++)
4130                           ANYOF_BITMAP_SET(ret, i);
4131           }
4132           if (value > 255 || UTF) {
4133                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4134                 UV natvalue      = NATIVE_TO_UNI(value);
4135
4136                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4137                 if (prevnatvalue < natvalue) { /* what about > ? */
4138                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4139                                    prevnatvalue, natvalue);
4140                 }
4141                 else if (prevnatvalue == natvalue) {
4142                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4143                     if (FOLD) {
4144                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4145                          STRLEN foldlen;
4146                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4147
4148                          /* If folding and foldable and a single
4149                           * character, insert also the folded version
4150                           * to the charclass. */
4151                          if (f != value) {
4152                               if (foldlen == (STRLEN)UNISKIP(f))
4153                                   Perl_sv_catpvf(aTHX_ listsv,
4154                                                  "%04"UVxf"\n", f);
4155                               else {
4156                                   /* Any multicharacter foldings
4157                                    * require the following transform:
4158                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4159                                    * where E folds into "pq" and F folds
4160                                    * into "rst", all other characters
4161                                    * fold to single characters.  We save
4162                                    * away these multicharacter foldings,
4163                                    * to be later saved as part of the
4164                                    * additional "s" data. */
4165                                   SV *sv;
4166
4167                                   if (!unicode_alternate)
4168                                       unicode_alternate = newAV();
4169                                   sv = newSVpvn((char*)foldbuf, foldlen);
4170                                   SvUTF8_on(sv);
4171                                   av_push(unicode_alternate, sv);
4172                               }
4173                          }
4174
4175                          /* If folding and the value is one of the Greek
4176                           * sigmas insert a few more sigmas to make the
4177                           * folding rules of the sigmas to work right.
4178                           * Note that not all the possible combinations
4179                           * are handled here: some of them are handled
4180                           * by the standard folding rules, and some of
4181                           * them (literal or EXACTF cases) are handled
4182                           * during runtime in regexec.c:S_find_byclass(). */
4183                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4184                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4185                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4186                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4187                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4188                          }
4189                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4190                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4191                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4192                     }
4193                 }
4194             }
4195 #ifdef EBCDIC
4196             literal_endpoint = 0;
4197 #endif
4198         }
4199
4200         range = 0; /* this range (if it was one) is done now */
4201     }
4202
4203     if (need_class) {
4204         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4205         if (SIZE_ONLY)
4206             RExC_size += ANYOF_CLASS_ADD_SKIP;
4207         else
4208             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4209     }
4210
4211     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4212     if (!SIZE_ONLY &&
4213          /* If the only flag is folding (plus possibly inversion). */
4214         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4215        ) {
4216         for (value = 0; value < 256; ++value) {
4217             if (ANYOF_BITMAP_TEST(ret, value)) {
4218                 UV fold = PL_fold[value];
4219
4220                 if (fold != value)
4221                     ANYOF_BITMAP_SET(ret, fold);
4222             }
4223         }
4224         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4225     }
4226
4227     /* optimize inverted simple patterns (e.g. [^a-z]) */
4228     if (!SIZE_ONLY && optimize_invert &&
4229         /* If the only flag is inversion. */
4230         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4231         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4232             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4233         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4234     }
4235
4236     if (!SIZE_ONLY) {
4237         AV *av = newAV();
4238         SV *rv;
4239
4240         /* The 0th element stores the character class description
4241          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4242          * to initialize the appropriate swash (which gets stored in
4243          * the 1st element), and also useful for dumping the regnode.
4244          * The 2nd element stores the multicharacter foldings,
4245          * used later (regexec.c:S_reginclass()). */
4246         av_store(av, 0, listsv);
4247         av_store(av, 1, NULL);
4248         av_store(av, 2, (SV*)unicode_alternate);
4249         rv = newRV_noinc((SV*)av);
4250         n = add_data(pRExC_state, 1, "s");
4251         RExC_rx->data->data[n] = (void*)rv;
4252         ARG_SET(ret, n);
4253     }
4254
4255     return ret;
4256 }
4257
4258 STATIC char*
4259 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4260 {
4261     char* retval = RExC_parse++;
4262
4263     for (;;) {
4264         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4265                 RExC_parse[2] == '#') {
4266             while (*RExC_parse && *RExC_parse != ')')
4267                 RExC_parse++;
4268             RExC_parse++;
4269             continue;
4270         }
4271         if (RExC_flags & PMf_EXTENDED) {
4272             if (isSPACE(*RExC_parse)) {
4273                 RExC_parse++;
4274                 continue;
4275             }
4276             else if (*RExC_parse == '#') {
4277                 while (*RExC_parse && *RExC_parse != '\n')
4278                     RExC_parse++;
4279                 RExC_parse++;
4280                 continue;
4281             }
4282         }
4283         return retval;
4284     }
4285 }
4286
4287 /*
4288 - reg_node - emit a node
4289 */
4290 STATIC regnode *                        /* Location. */
4291 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4292 {
4293     register regnode *ret;
4294     register regnode *ptr;
4295
4296     ret = RExC_emit;
4297     if (SIZE_ONLY) {
4298         SIZE_ALIGN(RExC_size);
4299         RExC_size += 1;
4300         return(ret);
4301     }
4302
4303     NODE_ALIGN_FILL(ret);
4304     ptr = ret;
4305     FILL_ADVANCE_NODE(ptr, op);
4306     if (RExC_offsets) {         /* MJD */
4307         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4308               "reg_node", __LINE__, 
4309               reg_name[op],
4310               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4311               ? "Overwriting end of array!\n" : "OK",
4312               RExC_emit - RExC_emit_start,
4313               RExC_parse - RExC_start,
4314               RExC_offsets[0])); 
4315         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4316     }
4317             
4318     RExC_emit = ptr;
4319
4320     return(ret);
4321 }
4322
4323 /*
4324 - reganode - emit a node with an argument
4325 */
4326 STATIC regnode *                        /* Location. */
4327 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4328 {
4329     register regnode *ret;
4330     register regnode *ptr;
4331
4332     ret = RExC_emit;
4333     if (SIZE_ONLY) {
4334         SIZE_ALIGN(RExC_size);
4335         RExC_size += 2;
4336         return(ret);
4337     }
4338
4339     NODE_ALIGN_FILL(ret);
4340     ptr = ret;
4341     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4342     if (RExC_offsets) {         /* MJD */
4343         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4344               "reganode",
4345               __LINE__,
4346               reg_name[op],
4347               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4348               "Overwriting end of array!\n" : "OK",
4349               RExC_emit - RExC_emit_start,
4350               RExC_parse - RExC_start,
4351               RExC_offsets[0])); 
4352         Set_Cur_Node_Offset;
4353     }
4354             
4355     RExC_emit = ptr;
4356
4357     return(ret);
4358 }
4359
4360 /*
4361 - reguni - emit (if appropriate) a Unicode character
4362 */
4363 STATIC void
4364 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4365 {
4366     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4367 }
4368
4369 /*
4370 - reginsert - insert an operator in front of already-emitted operand
4371 *
4372 * Means relocating the operand.
4373 */
4374 STATIC void
4375 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4376 {
4377     register regnode *src;
4378     register regnode *dst;
4379     register regnode *place;
4380     register int offset = regarglen[(U8)op];
4381
4382 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4383
4384     if (SIZE_ONLY) {
4385         RExC_size += NODE_STEP_REGNODE + offset;
4386         return;
4387     }
4388
4389     src = RExC_emit;
4390     RExC_emit += NODE_STEP_REGNODE + offset;
4391     dst = RExC_emit;
4392     while (src > opnd) {
4393         StructCopy(--src, --dst, regnode);
4394         if (RExC_offsets) {     /* MJD 20010112 */
4395             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4396                   "reg_insert",
4397                   __LINE__,
4398                   reg_name[op],
4399                   dst - RExC_emit_start > RExC_offsets[0] 
4400                   ? "Overwriting end of array!\n" : "OK",
4401                   src - RExC_emit_start,
4402                   dst - RExC_emit_start,
4403                   RExC_offsets[0])); 
4404             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4405             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4406         }
4407     }
4408     
4409
4410     place = opnd;               /* Op node, where operand used to be. */
4411     if (RExC_offsets) {         /* MJD */
4412         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4413               "reginsert",
4414               __LINE__,
4415               reg_name[op],
4416               place - RExC_emit_start > RExC_offsets[0] 
4417               ? "Overwriting end of array!\n" : "OK",
4418               place - RExC_emit_start,
4419               RExC_parse - RExC_start,
4420               RExC_offsets[0])); 
4421         Set_Node_Offset(place, RExC_parse);
4422     }
4423     src = NEXTOPER(place);
4424     FILL_ADVANCE_NODE(place, op);
4425     Zero(src, offset, regnode);
4426 }
4427
4428 /*
4429 - regtail - set the next-pointer at the end of a node chain of p to val.
4430 */
4431 STATIC void
4432 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4433 {
4434     register regnode *scan;
4435     register regnode *temp;
4436
4437     if (SIZE_ONLY)
4438         return;
4439
4440     /* Find last node. */
4441     scan = p;
4442     for (;;) {
4443         temp = regnext(scan);
4444         if (temp == NULL)
4445             break;
4446         scan = temp;
4447     }
4448
4449     if (reg_off_by_arg[OP(scan)]) {
4450         ARG_SET(scan, val - scan);
4451     }
4452     else {
4453         NEXT_OFF(scan) = val - scan;
4454     }
4455 }
4456
4457 /*
4458 - regoptail - regtail on operand of first argument; nop if operandless
4459 */
4460 STATIC void
4461 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4462 {
4463     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4464     if (p == NULL || SIZE_ONLY)
4465         return;
4466     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4467         regtail(pRExC_state, NEXTOPER(p), val);
4468     }
4469     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4470         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4471     }
4472     else
4473         return;
4474 }
4475
4476 /*
4477  - regcurly - a little FSA that accepts {\d+,?\d*}
4478  */
4479 STATIC I32
4480 S_regcurly(pTHX_ register char *s)
4481 {
4482     if (*s++ != '{')
4483         return FALSE;
4484     if (!isDIGIT(*s))
4485         return FALSE;
4486     while (isDIGIT(*s))
4487         s++;
4488     if (*s == ',')
4489         s++;
4490     while (isDIGIT(*s))
4491         s++;
4492     if (*s != '}')
4493         return FALSE;
4494     return TRUE;
4495 }
4496
4497
4498 #ifdef DEBUGGING
4499
4500 STATIC regnode *
4501 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4502 {
4503     register U8 op = EXACT;     /* Arbitrary non-END op. */
4504     register regnode *next;
4505
4506     while (op != END && (!last || node < last)) {
4507         /* While that wasn't END last time... */
4508
4509         NODE_ALIGN(node);
4510         op = OP(node);
4511         if (op == CLOSE)
4512             l--;        
4513         next = regnext(node);
4514         /* Where, what. */
4515         if (OP(node) == OPTIMIZED)
4516             goto after_print;
4517         regprop(sv, node);
4518         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4519                       (int)(2*l + 1), "", SvPVX(sv));
4520         if (next == NULL)               /* Next ptr. */
4521             PerlIO_printf(Perl_debug_log, "(0)");
4522         else
4523             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4524         (void)PerlIO_putc(Perl_debug_log, '\n');
4525       after_print:
4526         if (PL_regkind[(U8)op] == BRANCHJ) {
4527             register regnode *nnode = (OP(next) == LONGJMP
4528                                        ? regnext(next)
4529                                        : next);
4530             if (last && nnode > last)
4531                 nnode = last;
4532             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4533         }
4534         else if (PL_regkind[(U8)op] == BRANCH) {
4535             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4536         }
4537         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4538             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4539                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4540         }
4541         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4542             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4543                              next, sv, l + 1);
4544         }
4545         else if ( op == PLUS || op == STAR) {
4546             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4547         }
4548         else if (op == ANYOF) {
4549             /* arglen 1 + class block */
4550             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4551                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4552             node = NEXTOPER(node);
4553         }
4554         else if (PL_regkind[(U8)op] == EXACT) {
4555             /* Literal string, where present. */
4556             node += NODE_SZ_STR(node) - 1;
4557             node = NEXTOPER(node);
4558         }
4559         else {
4560             node = NEXTOPER(node);
4561             node += regarglen[(U8)op];
4562         }
4563         if (op == CURLYX || op == OPEN)
4564             l++;
4565         else if (op == WHILEM)
4566             l--;
4567     }
4568     return node;
4569 }
4570
4571 #endif  /* DEBUGGING */
4572
4573 /*
4574  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4575  */
4576 void
4577 Perl_regdump(pTHX_ regexp *r)
4578 {
4579 #ifdef DEBUGGING
4580     SV *sv = sv_newmortal();
4581
4582     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4583
4584     /* Header fields of interest. */
4585     if (r->anchored_substr)
4586         PerlIO_printf(Perl_debug_log,
4587                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4588                       PL_colors[0],
4589                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4590                       SvPVX(r->anchored_substr),
4591                       PL_colors[1],
4592                       SvTAIL(r->anchored_substr) ? "$" : "",
4593                       (IV)r->anchored_offset);
4594     else if (r->anchored_utf8)
4595         PerlIO_printf(Perl_debug_log,
4596                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4597                       PL_colors[0],
4598                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4599                       SvPVX(r->anchored_utf8),
4600                       PL_colors[1],
4601                       SvTAIL(r->anchored_utf8) ? "$" : "",
4602                       (IV)r->anchored_offset);
4603     if (r->float_substr)
4604         PerlIO_printf(Perl_debug_log,
4605                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4606                       PL_colors[0],
4607                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4608                       SvPVX(r->float_substr),
4609                       PL_colors[1],
4610                       SvTAIL(r->float_substr) ? "$" : "",
4611                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4612     else if (r->float_utf8)
4613         PerlIO_printf(Perl_debug_log,
4614                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4615                       PL_colors[0],
4616                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4617                       SvPVX(r->float_utf8),
4618                       PL_colors[1],
4619                       SvTAIL(r->float_utf8) ? "$" : "",
4620                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4621     if (r->check_substr || r->check_utf8)
4622         PerlIO_printf(Perl_debug_log,
4623                       r->check_substr == r->float_substr
4624                       && r->check_utf8 == r->float_utf8
4625                       ? "(checking floating" : "(checking anchored");
4626     if (r->reganch & ROPT_NOSCAN)
4627         PerlIO_printf(Perl_debug_log, " noscan");
4628     if (r->reganch & ROPT_CHECK_ALL)
4629         PerlIO_printf(Perl_debug_log, " isall");
4630     if (r->check_substr || r->check_utf8)
4631         PerlIO_printf(Perl_debug_log, ") ");
4632
4633     if (r->regstclass) {
4634         regprop(sv, r->regstclass);
4635         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4636     }
4637     if (r->reganch & ROPT_ANCH) {
4638         PerlIO_printf(Perl_debug_log, "anchored");
4639         if (r->reganch & ROPT_ANCH_BOL)
4640             PerlIO_printf(Perl_debug_log, "(BOL)");
4641         if (r->reganch & ROPT_ANCH_MBOL)
4642             PerlIO_printf(Perl_debug_log, "(MBOL)");
4643         if (r->reganch & ROPT_ANCH_SBOL)
4644             PerlIO_printf(Perl_debug_log, "(SBOL)");
4645         if (r->reganch & ROPT_ANCH_GPOS)
4646             PerlIO_printf(Perl_debug_log, "(GPOS)");
4647         PerlIO_putc(Perl_debug_log, ' ');
4648     }
4649     if (r->reganch & ROPT_GPOS_SEEN)
4650         PerlIO_printf(Perl_debug_log, "GPOS ");
4651     if (r->reganch & ROPT_SKIP)
4652         PerlIO_printf(Perl_debug_log, "plus ");
4653     if (r->reganch & ROPT_IMPLICIT)
4654         PerlIO_printf(Perl_debug_log, "implicit ");
4655     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4656     if (r->reganch & ROPT_EVAL_SEEN)
4657         PerlIO_printf(Perl_debug_log, "with eval ");
4658     PerlIO_printf(Perl_debug_log, "\n");
4659     if (r->offsets) {
4660       U32 i;
4661       U32 len = r->offsets[0];
4662       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4663       for (i = 1; i <= len; i++)
4664         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4665                       (UV)r->offsets[i*2-1], 
4666                       (UV)r->offsets[i*2]);
4667       PerlIO_printf(Perl_debug_log, "\n");
4668     }
4669 #endif  /* DEBUGGING */
4670 }
4671
4672 #ifdef DEBUGGING
4673
4674 STATIC void
4675 S_put_byte(pTHX_ SV *sv, int c)
4676 {
4677     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4678         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4679     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4680         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4681     else
4682         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4683 }
4684
4685 #endif  /* DEBUGGING */
4686
4687 /*
4688 - regprop - printable representation of opcode
4689 */
4690 void
4691 Perl_regprop(pTHX_ SV *sv, regnode *o)
4692 {
4693 #ifdef DEBUGGING
4694     register int k;
4695
4696     sv_setpvn(sv, "", 0);
4697     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4698         /* It would be nice to FAIL() here, but this may be called from
4699            regexec.c, and it would be hard to supply pRExC_state. */
4700         Perl_croak(aTHX_ "Corrupted regexp opcode");
4701     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4702
4703     k = PL_regkind[(U8)OP(o)];
4704
4705     if (k == EXACT) {
4706         SV *dsv = sv_2mortal(newSVpvn("", 0));
4707         /* Using is_utf8_string() is a crude hack but it may
4708          * be the best for now since we have no flag "this EXACTish
4709          * node was UTF-8" --jhi */
4710         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4711         char *s    = do_utf8 ?
4712           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4713                          UNI_DISPLAY_REGEX) :
4714           STRING(o);
4715         int len = do_utf8 ?
4716           strlen(s) :
4717           STR_LEN(o);
4718         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4719                        PL_colors[0],
4720                        len, s,
4721                        PL_colors[1]);
4722     }
4723     else if (k == CURLY) {
4724         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4725             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4726         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4727     }
4728     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4729         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4730     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4731         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4732     else if (k == LOGICAL)
4733         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4734     else if (k == ANYOF) {
4735         int i, rangestart = -1;
4736         U8 flags = ANYOF_FLAGS(o);
4737         const char * const anyofs[] = { /* Should be synchronized with
4738                                          * ANYOF_ #xdefines in regcomp.h */
4739             "\\w",
4740             "\\W",
4741             "\\s",
4742             "\\S",
4743             "\\d",
4744             "\\D",
4745             "[:alnum:]",
4746             "[:^alnum:]",
4747             "[:alpha:]",
4748             "[:^alpha:]",
4749             "[:ascii:]",
4750             "[:^ascii:]",
4751             "[:ctrl:]",
4752             "[:^ctrl:]",
4753             "[:graph:]",
4754             "[:^graph:]",
4755             "[:lower:]",
4756             "[:^lower:]",
4757             "[:print:]",
4758             "[:^print:]",
4759             "[:punct:]",
4760             "[:^punct:]",
4761             "[:upper:]",
4762             "[:^upper:]",
4763             "[:xdigit:]",
4764             "[:^xdigit:]",
4765             "[:space:]",
4766             "[:^space:]",
4767             "[:blank:]",
4768             "[:^blank:]"
4769         };
4770
4771         if (flags & ANYOF_LOCALE)
4772             sv_catpv(sv, "{loc}");
4773         if (flags & ANYOF_FOLD)
4774             sv_catpv(sv, "{i}");
4775         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4776         if (flags & ANYOF_INVERT)
4777             sv_catpv(sv, "^");
4778         for (i = 0; i <= 256; i++) {
4779             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4780                 if (rangestart == -1)
4781                     rangestart = i;
4782             } else if (rangestart != -1) {
4783                 if (i <= rangestart + 3)
4784                     for (; rangestart < i; rangestart++)
4785                         put_byte(sv, rangestart);
4786                 else {
4787                     put_byte(sv, rangestart);
4788                     sv_catpv(sv, "-");
4789                     put_byte(sv, i - 1);
4790                 }
4791                 rangestart = -1;
4792             }
4793         }
4794
4795         if (o->flags & ANYOF_CLASS)
4796             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4797                 if (ANYOF_CLASS_TEST(o,i))
4798                     sv_catpv(sv, anyofs[i]);
4799
4800         if (flags & ANYOF_UNICODE)
4801             sv_catpv(sv, "{unicode}");
4802         else if (flags & ANYOF_UNICODE_ALL)
4803             sv_catpv(sv, "{unicode_all}");
4804
4805         {
4806             SV *lv;
4807             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4808         
4809             if (lv) {
4810                 if (sw) {
4811                     U8 s[UTF8_MAXLEN+1];
4812                 
4813                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4814                         U8 *e = uvchr_to_utf8(s, i);
4815                         
4816                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4817                             if (rangestart == -1)
4818                                 rangestart = i;
4819                         } else if (rangestart != -1) {
4820                             U8 *p;
4821                         
4822                             if (i <= rangestart + 3)
4823                                 for (; rangestart < i; rangestart++) {
4824                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4825                                         put_byte(sv, *p);
4826                                 }
4827                             else {
4828                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4829                                     put_byte(sv, *p);
4830                                 sv_catpv(sv, "-");
4831                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4832                                         put_byte(sv, *p);
4833                                 }
4834                                 rangestart = -1;
4835                             }
4836                         }
4837                         
4838                     sv_catpv(sv, "..."); /* et cetera */
4839                 }
4840
4841                 {
4842                     char *s = savepv(SvPVX(lv));
4843                     char *origs = s;
4844                 
4845                     while(*s && *s != '\n') s++;
4846                 
4847                     if (*s == '\n') {
4848                         char *t = ++s;
4849                         
4850                         while (*s) {
4851                             if (*s == '\n')
4852                                 *s = ' ';
4853                             s++;
4854                         }
4855                         if (s[-1] == ' ')
4856                             s[-1] = 0;
4857                         
4858                         sv_catpv(sv, t);
4859                     }
4860                 
4861                     Safefree(origs);
4862                 }
4863             }
4864         }
4865
4866         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4867     }
4868     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4869         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4870 #endif  /* DEBUGGING */
4871 }
4872
4873 SV *
4874 Perl_re_intuit_string(pTHX_ regexp *prog)
4875 {                               /* Assume that RE_INTUIT is set */
4876     DEBUG_r(
4877         {   STRLEN n_a;
4878             char *s = SvPV(prog->check_substr
4879                       ? prog->check_substr : prog->check_utf8, n_a);
4880
4881             if (!PL_colorset) reginitcolors();
4882             PerlIO_printf(Perl_debug_log,
4883                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4884                       PL_colors[4],
4885                       prog->check_substr ? "" : "utf8 ",
4886                       PL_colors[5],PL_colors[0],
4887                       s,
4888                       PL_colors[1],
4889                       (strlen(s) > 60 ? "..." : ""));
4890         } );
4891
4892     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4893 }
4894
4895 void
4896 Perl_pregfree(pTHX_ struct regexp *r)
4897 {
4898 #ifdef DEBUGGING
4899     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4900 #endif
4901
4902     if (!r || (--r->refcnt > 0))
4903         return;
4904     DEBUG_r({
4905          int len;
4906          char *s;
4907
4908          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4909                 r->prelen, 60, UNI_DISPLAY_REGEX)
4910             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4911          len = SvCUR(dsv);
4912          if (!PL_colorset)
4913               reginitcolors();
4914          PerlIO_printf(Perl_debug_log,
4915                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4916                        PL_colors[4],PL_colors[5],PL_colors[0],
4917                        len, len, s,
4918                        PL_colors[1],
4919                        len > 60 ? "..." : "");
4920     });
4921
4922     if (r->precomp)
4923         Safefree(r->precomp);
4924     if (r->offsets)             /* 20010421 MJD */
4925         Safefree(r->offsets);
4926     if (RX_MATCH_COPIED(r))
4927         Safefree(r->subbeg);
4928     if (r->substrs) {
4929         if (r->anchored_substr)
4930             SvREFCNT_dec(r->anchored_substr);
4931         if (r->anchored_utf8)
4932             SvREFCNT_dec(r->anchored_utf8);
4933         if (r->float_substr)
4934             SvREFCNT_dec(r->float_substr);
4935         if (r->float_utf8)
4936             SvREFCNT_dec(r->float_utf8);
4937         Safefree(r->substrs);
4938     }
4939     if (r->data) {
4940         int n = r->data->count;
4941         PAD* new_comppad = NULL;
4942         PAD* old_comppad;
4943
4944         while (--n >= 0) {
4945           /* If you add a ->what type here, update the comment in regcomp.h */
4946             switch (r->data->what[n]) {
4947             case 's':
4948                 SvREFCNT_dec((SV*)r->data->data[n]);
4949                 break;
4950             case 'f':
4951                 Safefree(r->data->data[n]);
4952                 break;
4953             case 'p':
4954                 new_comppad = (AV*)r->data->data[n];
4955                 break;
4956             case 'o':
4957                 if (new_comppad == NULL)
4958                     Perl_croak(aTHX_ "panic: pregfree comppad");
4959                 PAD_SAVE_LOCAL(old_comppad,
4960                     /* Watch out for global destruction's random ordering. */
4961                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4962                                 new_comppad : Null(PAD *)
4963                 );
4964                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4965                     op_free((OP_4tree*)r->data->data[n]);
4966                 }
4967
4968                 PAD_RESTORE_LOCAL(old_comppad);
4969                 SvREFCNT_dec((SV*)new_comppad);
4970                 new_comppad = NULL;
4971                 break;
4972             case 'n':
4973                 break;
4974             default:
4975                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4976             }
4977         }
4978         Safefree(r->data->what);
4979         Safefree(r->data);
4980     }
4981     Safefree(r->startp);
4982     Safefree(r->endp);
4983     Safefree(r);
4984 }
4985
4986 /*
4987  - regnext - dig the "next" pointer out of a node
4988  *
4989  * [Note, when REGALIGN is defined there are two places in regmatch()
4990  * that bypass this code for speed.]
4991  */
4992 regnode *
4993 Perl_regnext(pTHX_ register regnode *p)
4994 {
4995     register I32 offset;
4996
4997     if (p == &PL_regdummy)
4998         return(NULL);
4999
5000     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5001     if (offset == 0)
5002         return(NULL);
5003
5004     return(p+offset);
5005 }
5006
5007 STATIC void     
5008 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5009 {
5010     va_list args;
5011     STRLEN l1 = strlen(pat1);
5012     STRLEN l2 = strlen(pat2);
5013     char buf[512];
5014     SV *msv;
5015     char *message;
5016
5017     if (l1 > 510)
5018         l1 = 510;
5019     if (l1 + l2 > 510)
5020         l2 = 510 - l1;
5021     Copy(pat1, buf, l1 , char);
5022     Copy(pat2, buf + l1, l2 , char);
5023     buf[l1 + l2] = '\n';
5024     buf[l1 + l2 + 1] = '\0';
5025 #ifdef I_STDARG
5026     /* ANSI variant takes additional second argument */
5027     va_start(args, pat2);
5028 #else
5029     va_start(args);
5030 #endif
5031     msv = vmess(buf, &args);
5032     va_end(args);
5033     message = SvPV(msv,l1);
5034     if (l1 > 512)
5035         l1 = 512;
5036     Copy(message, buf, l1 , char);
5037     buf[l1] = '\0';                     /* Overwrite \n */
5038     Perl_croak(aTHX_ "%s", buf);
5039 }
5040
5041 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5042
5043 void
5044 Perl_save_re_context(pTHX)
5045 {
5046     SAVEI32(PL_reg_flags);              /* from regexec.c */
5047     SAVEPPTR(PL_bostr);
5048     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5049     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5050     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5051     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5052     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5053     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5054     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5055     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5056     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5057     PL_reg_start_tmp = 0;
5058     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5059     PL_reg_start_tmpl = 0;
5060     SAVEVPTR(PL_regdata);
5061     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5062     SAVEI32(PL_regnarrate);             /* from regexec.c */
5063     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5064     SAVEINT(PL_regindent);              /* from regexec.c */
5065     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5066     SAVEVPTR(PL_curcop);
5067     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5068     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5069     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5070     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5071     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5072     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5073     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5074     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5075     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5076     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5077     PL_reg_oldsaved = Nullch;
5078     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5079     PL_reg_oldsavedlen = 0;
5080     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5081     PL_reg_maxiter = 0;
5082     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5083     PL_reg_leftiter = 0;
5084     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5085     PL_reg_poscache = Nullch;
5086     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5087     PL_reg_poscache_size = 0;
5088     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5089     SAVEI32(PL_regnpar);                /* () count. */
5090     SAVEI32(PL_regsize);                /* from regexec.c */
5091
5092     {
5093         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5094         U32 i;
5095         GV *mgv;
5096         REGEXP *rx;
5097         char digits[16];
5098
5099         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5100             for (i = 1; i <= rx->nparens; i++) {
5101                 sprintf(digits, "%lu", (long)i);
5102                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5103                     save_scalar(mgv);
5104             }
5105         }
5106     }
5107
5108 #ifdef DEBUGGING
5109     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5110 #endif
5111 }
5112
5113 static void
5114 clear_re(pTHX_ void *r)
5115 {
5116     ReREFCNT_dec((regexp *)r);
5117 }
5118