This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  *
12  * This file is also copied at build time to ext/re/re_comp.c, where
13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14  * This causes the main functions to be compiled under new names and with
15  * debugging support added, which makes "use re 'debug'" work.
16  */
17
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19  * confused with the original package (see point 3 below).  Thanks, Henry!
20  */
21
22 /* Additional note: this code is very heavily munged from Henry's version
23  * in places.  In some spots I've traded clarity for efficiency, so don't
24  * blame Henry for some of the lack of readability.
25  */
26
27 /* The names of the functions have been changed from regcomp and
28  * regexec to  pregcomp and pregexec in order to avoid conflicts
29  * with the POSIX routines of the same names.
30 */
31
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 #  ifndef PERL_IN_XSUB_RE
35 #    define PERL_IN_XSUB_RE
36 #  endif
37 /* need access to debugger hooks */
38 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39 #    define DEBUGGING
40 #  endif
41 #endif
42
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 #  define Perl_pregcomp my_regcomp
46 #  define Perl_regdump my_regdump
47 #  define Perl_regprop my_regprop
48 #  define Perl_pregfree my_regfree
49 #  define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 #  define Perl_regnext my_regnext
52 #  define Perl_save_re_context my_save_re_context
53 #  define Perl_reginitcolors my_reginitcolors
54
55 #  define PERL_NO_GET_CONTEXT
56 #endif
57
58 /*SUPPRESS 112*/
59 /*
60  * pregcomp and pregexec -- regsub and regerror are not used in perl
61  *
62  *      Copyright (c) 1986 by University of Toronto.
63  *      Written by Henry Spencer.  Not derived from licensed software.
64  *
65  *      Permission is granted to anyone to use this software for any
66  *      purpose on any computer system, and to redistribute it freely,
67  *      subject to the following restrictions:
68  *
69  *      1. The author is not responsible for the consequences of use of
70  *              this software, no matter how awful, even if they arise
71  *              from defects in it.
72  *
73  *      2. The origin of this software must not be misrepresented, either
74  *              by explicit claim or by omission.
75  *
76  *      3. Altered versions must be plainly marked as such, and must not
77  *              be misrepresented as being the original software.
78  *
79  *
80  ****    Alterations to Henry's code are...
81  ****
82  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83  ****    2000, 2001, 2002, 2003, by Larry Wall and others
84  ****
85  ****    You may distribute under the terms of either the GNU General Public
86  ****    License or the Artistic License, as specified in the README file.
87
88  *
89  * Beware that some of this code is subtly aware of the way operator
90  * precedence is structured in regular expressions.  Serious changes in
91  * regular-expression syntax might require a total rethink.
92  */
93 #include "EXTERN.h"
94 #define PERL_IN_REGCOMP_C
95 #include "perl.h"
96
97 #ifndef PERL_IN_XSUB_RE
98 #  include "INTERN.h"
99 #endif
100
101 #define REG_COMP_C
102 #include "regcomp.h"
103
104 #ifdef op
105 #undef op
106 #endif /* op */
107
108 #ifdef MSDOS
109 #  if defined(BUGGY_MSC6)
110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 #    pragma optimize("a",off)
112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 #    pragma optimize("w",on )
114 #  endif /* BUGGY_MSC6 */
115 #endif /* MSDOS */
116
117 #ifndef STATIC
118 #define STATIC  static
119 #endif
120
121 typedef struct RExC_state_t {
122     U32         flags;                  /* are we folding, multilining? */
123     char        *precomp;               /* uncompiled string. */
124     regexp      *rx;
125     char        *start;                 /* Start of input for compile */
126     char        *end;                   /* End of input for compile */
127     char        *parse;                 /* Input-scan pointer. */
128     I32         whilem_seen;            /* number of WHILEM in this expr */
129     regnode     *emit_start;            /* Start of emitted-code area */
130     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
131     I32         naughty;                /* How bad is this pattern? */
132     I32         sawback;                /* Did we see \1, ...? */
133     U32         seen;
134     I32         size;                   /* Code size. */
135     I32         npar;                   /* () count. */
136     I32         extralen;
137     I32         seen_zerolen;
138     I32         seen_evals;
139     I32         utf8;
140 #if ADD_TO_REGEXEC
141     char        *starttry;              /* -Dr: where regtry was called. */
142 #define RExC_starttry   (pRExC_state->starttry)
143 #endif
144 } RExC_state_t;
145
146 #define RExC_flags      (pRExC_state->flags)
147 #define RExC_precomp    (pRExC_state->precomp)
148 #define RExC_rx         (pRExC_state->rx)
149 #define RExC_start      (pRExC_state->start)
150 #define RExC_end        (pRExC_state->end)
151 #define RExC_parse      (pRExC_state->parse)
152 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
153 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit       (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty    (pRExC_state->naughty)
157 #define RExC_sawback    (pRExC_state->sawback)
158 #define RExC_seen       (pRExC_state->seen)
159 #define RExC_size       (pRExC_state->size)
160 #define RExC_npar       (pRExC_state->npar)
161 #define RExC_extralen   (pRExC_state->extralen)
162 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8       (pRExC_state->utf8)
165
166 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168         ((*s) == '{' && regcurly(s)))
169
170 #ifdef SPSTART
171 #undef SPSTART          /* dratted cpp namespace... */
172 #endif
173 /*
174  * Flags to be passed up and down.
175  */
176 #define WORST           0       /* Worst case. */
177 #define HASWIDTH        0x1     /* Known to match non-null strings. */
178 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART         0x4     /* Starts with * or +. */
180 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
181
182 /* Length of a variant. */
183
184 typedef struct scan_data_t {
185     I32 len_min;
186     I32 len_delta;
187     I32 pos_min;
188     I32 pos_delta;
189     SV *last_found;
190     I32 last_end;                       /* min value, <0 unless valid. */
191     I32 last_start_min;
192     I32 last_start_max;
193     SV **longest;                       /* Either &l_fixed, or &l_float. */
194     SV *longest_fixed;
195     I32 offset_fixed;
196     SV *longest_float;
197     I32 offset_float_min;
198     I32 offset_float_max;
199     I32 flags;
200     I32 whilem_c;
201     I32 *last_closep;
202     struct regnode_charclass_class *start_class;
203 } scan_data_t;
204
205 /*
206  * Forward declarations for pregcomp()'s friends.
207  */
208
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210                                       0, 0, 0, 0, 0, 0};
211
212 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL          0x1
214 #define SF_BEFORE_MEOL          0x2
215 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
218 #ifdef NO_UNARY_PLUS
219 #  define SF_FIX_SHIFT_EOL      (0+2)
220 #  define SF_FL_SHIFT_EOL               (0+4)
221 #else
222 #  define SF_FIX_SHIFT_EOL      (+2)
223 #  define SF_FL_SHIFT_EOL               (+4)
224 #endif
225
226 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF               0x40
232 #define SF_HAS_PAR              0x80
233 #define SF_IN_PAR               0x100
234 #define SF_HAS_EVAL             0x200
235 #define SCF_DO_SUBSTR           0x400
236 #define SCF_DO_STCLASS_AND      0x0800
237 #define SCF_DO_STCLASS_OR       0x1000
238 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS  0x2000
240
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244
245 #define OOB_UNICODE             12345678
246 #define OOB_NAMEDCLASS          -1
247
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
251
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
254
255 /*
256  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258  * op/pragma/warn/regcomp.
259  */
260 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
262
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
264
265 /*
266  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267  * arg. Show regex, up to a maximum length. If it's too long, chop and add
268  * "...".
269  */
270 #define FAIL(msg) STMT_START {                                          \
271     char *ellipses = "";                                                \
272     IV len = RExC_end - RExC_precomp;                                   \
273                                                                         \
274     if (!SIZE_ONLY)                                                     \
275         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
276     if (len > RegexLengthToShowInErrorMessages) {                       \
277         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
278         len = RegexLengthToShowInErrorMessages - 10;                    \
279         ellipses = "...";                                               \
280     }                                                                   \
281     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
282             msg, (int)len, RExC_precomp, ellipses);                     \
283 } STMT_END
284
285 /*
286  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287  * args. Show regex, up to a maximum length. If it's too long, chop and add
288  * "...".
289  */
290 #define FAIL2(pat,msg) STMT_START {                                     \
291     char *ellipses = "";                                                \
292     IV len = RExC_end - RExC_precomp;                                   \
293                                                                         \
294     if (!SIZE_ONLY)                                                     \
295         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
296     if (len > RegexLengthToShowInErrorMessages) {                       \
297         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
298         len = RegexLengthToShowInErrorMessages - 10;                    \
299         ellipses = "...";                                               \
300     }                                                                   \
301     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
302             msg, (int)len, RExC_precomp, ellipses);                     \
303 } STMT_END
304
305
306 /*
307  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308  */
309 #define Simple_vFAIL(m) STMT_START {                                    \
310     IV offset = RExC_parse - RExC_precomp;                              \
311     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
312             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
313 } STMT_END
314
315 /*
316  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317  */
318 #define vFAIL(m) STMT_START {                           \
319     if (!SIZE_ONLY)                                     \
320         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
321     Simple_vFAIL(m);                                    \
322 } STMT_END
323
324 /*
325  * Like Simple_vFAIL(), but accepts two arguments.
326  */
327 #define Simple_vFAIL2(m,a1) STMT_START {                        \
328     IV offset = RExC_parse - RExC_precomp;                      \
329     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
330             (int)offset, RExC_precomp, RExC_precomp + offset);  \
331 } STMT_END
332
333 /*
334  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335  */
336 #define vFAIL2(m,a1) STMT_START {                       \
337     if (!SIZE_ONLY)                                     \
338         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
339     Simple_vFAIL2(m, a1);                               \
340 } STMT_END
341
342
343 /*
344  * Like Simple_vFAIL(), but accepts three arguments.
345  */
346 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
347     IV offset = RExC_parse - RExC_precomp;                      \
348     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
349             (int)offset, RExC_precomp, RExC_precomp + offset);  \
350 } STMT_END
351
352 /*
353  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354  */
355 #define vFAIL3(m,a1,a2) STMT_START {                    \
356     if (!SIZE_ONLY)                                     \
357         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
358     Simple_vFAIL3(m, a1, a2);                           \
359 } STMT_END
360
361 /*
362  * Like Simple_vFAIL(), but accepts four arguments.
363  */
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
365     IV offset = RExC_parse - RExC_precomp;                      \
366     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
367             (int)offset, RExC_precomp, RExC_precomp + offset);  \
368 } STMT_END
369
370 /*
371  * Like Simple_vFAIL(), but accepts five arguments.
372  */
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
374     IV offset = RExC_parse - RExC_precomp;                      \
375     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
376             (int)offset, RExC_precomp, RExC_precomp + offset);  \
377 } STMT_END
378
379
380 #define vWARN(loc,m) STMT_START {                                       \
381     IV offset = loc - RExC_precomp;                                     \
382     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
383             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
384 } STMT_END
385
386 #define vWARNdep(loc,m) STMT_START {                                    \
387     IV offset = loc - RExC_precomp;                                     \
388     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
389             "%s" REPORT_LOCATION,                                       \
390             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
391 } STMT_END
392
393
394 #define vWARN2(loc, m, a1) STMT_START {                                 \
395     IV offset = loc - RExC_precomp;                                     \
396     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
397             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
398 } STMT_END
399
400 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
401     IV offset = loc - RExC_precomp;                                     \
402     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
403             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
404 } STMT_END
405
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
407     IV offset = loc - RExC_precomp;                                     \
408     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
409             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 } STMT_END
411
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
413     IV offset = loc - RExC_precomp;                                     \
414     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
415             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416 } STMT_END
417
418
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START {                  \
421     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422 } STMT_END
423
424 /* Macros for recording node offsets.   20001227 mjd@plover.com 
425  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
426  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
427  * Element 0 holds the number n.
428  */
429
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
432
433
434 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
435     if (! SIZE_ONLY) {                                                  \
436         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
437                 __LINE__, (node), (byte)));                             \
438         if((node) < 0) {                                                \
439             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440         } else {                                                        \
441             RExC_offsets[2*(node)-1] = (byte);                          \
442         }                                                               \
443     }                                                                   \
444 } STMT_END
445
446 #define Set_Node_Offset(node,byte) \
447     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
451     if (! SIZE_ONLY) {                                                  \
452         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
453                 __LINE__, (node), (len)));                              \
454         if((node) < 0) {                                                \
455             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456         } else {                                                        \
457             RExC_offsets[2*(node)] = (len);                             \
458         }                                                               \
459     }                                                                   \
460 } STMT_END
461
462 #define Set_Node_Length(node,len) \
463     Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466     Set_Node_Length(node, RExC_parse - parse_start)
467
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
472 static void clear_re(pTHX_ void *r);
473
474 /* Mark that we cannot extend a found fixed substring at this point.
475    Updata the longest found anchored substring and the longest found
476    floating substrings if needed. */
477
478 STATIC void
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
480 {
481     STRLEN l = CHR_SVLEN(data->last_found);
482     STRLEN old_l = CHR_SVLEN(*data->longest);
483
484     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485         SvSetMagicSV(*data->longest, data->last_found);
486         if (*data->longest == data->longest_fixed) {
487             data->offset_fixed = l ? data->last_start_min : data->pos_min;
488             if (data->flags & SF_BEFORE_EOL)
489                 data->flags
490                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491             else
492                 data->flags &= ~SF_FIX_BEFORE_EOL;
493         }
494         else {
495             data->offset_float_min = l ? data->last_start_min : data->pos_min;
496             data->offset_float_max = (l
497                                       ? data->last_start_max
498                                       : data->pos_min + data->pos_delta);
499             if ((U32)data->offset_float_max > (U32)I32_MAX)
500                 data->offset_float_max = I32_MAX;
501             if (data->flags & SF_BEFORE_EOL)
502                 data->flags
503                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504             else
505                 data->flags &= ~SF_FL_BEFORE_EOL;
506         }
507     }
508     SvCUR_set(data->last_found, 0);
509     {
510         SV * sv = data->last_found;
511         MAGIC *mg =
512             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513         if (mg && mg->mg_len > 0)
514             mg->mg_len = 0;
515     }
516     data->last_end = -1;
517     data->flags &= ~SF_BEFORE_EOL;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC void
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
523 {
524     ANYOF_CLASS_ZERO(cl);
525     ANYOF_BITMAP_SETALL(cl);
526     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
527     if (LOC)
528         cl->flags |= ANYOF_LOCALE;
529 }
530
531 /* Can match anything (initialization) */
532 STATIC int
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534 {
535     int value;
536
537     for (value = 0; value <= ANYOF_MAX; value += 2)
538         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539             return 1;
540     if (!(cl->flags & ANYOF_UNICODE_ALL))
541         return 0;
542     if (!ANYOF_BITMAP_TESTALLSET(cl))
543         return 0;
544     return 1;
545 }
546
547 /* Can match anything (initialization) */
548 STATIC void
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
550 {
551     Zero(cl, 1, struct regnode_charclass_class);
552     cl->type = ANYOF;
553     cl_anything(pRExC_state, cl);
554 }
555
556 STATIC void
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
558 {
559     Zero(cl, 1, struct regnode_charclass_class);
560     cl->type = ANYOF;
561     cl_anything(pRExC_state, cl);
562     if (LOC)
563         cl->flags |= ANYOF_LOCALE;
564 }
565
566 /* 'And' a given class with another one.  Can create false positives */
567 /* We assume that cl is not inverted */
568 STATIC void
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570          struct regnode_charclass_class *and_with)
571 {
572     if (!(and_with->flags & ANYOF_CLASS)
573         && !(cl->flags & ANYOF_CLASS)
574         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575         && !(and_with->flags & ANYOF_FOLD)
576         && !(cl->flags & ANYOF_FOLD)) {
577         int i;
578
579         if (and_with->flags & ANYOF_INVERT)
580             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581                 cl->bitmap[i] &= ~and_with->bitmap[i];
582         else
583             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584                 cl->bitmap[i] &= and_with->bitmap[i];
585     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586     if (!(and_with->flags & ANYOF_EOS))
587         cl->flags &= ~ANYOF_EOS;
588
589     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590         !(and_with->flags & ANYOF_INVERT)) {
591         cl->flags &= ~ANYOF_UNICODE_ALL;
592         cl->flags |= ANYOF_UNICODE;
593         ARG_SET(cl, ARG(and_with));
594     }
595     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596         !(and_with->flags & ANYOF_INVERT))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599         !(and_with->flags & ANYOF_INVERT))
600         cl->flags &= ~ANYOF_UNICODE;
601 }
602
603 /* 'OR' a given class with another one.  Can create false positives */
604 /* We assume that cl is not inverted */
605 STATIC void
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
607 {
608     if (or_with->flags & ANYOF_INVERT) {
609         /* We do not use
610          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611          *   <= (B1 | !B2) | (CL1 | !CL2)
612          * which is wasteful if CL2 is small, but we ignore CL2:
613          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614          * XXXX Can we handle case-fold?  Unclear:
615          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617          */
618         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619              && !(or_with->flags & ANYOF_FOLD)
620              && !(cl->flags & ANYOF_FOLD) ) {
621             int i;
622
623             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624                 cl->bitmap[i] |= ~or_with->bitmap[i];
625         } /* XXXX: logic is complicated otherwise */
626         else {
627             cl_anything(pRExC_state, cl);
628         }
629     } else {
630         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632              && (!(or_with->flags & ANYOF_FOLD)
633                  || (cl->flags & ANYOF_FOLD)) ) {
634             int i;
635
636             /* OR char bitmap and class bitmap separately */
637             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638                 cl->bitmap[i] |= or_with->bitmap[i];
639             if (or_with->flags & ANYOF_CLASS) {
640                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641                     cl->classflags[i] |= or_with->classflags[i];
642                 cl->flags |= ANYOF_CLASS;
643             }
644         }
645         else { /* XXXX: logic is complicated, leave it along for a moment. */
646             cl_anything(pRExC_state, cl);
647         }
648     }
649     if (or_with->flags & ANYOF_EOS)
650         cl->flags |= ANYOF_EOS;
651
652     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653         ARG(cl) != ARG(or_with)) {
654         cl->flags |= ANYOF_UNICODE_ALL;
655         cl->flags &= ~ANYOF_UNICODE;
656     }
657     if (or_with->flags & ANYOF_UNICODE_ALL) {
658         cl->flags |= ANYOF_UNICODE_ALL;
659         cl->flags &= ~ANYOF_UNICODE;
660     }
661 }
662
663 /*
664  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665  * These need to be revisited when a newer toolchain becomes available.
666  */
667 #if defined(__sparc64__) && defined(__GNUC__)
668 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669 #       undef  SPARC64_GCC_WORKAROUND
670 #       define SPARC64_GCC_WORKAROUND 1
671 #   endif
672 #endif
673
674 /* REx optimizer.  Converts nodes into quickier variants "in place".
675    Finds fixed substrings.  */
676
677 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
678    to the position after last scanned or to NULL. */
679
680 STATIC I32
681 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
682                         /* scanp: Start here (read-write). */
683                         /* deltap: Write maxlen-minlen here. */
684                         /* last: Stop before this one. */
685 {
686     I32 min = 0, pars = 0, code;
687     regnode *scan = *scanp, *next;
688     I32 delta = 0;
689     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
690     int is_inf_internal = 0;            /* The studied chunk is infinite */
691     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692     scan_data_t data_fake;
693     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
694
695     while (scan && OP(scan) != END && scan < last) {
696         /* Peephole optimizer: */
697
698         if (PL_regkind[(U8)OP(scan)] == EXACT) {
699             /* Merge several consecutive EXACTish nodes into one. */
700             regnode *n = regnext(scan);
701             U32 stringok = 1;
702 #ifdef DEBUGGING
703             regnode *stop = scan;
704 #endif
705
706             next = scan + NODE_SZ_STR(scan);
707             /* Skip NOTHING, merge EXACT*. */
708             while (n &&
709                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
710                      (stringok && (OP(n) == OP(scan))))
711                    && NEXT_OFF(n)
712                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713                 if (OP(n) == TAIL || n > next)
714                     stringok = 0;
715                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
716                     NEXT_OFF(scan) += NEXT_OFF(n);
717                     next = n + NODE_STEP_REGNODE;
718 #ifdef DEBUGGING
719                     if (stringok)
720                         stop = n;
721 #endif
722                     n = regnext(n);
723                 }
724                 else if (stringok) {
725                     int oldl = STR_LEN(scan);
726                     regnode *nnext = regnext(n);
727
728                     if (oldl + STR_LEN(n) > U8_MAX)
729                         break;
730                     NEXT_OFF(scan) += NEXT_OFF(n);
731                     STR_LEN(scan) += STR_LEN(n);
732                     next = n + NODE_SZ_STR(n);
733                     /* Now we can overwrite *n : */
734                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
735 #ifdef DEBUGGING
736                     stop = next - 1;
737 #endif
738                     n = nnext;
739                 }
740             }
741
742             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
743 /*
744   Two problematic code points in Unicode casefolding of EXACT nodes:
745
746    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
748
749    which casefold to
750
751    Unicode                      UTF-8
752
753    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
754    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
755
756    This means that in case-insensitive matching (or "loose matching",
757    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758    length of the above casefolded versions) can match a target string
759    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760    This would rather mess up the minimum length computation.
761
762    What we'll do is to look for the tail four bytes, and then peek
763    at the preceding two bytes to see whether we need to decrease
764    the minimum length by four (six minus two).
765
766    Thanks to the design of UTF-8, there cannot be false matches:
767    A sequence of valid UTF-8 bytes cannot be a subsequence of
768    another valid sequence of UTF-8 bytes.
769
770 */
771                  char *s0 = STRING(scan), *s, *t;
772                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773                  char *t0 = "\xcc\x88\xcc\x81";
774                  char *t1 = t0 + 3;
775                  
776                  for (s = s0 + 2;
777                       s < s2 && (t = ninstr(s, s1, t0, t1));
778                       s = t + 4) {
779                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
781                            min -= 4;
782                  }
783             }
784
785 #ifdef DEBUGGING
786             /* Allow dumping */
787             n = scan + NODE_SZ_STR(scan);
788             while (n <= stop) {
789                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
790                     OP(n) = OPTIMIZED;
791                     NEXT_OFF(n) = 0;
792                 }
793                 n++;
794             }
795 #endif
796         }
797         /* Follow the next-chain of the current node and optimize
798            away all the NOTHINGs from it.  */
799         if (OP(scan) != CURLYX) {
800             int max = (reg_off_by_arg[OP(scan)]
801                        ? I32_MAX
802                        /* I32 may be smaller than U16 on CRAYs! */
803                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
804             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805             int noff;
806             regnode *n = scan;
807         
808             /* Skip NOTHING and LONGJMP. */
809             while ((n = regnext(n))
810                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
811                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
812                    && off + noff < max)
813                 off += noff;
814             if (reg_off_by_arg[OP(scan)])
815                 ARG(scan) = off;
816             else
817                 NEXT_OFF(scan) = off;
818         }
819         /* The principal pseudo-switch.  Cannot be a switch, since we
820            look into several different things.  */
821         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
822                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823             next = regnext(scan);
824             code = OP(scan);
825         
826             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
827                 I32 max1 = 0, min1 = I32_MAX, num = 0;
828                 struct regnode_charclass_class accum;
829                 
830                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
831                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
832                 if (flags & SCF_DO_STCLASS)
833                     cl_init_zero(pRExC_state, &accum);
834                 while (OP(scan) == code) {
835                     I32 deltanext, minnext, f = 0, fake;
836                     struct regnode_charclass_class this_class;
837
838                     num++;
839                     data_fake.flags = 0;
840                     if (data) {         
841                         data_fake.whilem_c = data->whilem_c;
842                         data_fake.last_closep = data->last_closep;
843                     }
844                     else
845                         data_fake.last_closep = &fake;
846                     next = regnext(scan);
847                     scan = NEXTOPER(scan);
848                     if (code != BRANCH)
849                         scan = NEXTOPER(scan);
850                     if (flags & SCF_DO_STCLASS) {
851                         cl_init(pRExC_state, &this_class);
852                         data_fake.start_class = &this_class;
853                         f = SCF_DO_STCLASS_AND;
854                     }           
855                     if (flags & SCF_WHILEM_VISITED_POS)
856                         f |= SCF_WHILEM_VISITED_POS;
857                     /* we suppose the run is continuous, last=next...*/
858                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
859                                           next, &data_fake, f);
860                     if (min1 > minnext)
861                         min1 = minnext;
862                     if (max1 < minnext + deltanext)
863                         max1 = minnext + deltanext;
864                     if (deltanext == I32_MAX)
865                         is_inf = is_inf_internal = 1;
866                     scan = next;
867                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
868                         pars++;
869                     if (data && (data_fake.flags & SF_HAS_EVAL))
870                         data->flags |= SF_HAS_EVAL;
871                     if (data)
872                         data->whilem_c = data_fake.whilem_c;
873                     if (flags & SCF_DO_STCLASS)
874                         cl_or(pRExC_state, &accum, &this_class);
875                     if (code == SUSPEND)
876                         break;
877                 }
878                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
879                     min1 = 0;
880                 if (flags & SCF_DO_SUBSTR) {
881                     data->pos_min += min1;
882                     data->pos_delta += max1 - min1;
883                     if (max1 != min1 || is_inf)
884                         data->longest = &(data->longest_float);
885                 }
886                 min += min1;
887                 delta += max1 - min1;
888                 if (flags & SCF_DO_STCLASS_OR) {
889                     cl_or(pRExC_state, data->start_class, &accum);
890                     if (min1) {
891                         cl_and(data->start_class, &and_with);
892                         flags &= ~SCF_DO_STCLASS;
893                     }
894                 }
895                 else if (flags & SCF_DO_STCLASS_AND) {
896                     if (min1) {
897                         cl_and(data->start_class, &accum);
898                         flags &= ~SCF_DO_STCLASS;
899                     }
900                     else {
901                         /* Switch to OR mode: cache the old value of
902                          * data->start_class */
903                         StructCopy(data->start_class, &and_with,
904                                    struct regnode_charclass_class);
905                         flags &= ~SCF_DO_STCLASS_AND;
906                         StructCopy(&accum, data->start_class,
907                                    struct regnode_charclass_class);
908                         flags |= SCF_DO_STCLASS_OR;
909                         data->start_class->flags |= ANYOF_EOS;
910                     }
911                 }
912             }
913             else if (code == BRANCHJ)   /* single branch is optimized. */
914                 scan = NEXTOPER(NEXTOPER(scan));
915             else                        /* single branch is optimized. */
916                 scan = NEXTOPER(scan);
917             continue;
918         }
919         else if (OP(scan) == EXACT) {
920             I32 l = STR_LEN(scan);
921             UV uc = *((U8*)STRING(scan));
922             if (UTF) {
923                 U8 *s = (U8*)STRING(scan);
924                 l = utf8_length(s, s + l);
925                 uc = utf8_to_uvchr(s, NULL);
926             }
927             min += l;
928             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
929                 /* The code below prefers earlier match for fixed
930                    offset, later match for variable offset.  */
931                 if (data->last_end == -1) { /* Update the start info. */
932                     data->last_start_min = data->pos_min;
933                     data->last_start_max = is_inf
934                         ? I32_MAX : data->pos_min + data->pos_delta;
935                 }
936                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
937                 {
938                     SV * sv = data->last_found;
939                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
941                     if (mg && mg->mg_len >= 0)
942                         mg->mg_len += utf8_length((U8*)STRING(scan),
943                                                   (U8*)STRING(scan)+STR_LEN(scan));
944                 }
945                 if (UTF)
946                     SvUTF8_on(data->last_found);
947                 data->last_end = data->pos_min + l;
948                 data->pos_min += l; /* As in the first entry. */
949                 data->flags &= ~SF_BEFORE_EOL;
950             }
951             if (flags & SCF_DO_STCLASS_AND) {
952                 /* Check whether it is compatible with what we know already! */
953                 int compat = 1;
954
955                 if (uc >= 0x100 ||
956                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
957                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
958                     && (!(data->start_class->flags & ANYOF_FOLD)
959                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
960                     )
961                     compat = 0;
962                 ANYOF_CLASS_ZERO(data->start_class);
963                 ANYOF_BITMAP_ZERO(data->start_class);
964                 if (compat)
965                     ANYOF_BITMAP_SET(data->start_class, uc);
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 if (uc < 0x100)
968                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
969             }
970             else if (flags & SCF_DO_STCLASS_OR) {
971                 /* false positive possible if the class is case-folded */
972                 if (uc < 0x100)
973                     ANYOF_BITMAP_SET(data->start_class, uc);
974                 else
975                     data->start_class->flags |= ANYOF_UNICODE_ALL;
976                 data->start_class->flags &= ~ANYOF_EOS;
977                 cl_and(data->start_class, &and_with);
978             }
979             flags &= ~SCF_DO_STCLASS;
980         }
981         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
982             I32 l = STR_LEN(scan);
983             UV uc = *((U8*)STRING(scan));
984
985             /* Search for fixed substrings supports EXACT only. */
986             if (flags & SCF_DO_SUBSTR)
987                 scan_commit(pRExC_state, data);
988             if (UTF) {
989                 U8 *s = (U8 *)STRING(scan);
990                 l = utf8_length(s, s + l);
991                 uc = utf8_to_uvchr(s, NULL);
992             }
993             min += l;
994             if (data && (flags & SCF_DO_SUBSTR))
995                 data->pos_min += l;
996             if (flags & SCF_DO_STCLASS_AND) {
997                 /* Check whether it is compatible with what we know already! */
998                 int compat = 1;
999
1000                 if (uc >= 0x100 ||
1001                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1002                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
1003                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1004                     compat = 0;
1005                 ANYOF_CLASS_ZERO(data->start_class);
1006                 ANYOF_BITMAP_ZERO(data->start_class);
1007                 if (compat) {
1008                     ANYOF_BITMAP_SET(data->start_class, uc);
1009                     data->start_class->flags &= ~ANYOF_EOS;
1010                     data->start_class->flags |= ANYOF_FOLD;
1011                     if (OP(scan) == EXACTFL)
1012                         data->start_class->flags |= ANYOF_LOCALE;
1013                 }
1014             }
1015             else if (flags & SCF_DO_STCLASS_OR) {
1016                 if (data->start_class->flags & ANYOF_FOLD) {
1017                     /* false positive possible if the class is case-folded.
1018                        Assume that the locale settings are the same... */
1019                     if (uc < 0x100)
1020                         ANYOF_BITMAP_SET(data->start_class, uc);
1021                     data->start_class->flags &= ~ANYOF_EOS;
1022                 }
1023                 cl_and(data->start_class, &and_with);
1024             }
1025             flags &= ~SCF_DO_STCLASS;
1026         }
1027         else if (strchr((char*)PL_varies,OP(scan))) {
1028             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1029             I32 f = flags, pos_before = 0;
1030             regnode *oscan = scan;
1031             struct regnode_charclass_class this_class;
1032             struct regnode_charclass_class *oclass = NULL;
1033             I32 next_is_eval = 0;
1034
1035             switch (PL_regkind[(U8)OP(scan)]) {
1036             case WHILEM:                /* End of (?:...)* . */
1037                 scan = NEXTOPER(scan);
1038                 goto finish;
1039             case PLUS:
1040                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1041                     next = NEXTOPER(scan);
1042                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1043                         mincount = 1;
1044                         maxcount = REG_INFTY;
1045                         next = regnext(scan);
1046                         scan = NEXTOPER(scan);
1047                         goto do_curly;
1048                     }
1049                 }
1050                 if (flags & SCF_DO_SUBSTR)
1051                     data->pos_min++;
1052                 min++;
1053                 /* Fall through. */
1054             case STAR:
1055                 if (flags & SCF_DO_STCLASS) {
1056                     mincount = 0;
1057                     maxcount = REG_INFTY;
1058                     next = regnext(scan);
1059                     scan = NEXTOPER(scan);
1060                     goto do_curly;
1061                 }
1062                 is_inf = is_inf_internal = 1;
1063                 scan = regnext(scan);
1064                 if (flags & SCF_DO_SUBSTR) {
1065                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1066                     data->longest = &(data->longest_float);
1067                 }
1068                 goto optimize_curly_tail;
1069             case CURLY:
1070                 mincount = ARG1(scan);
1071                 maxcount = ARG2(scan);
1072                 next = regnext(scan);
1073                 if (OP(scan) == CURLYX) {
1074                     I32 lp = (data ? *(data->last_closep) : 0);
1075
1076                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1077                 }
1078                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1079                 next_is_eval = (OP(scan) == EVAL);
1080               do_curly:
1081                 if (flags & SCF_DO_SUBSTR) {
1082                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1083                     pos_before = data->pos_min;
1084                 }
1085                 if (data) {
1086                     fl = data->flags;
1087                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1088                     if (is_inf)
1089                         data->flags |= SF_IS_INF;
1090                 }
1091                 if (flags & SCF_DO_STCLASS) {
1092                     cl_init(pRExC_state, &this_class);
1093                     oclass = data->start_class;
1094                     data->start_class = &this_class;
1095                     f |= SCF_DO_STCLASS_AND;
1096                     f &= ~SCF_DO_STCLASS_OR;
1097                 }
1098                 /* These are the cases when once a subexpression
1099                    fails at a particular position, it cannot succeed
1100                    even after backtracking at the enclosing scope.
1101                 
1102                    XXXX what if minimal match and we are at the
1103                         initial run of {n,m}? */
1104                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105                     f &= ~SCF_WHILEM_VISITED_POS;
1106
1107                 /* This will finish on WHILEM, setting scan, or on NULL: */
1108                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1109                                       mincount == 0
1110                                         ? (f & ~SCF_DO_SUBSTR) : f);
1111
1112                 if (flags & SCF_DO_STCLASS)
1113                     data->start_class = oclass;
1114                 if (mincount == 0 || minnext == 0) {
1115                     if (flags & SCF_DO_STCLASS_OR) {
1116                         cl_or(pRExC_state, data->start_class, &this_class);
1117                     }
1118                     else if (flags & SCF_DO_STCLASS_AND) {
1119                         /* Switch to OR mode: cache the old value of
1120                          * data->start_class */
1121                         StructCopy(data->start_class, &and_with,
1122                                    struct regnode_charclass_class);
1123                         flags &= ~SCF_DO_STCLASS_AND;
1124                         StructCopy(&this_class, data->start_class,
1125                                    struct regnode_charclass_class);
1126                         flags |= SCF_DO_STCLASS_OR;
1127                         data->start_class->flags |= ANYOF_EOS;
1128                     }
1129                 } else {                /* Non-zero len */
1130                     if (flags & SCF_DO_STCLASS_OR) {
1131                         cl_or(pRExC_state, data->start_class, &this_class);
1132                         cl_and(data->start_class, &and_with);
1133                     }
1134                     else if (flags & SCF_DO_STCLASS_AND)
1135                         cl_and(data->start_class, &this_class);
1136                     flags &= ~SCF_DO_STCLASS;
1137                 }
1138                 if (!scan)              /* It was not CURLYX, but CURLY. */
1139                     scan = next;
1140                 if (ckWARN(WARN_REGEXP)
1141                        /* ? quantifier ok, except for (?{ ... }) */
1142                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1143                     && (minnext == 0) && (deltanext == 0)
1144                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1145                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1146                 {
1147                     vWARN(RExC_parse,
1148                           "Quantifier unexpected on zero-length expression");
1149                 }
1150
1151                 min += minnext * mincount;
1152                 is_inf_internal |= ((maxcount == REG_INFTY
1153                                      && (minnext + deltanext) > 0)
1154                                     || deltanext == I32_MAX);
1155                 is_inf |= is_inf_internal;
1156                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1157
1158                 /* Try powerful optimization CURLYX => CURLYN. */
1159                 if (  OP(oscan) == CURLYX && data
1160                       && data->flags & SF_IN_PAR
1161                       && !(data->flags & SF_HAS_EVAL)
1162                       && !deltanext && minnext == 1 ) {
1163                     /* Try to optimize to CURLYN.  */
1164                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1165                     regnode *nxt1 = nxt;
1166 #ifdef DEBUGGING
1167                     regnode *nxt2;
1168 #endif
1169
1170                     /* Skip open. */
1171                     nxt = regnext(nxt);
1172                     if (!strchr((char*)PL_simple,OP(nxt))
1173                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1174                              && STR_LEN(nxt) == 1))
1175                         goto nogo;
1176 #ifdef DEBUGGING
1177                     nxt2 = nxt;
1178 #endif
1179                     nxt = regnext(nxt);
1180                     if (OP(nxt) != CLOSE)
1181                         goto nogo;
1182                     /* Now we know that nxt2 is the only contents: */
1183                     oscan->flags = (U8)ARG(nxt);
1184                     OP(oscan) = CURLYN;
1185                     OP(nxt1) = NOTHING; /* was OPEN. */
1186 #ifdef DEBUGGING
1187                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1191                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1192                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1193 #endif
1194                 }
1195               nogo:
1196
1197                 /* Try optimization CURLYX => CURLYM. */
1198                 if (  OP(oscan) == CURLYX && data
1199                       && !(data->flags & SF_HAS_PAR)
1200                       && !(data->flags & SF_HAS_EVAL)
1201                       && !deltanext     /* atom is fixed width */
1202                       && minnext != 0   /* CURLYM can't handle zero width */
1203                 ) {
1204                     /* XXXX How to optimize if data == 0? */
1205                     /* Optimize to a simpler form.  */
1206                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1207                     regnode *nxt2;
1208
1209                     OP(oscan) = CURLYM;
1210                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1211                             && (OP(nxt2) != WHILEM))
1212                         nxt = nxt2;
1213                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1214                     /* Need to optimize away parenths. */
1215                     if (data->flags & SF_IN_PAR) {
1216                         /* Set the parenth number.  */
1217                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1218
1219                         if (OP(nxt) != CLOSE)
1220                             FAIL("Panic opt close");
1221                         oscan->flags = (U8)ARG(nxt);
1222                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1223                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1224 #ifdef DEBUGGING
1225                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1227                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1229 #endif
1230 #if 0
1231                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232                             regnode *nnxt = regnext(nxt1);
1233                         
1234                             if (nnxt == nxt) {
1235                                 if (reg_off_by_arg[OP(nxt1)])
1236                                     ARG_SET(nxt1, nxt2 - nxt1);
1237                                 else if (nxt2 - nxt1 < U16_MAX)
1238                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1239                                 else
1240                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1241                             }
1242                             nxt1 = nnxt;
1243                         }
1244 #endif
1245                         /* Optimize again: */
1246                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1247                                     NULL, 0);
1248                     }
1249                     else
1250                         oscan->flags = 0;
1251                 }
1252                 else if ((OP(oscan) == CURLYX)
1253                          && (flags & SCF_WHILEM_VISITED_POS)
1254                          /* See the comment on a similar expression above.
1255                             However, this time it not a subexpression
1256                             we care about, but the expression itself. */
1257                          && (maxcount == REG_INFTY)
1258                          && data && ++data->whilem_c < 16) {
1259                     /* This stays as CURLYX, we can put the count/of pair. */
1260                     /* Find WHILEM (as in regexec.c) */
1261                     regnode *nxt = oscan + NEXT_OFF(oscan);
1262
1263                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1264                         nxt += ARG(nxt);
1265                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1267                 }
1268                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1269                     pars++;
1270                 if (flags & SCF_DO_SUBSTR) {
1271                     SV *last_str = Nullsv;
1272                     int counted = mincount != 0;
1273
1274                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1275 #if defined(SPARC64_GCC_WORKAROUND)
1276                         I32 b = 0;
1277                         STRLEN l = 0;
1278                         char *s = NULL;
1279                         I32 old = 0;
1280
1281                         if (pos_before >= data->last_start_min)
1282                             b = pos_before;
1283                         else
1284                             b = data->last_start_min;
1285
1286                         l = 0;
1287                         s = SvPV(data->last_found, l);
1288                         old = b - data->last_start_min;
1289
1290 #else
1291                         I32 b = pos_before >= data->last_start_min
1292                             ? pos_before : data->last_start_min;
1293                         STRLEN l;
1294                         char *s = SvPV(data->last_found, l);
1295                         I32 old = b - data->last_start_min;
1296 #endif
1297
1298                         if (UTF)
1299                             old = utf8_hop((U8*)s, old) - (U8*)s;
1300                         
1301                         l -= old;
1302                         /* Get the added string: */
1303                         last_str = newSVpvn(s  + old, l);
1304                         if (UTF)
1305                             SvUTF8_on(last_str);
1306                         if (deltanext == 0 && pos_before == b) {
1307                             /* What was added is a constant string */
1308                             if (mincount > 1) {
1309                                 SvGROW(last_str, (mincount * l) + 1);
1310                                 repeatcpy(SvPVX(last_str) + l,
1311                                           SvPVX(last_str), l, mincount - 1);
1312                                 SvCUR(last_str) *= mincount;
1313                                 /* Add additional parts. */
1314                                 SvCUR_set(data->last_found,
1315                                           SvCUR(data->last_found) - l);
1316                                 sv_catsv(data->last_found, last_str);
1317                                 {
1318                                     SV * sv = data->last_found;
1319                                     MAGIC *mg =
1320                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1321                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322                                     if (mg && mg->mg_len >= 0)
1323                                         mg->mg_len += CHR_SVLEN(last_str);
1324                                 }
1325                                 data->last_end += l * (mincount - 1);
1326                             }
1327                         } else {
1328                             /* start offset must point into the last copy */
1329                             data->last_start_min += minnext * (mincount - 1);
1330                             data->last_start_max += is_inf ? I32_MAX
1331                                 : (maxcount - 1) * (minnext + data->pos_delta);
1332                         }
1333                     }
1334                     /* It is counted once already... */
1335                     data->pos_min += minnext * (mincount - counted);
1336                     data->pos_delta += - counted * deltanext +
1337                         (minnext + deltanext) * maxcount - minnext * mincount;
1338                     if (mincount != maxcount) {
1339                          /* Cannot extend fixed substrings found inside
1340                             the group.  */
1341                         scan_commit(pRExC_state,data);
1342                         if (mincount && last_str) {
1343                             sv_setsv(data->last_found, last_str);
1344                             data->last_end = data->pos_min;
1345                             data->last_start_min =
1346                                 data->pos_min - CHR_SVLEN(last_str);
1347                             data->last_start_max = is_inf
1348                                 ? I32_MAX
1349                                 : data->pos_min + data->pos_delta
1350                                 - CHR_SVLEN(last_str);
1351                         }
1352                         data->longest = &(data->longest_float);
1353                     }
1354                     SvREFCNT_dec(last_str);
1355                 }
1356                 if (data && (fl & SF_HAS_EVAL))
1357                     data->flags |= SF_HAS_EVAL;
1358               optimize_curly_tail:
1359                 if (OP(oscan) != CURLYX) {
1360                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1361                            && NEXT_OFF(next))
1362                         NEXT_OFF(oscan) += NEXT_OFF(next);
1363                 }
1364                 continue;
1365             default:                    /* REF and CLUMP only? */
1366                 if (flags & SCF_DO_SUBSTR) {
1367                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1368                     data->longest = &(data->longest_float);
1369                 }
1370                 is_inf = is_inf_internal = 1;
1371                 if (flags & SCF_DO_STCLASS_OR)
1372                     cl_anything(pRExC_state, data->start_class);
1373                 flags &= ~SCF_DO_STCLASS;
1374                 break;
1375             }
1376         }
1377         else if (strchr((char*)PL_simple,OP(scan))) {
1378             int value = 0;
1379
1380             if (flags & SCF_DO_SUBSTR) {
1381                 scan_commit(pRExC_state,data);
1382                 data->pos_min++;
1383             }
1384             min++;
1385             if (flags & SCF_DO_STCLASS) {
1386                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1387
1388                 /* Some of the logic below assumes that switching
1389                    locale on will only add false positives. */
1390                 switch (PL_regkind[(U8)OP(scan)]) {
1391                 case SANY:
1392                 default:
1393                   do_default:
1394                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1396                         cl_anything(pRExC_state, data->start_class);
1397                     break;
1398                 case REG_ANY:
1399                     if (OP(scan) == SANY)
1400                         goto do_default;
1401                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403                                  || (data->start_class->flags & ANYOF_CLASS));
1404                         cl_anything(pRExC_state, data->start_class);
1405                     }
1406                     if (flags & SCF_DO_STCLASS_AND || !value)
1407                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1408                     break;
1409                 case ANYOF:
1410                     if (flags & SCF_DO_STCLASS_AND)
1411                         cl_and(data->start_class,
1412                                (struct regnode_charclass_class*)scan);
1413                     else
1414                         cl_or(pRExC_state, data->start_class,
1415                               (struct regnode_charclass_class*)scan);
1416                     break;
1417                 case ALNUM:
1418                     if (flags & SCF_DO_STCLASS_AND) {
1419                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421                             for (value = 0; value < 256; value++)
1422                                 if (!isALNUM(value))
1423                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1424                         }
1425                     }
1426                     else {
1427                         if (data->start_class->flags & ANYOF_LOCALE)
1428                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1429                         else {
1430                             for (value = 0; value < 256; value++)
1431                                 if (isALNUM(value))
1432                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1433                         }
1434                     }
1435                     break;
1436                 case ALNUML:
1437                     if (flags & SCF_DO_STCLASS_AND) {
1438                         if (data->start_class->flags & ANYOF_LOCALE)
1439                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1440                     }
1441                     else {
1442                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443                         data->start_class->flags |= ANYOF_LOCALE;
1444                     }
1445                     break;
1446                 case NALNUM:
1447                     if (flags & SCF_DO_STCLASS_AND) {
1448                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450                             for (value = 0; value < 256; value++)
1451                                 if (isALNUM(value))
1452                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1453                         }
1454                     }
1455                     else {
1456                         if (data->start_class->flags & ANYOF_LOCALE)
1457                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458                         else {
1459                             for (value = 0; value < 256; value++)
1460                                 if (!isALNUM(value))
1461                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1462                         }
1463                     }
1464                     break;
1465                 case NALNUML:
1466                     if (flags & SCF_DO_STCLASS_AND) {
1467                         if (data->start_class->flags & ANYOF_LOCALE)
1468                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1469                     }
1470                     else {
1471                         data->start_class->flags |= ANYOF_LOCALE;
1472                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1473                     }
1474                     break;
1475                 case SPACE:
1476                     if (flags & SCF_DO_STCLASS_AND) {
1477                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479                             for (value = 0; value < 256; value++)
1480                                 if (!isSPACE(value))
1481                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1482                         }
1483                     }
1484                     else {
1485                         if (data->start_class->flags & ANYOF_LOCALE)
1486                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487                         else {
1488                             for (value = 0; value < 256; value++)
1489                                 if (isSPACE(value))
1490                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1491                         }
1492                     }
1493                     break;
1494                 case SPACEL:
1495                     if (flags & SCF_DO_STCLASS_AND) {
1496                         if (data->start_class->flags & ANYOF_LOCALE)
1497                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1498                     }
1499                     else {
1500                         data->start_class->flags |= ANYOF_LOCALE;
1501                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1502                     }
1503                     break;
1504                 case NSPACE:
1505                     if (flags & SCF_DO_STCLASS_AND) {
1506                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508                             for (value = 0; value < 256; value++)
1509                                 if (isSPACE(value))
1510                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1511                         }
1512                     }
1513                     else {
1514                         if (data->start_class->flags & ANYOF_LOCALE)
1515                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1516                         else {
1517                             for (value = 0; value < 256; value++)
1518                                 if (!isSPACE(value))
1519                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1520                         }
1521                     }
1522                     break;
1523                 case NSPACEL:
1524                     if (flags & SCF_DO_STCLASS_AND) {
1525                         if (data->start_class->flags & ANYOF_LOCALE) {
1526                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527                             for (value = 0; value < 256; value++)
1528                                 if (!isSPACE(value))
1529                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1530                         }
1531                     }
1532                     else {
1533                         data->start_class->flags |= ANYOF_LOCALE;
1534                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1535                     }
1536                     break;
1537                 case DIGIT:
1538                     if (flags & SCF_DO_STCLASS_AND) {
1539                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540                         for (value = 0; value < 256; value++)
1541                             if (!isDIGIT(value))
1542                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1543                     }
1544                     else {
1545                         if (data->start_class->flags & ANYOF_LOCALE)
1546                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1547                         else {
1548                             for (value = 0; value < 256; value++)
1549                                 if (isDIGIT(value))
1550                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1551                         }
1552                     }
1553                     break;
1554                 case NDIGIT:
1555                     if (flags & SCF_DO_STCLASS_AND) {
1556                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557                         for (value = 0; value < 256; value++)
1558                             if (isDIGIT(value))
1559                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1560                     }
1561                     else {
1562                         if (data->start_class->flags & ANYOF_LOCALE)
1563                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1564                         else {
1565                             for (value = 0; value < 256; value++)
1566                                 if (!isDIGIT(value))
1567                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1568                         }
1569                     }
1570                     break;
1571                 }
1572                 if (flags & SCF_DO_STCLASS_OR)
1573                     cl_and(data->start_class, &and_with);
1574                 flags &= ~SCF_DO_STCLASS;
1575             }
1576         }
1577         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1578             data->flags |= (OP(scan) == MEOL
1579                             ? SF_BEFORE_MEOL
1580                             : SF_BEFORE_SEOL);
1581         }
1582         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1583                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1584                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1585                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1586             /* Lookahead/lookbehind */
1587             I32 deltanext, minnext, fake = 0;
1588             regnode *nscan;
1589             struct regnode_charclass_class intrnl;
1590             int f = 0;
1591
1592             data_fake.flags = 0;
1593             if (data) {         
1594                 data_fake.whilem_c = data->whilem_c;
1595                 data_fake.last_closep = data->last_closep;
1596             }
1597             else
1598                 data_fake.last_closep = &fake;
1599             if ( flags & SCF_DO_STCLASS && !scan->flags
1600                  && OP(scan) == IFMATCH ) { /* Lookahead */
1601                 cl_init(pRExC_state, &intrnl);
1602                 data_fake.start_class = &intrnl;
1603                 f |= SCF_DO_STCLASS_AND;
1604             }
1605             if (flags & SCF_WHILEM_VISITED_POS)
1606                 f |= SCF_WHILEM_VISITED_POS;
1607             next = regnext(scan);
1608             nscan = NEXTOPER(NEXTOPER(scan));
1609             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1610             if (scan->flags) {
1611                 if (deltanext) {
1612                     vFAIL("Variable length lookbehind not implemented");
1613                 }
1614                 else if (minnext > U8_MAX) {
1615                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1616                 }
1617                 scan->flags = (U8)minnext;
1618             }
1619             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1620                 pars++;
1621             if (data && (data_fake.flags & SF_HAS_EVAL))
1622                 data->flags |= SF_HAS_EVAL;
1623             if (data)
1624                 data->whilem_c = data_fake.whilem_c;
1625             if (f & SCF_DO_STCLASS_AND) {
1626                 int was = (data->start_class->flags & ANYOF_EOS);
1627
1628                 cl_and(data->start_class, &intrnl);
1629                 if (was)
1630                     data->start_class->flags |= ANYOF_EOS;
1631             }
1632         }
1633         else if (OP(scan) == OPEN) {
1634             pars++;
1635         }
1636         else if (OP(scan) == CLOSE) {
1637             if ((I32)ARG(scan) == is_par) {
1638                 next = regnext(scan);
1639
1640                 if ( next && (OP(next) != WHILEM) && next < last)
1641                     is_par = 0;         /* Disable optimization */
1642             }
1643             if (data)
1644                 *(data->last_closep) = ARG(scan);
1645         }
1646         else if (OP(scan) == EVAL) {
1647                 if (data)
1648                     data->flags |= SF_HAS_EVAL;
1649         }
1650         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1651                 if (flags & SCF_DO_SUBSTR) {
1652                     scan_commit(pRExC_state,data);
1653                     data->longest = &(data->longest_float);
1654                 }
1655                 is_inf = is_inf_internal = 1;
1656                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1657                     cl_anything(pRExC_state, data->start_class);
1658                 flags &= ~SCF_DO_STCLASS;
1659         }
1660         /* Else: zero-length, ignore. */
1661         scan = regnext(scan);
1662     }
1663
1664   finish:
1665     *scanp = scan;
1666     *deltap = is_inf_internal ? I32_MAX : delta;
1667     if (flags & SCF_DO_SUBSTR && is_inf)
1668         data->pos_delta = I32_MAX - data->pos_min;
1669     if (is_par > U8_MAX)
1670         is_par = 0;
1671     if (is_par && pars==1 && data) {
1672         data->flags |= SF_IN_PAR;
1673         data->flags &= ~SF_HAS_PAR;
1674     }
1675     else if (pars && data) {
1676         data->flags |= SF_HAS_PAR;
1677         data->flags &= ~SF_IN_PAR;
1678     }
1679     if (flags & SCF_DO_STCLASS_OR)
1680         cl_and(data->start_class, &and_with);
1681     return min;
1682 }
1683
1684 STATIC I32
1685 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1686 {
1687     if (RExC_rx->data) {
1688         Renewc(RExC_rx->data,
1689                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1690                char, struct reg_data);
1691         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692         RExC_rx->data->count += n;
1693     }
1694     else {
1695         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1696              char, struct reg_data);
1697         New(1208, RExC_rx->data->what, n, U8);
1698         RExC_rx->data->count = n;
1699     }
1700     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701     return RExC_rx->data->count - n;
1702 }
1703
1704 void
1705 Perl_reginitcolors(pTHX)
1706 {
1707     int i = 0;
1708     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1709         
1710     if (s) {
1711         PL_colors[0] = s = savepv(s);
1712         while (++i < 6) {
1713             s = strchr(s, '\t');
1714             if (s) {
1715                 *s = '\0';
1716                 PL_colors[i] = ++s;
1717             }
1718             else
1719                 PL_colors[i] = s = "";
1720         }
1721     } else {
1722         while (i < 6)
1723             PL_colors[i++] = "";
1724     }
1725     PL_colorset = 1;
1726 }
1727
1728
1729 /*
1730  - pregcomp - compile a regular expression into internal code
1731  *
1732  * We can't allocate space until we know how big the compiled form will be,
1733  * but we can't compile it (and thus know how big it is) until we've got a
1734  * place to put the code.  So we cheat:  we compile it twice, once with code
1735  * generation turned off and size counting turned on, and once "for real".
1736  * This also means that we don't allocate space until we are sure that the
1737  * thing really will compile successfully, and we never have to move the
1738  * code and thus invalidate pointers into it.  (Note that it has to be in
1739  * one piece because free() must be able to free it all.) [NB: not true in perl]
1740  *
1741  * Beware that the optimization-preparation code in here knows about some
1742  * of the structure of the compiled regexp.  [I'll say.]
1743  */
1744 regexp *
1745 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1746 {
1747     register regexp *r;
1748     regnode *scan;
1749     regnode *first;
1750     I32 flags;
1751     I32 minlen = 0;
1752     I32 sawplus = 0;
1753     I32 sawopen = 0;
1754     scan_data_t data;
1755     RExC_state_t RExC_state;
1756     RExC_state_t *pRExC_state = &RExC_state;
1757
1758     if (exp == NULL)
1759         FAIL("NULL regexp argument");
1760
1761     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1762
1763     RExC_precomp = exp;
1764     DEBUG_r({
1765          if (!PL_colorset) reginitcolors();
1766          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1767                        PL_colors[4],PL_colors[5],PL_colors[0],
1768                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1769     });
1770     RExC_flags = pm->op_pmflags;
1771     RExC_sawback = 0;
1772
1773     RExC_seen = 0;
1774     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1775     RExC_seen_evals = 0;
1776     RExC_extralen = 0;
1777
1778     /* First pass: determine size, legality. */
1779     RExC_parse = exp;
1780     RExC_start = exp;
1781     RExC_end = xend;
1782     RExC_naughty = 0;
1783     RExC_npar = 1;
1784     RExC_size = 0L;
1785     RExC_emit = &PL_regdummy;
1786     RExC_whilem_seen = 0;
1787 #if 0 /* REGC() is (currently) a NOP at the first pass.
1788        * Clever compilers notice this and complain. --jhi */
1789     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1790 #endif
1791     if (reg(pRExC_state, 0, &flags) == NULL) {
1792         RExC_precomp = Nullch;
1793         return(NULL);
1794     }
1795     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1796
1797     /* Small enough for pointer-storage convention?
1798        If extralen==0, this means that we will not need long jumps. */
1799     if (RExC_size >= 0x10000L && RExC_extralen)
1800         RExC_size += RExC_extralen;
1801     else
1802         RExC_extralen = 0;
1803     if (RExC_whilem_seen > 15)
1804         RExC_whilem_seen = 15;
1805
1806     /* Allocate space and initialize. */
1807     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1808          char, regexp);
1809     if (r == NULL)
1810         FAIL("Regexp out of space");
1811
1812 #ifdef DEBUGGING
1813     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1814     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1815 #endif
1816     r->refcnt = 1;
1817     r->prelen = xend - exp;
1818     r->precomp = savepvn(RExC_precomp, r->prelen);
1819     r->subbeg = NULL;
1820     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1821     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1822
1823     r->substrs = 0;                     /* Useful during FAIL. */
1824     r->startp = 0;                      /* Useful during FAIL. */
1825     r->endp = 0;                        /* Useful during FAIL. */
1826
1827     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1828     if (r->offsets) {
1829       r->offsets[0] = RExC_size; 
1830     }
1831     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1832                           "%s %"UVuf" bytes for offset annotations.\n", 
1833                           r->offsets ? "Got" : "Couldn't get", 
1834                           (UV)((2*RExC_size+1) * sizeof(U32))));
1835
1836     RExC_rx = r;
1837
1838     /* Second pass: emit code. */
1839     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1840     RExC_parse = exp;
1841     RExC_end = xend;
1842     RExC_naughty = 0;
1843     RExC_npar = 1;
1844     RExC_emit_start = r->program;
1845     RExC_emit = r->program;
1846     /* Store the count of eval-groups for security checks: */
1847     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1848     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1849     r->data = 0;
1850     if (reg(pRExC_state, 0, &flags) == NULL)
1851         return(NULL);
1852
1853     /* Dig out information for optimizations. */
1854     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1855     pm->op_pmflags = RExC_flags;
1856     if (UTF)
1857         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1858     r->regstclass = NULL;
1859     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1860         r->reganch |= ROPT_NAUGHTY;
1861     scan = r->program + 1;              /* First BRANCH. */
1862
1863     /* XXXX To minimize changes to RE engine we always allocate
1864        3-units-long substrs field. */
1865     Newz(1004, r->substrs, 1, struct reg_substr_data);
1866
1867     StructCopy(&zero_scan_data, &data, scan_data_t);
1868     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1869     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1870         I32 fake;
1871         STRLEN longest_float_length, longest_fixed_length;
1872         struct regnode_charclass_class ch_class;
1873         int stclass_flag;
1874         I32 last_close = 0;
1875
1876         first = scan;
1877         /* Skip introductions and multiplicators >= 1. */
1878         while ((OP(first) == OPEN && (sawopen = 1)) ||
1879                /* An OR of *one* alternative - should not happen now. */
1880             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1881             (OP(first) == PLUS) ||
1882             (OP(first) == MINMOD) ||
1883                /* An {n,m} with n>0 */
1884             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1885                 if (OP(first) == PLUS)
1886                     sawplus = 1;
1887                 else
1888                     first += regarglen[(U8)OP(first)];
1889                 first = NEXTOPER(first);
1890         }
1891
1892         /* Starting-point info. */
1893       again:
1894         if (PL_regkind[(U8)OP(first)] == EXACT) {
1895             if (OP(first) == EXACT)
1896                 ;       /* Empty, get anchored substr later. */
1897             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1898                 r->regstclass = first;
1899         }
1900         else if (strchr((char*)PL_simple,OP(first)))
1901             r->regstclass = first;
1902         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1903                  PL_regkind[(U8)OP(first)] == NBOUND)
1904             r->regstclass = first;
1905         else if (PL_regkind[(U8)OP(first)] == BOL) {
1906             r->reganch |= (OP(first) == MBOL
1907                            ? ROPT_ANCH_MBOL
1908                            : (OP(first) == SBOL
1909                               ? ROPT_ANCH_SBOL
1910                               : ROPT_ANCH_BOL));
1911             first = NEXTOPER(first);
1912             goto again;
1913         }
1914         else if (OP(first) == GPOS) {
1915             r->reganch |= ROPT_ANCH_GPOS;
1916             first = NEXTOPER(first);
1917             goto again;
1918         }
1919         else if (!sawopen && (OP(first) == STAR &&
1920             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1921             !(r->reganch & ROPT_ANCH) )
1922         {
1923             /* turn .* into ^.* with an implied $*=1 */
1924             int type = OP(NEXTOPER(first));
1925
1926             if (type == REG_ANY)
1927                 type = ROPT_ANCH_MBOL;
1928             else
1929                 type = ROPT_ANCH_SBOL;
1930
1931             r->reganch |= type | ROPT_IMPLICIT;
1932             first = NEXTOPER(first);
1933             goto again;
1934         }
1935         if (sawplus && (!sawopen || !RExC_sawback)
1936             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1937             /* x+ must match at the 1st pos of run of x's */
1938             r->reganch |= ROPT_SKIP;
1939
1940         /* Scan is after the zeroth branch, first is atomic matcher. */
1941         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1942                               (IV)(first - scan + 1)));
1943         /*
1944         * If there's something expensive in the r.e., find the
1945         * longest literal string that must appear and make it the
1946         * regmust.  Resolve ties in favor of later strings, since
1947         * the regstart check works with the beginning of the r.e.
1948         * and avoiding duplication strengthens checking.  Not a
1949         * strong reason, but sufficient in the absence of others.
1950         * [Now we resolve ties in favor of the earlier string if
1951         * it happens that c_offset_min has been invalidated, since the
1952         * earlier string may buy us something the later one won't.]
1953         */
1954         minlen = 0;
1955
1956         data.longest_fixed = newSVpvn("",0);
1957         data.longest_float = newSVpvn("",0);
1958         data.last_found = newSVpvn("",0);
1959         data.longest = &(data.longest_fixed);
1960         first = scan;
1961         if (!r->regstclass) {
1962             cl_init(pRExC_state, &ch_class);
1963             data.start_class = &ch_class;
1964             stclass_flag = SCF_DO_STCLASS_AND;
1965         } else                          /* XXXX Check for BOUND? */
1966             stclass_flag = 0;
1967         data.last_closep = &last_close;
1968
1969         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1970                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1971         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1972              && data.last_start_min == 0 && data.last_end > 0
1973              && !RExC_seen_zerolen
1974              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1975             r->reganch |= ROPT_CHECK_ALL;
1976         scan_commit(pRExC_state, &data);
1977         SvREFCNT_dec(data.last_found);
1978
1979         longest_float_length = CHR_SVLEN(data.longest_float);
1980         if (longest_float_length
1981             || (data.flags & SF_FL_BEFORE_EOL
1982                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1983                     || (RExC_flags & PMf_MULTILINE)))) {
1984             int t;
1985
1986             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1987                 && data.offset_fixed == data.offset_float_min
1988                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1989                     goto remove_float;          /* As in (a)+. */
1990
1991             if (SvUTF8(data.longest_float)) {
1992                 r->float_utf8 = data.longest_float;
1993                 r->float_substr = Nullsv;
1994             } else {
1995                 r->float_substr = data.longest_float;
1996                 r->float_utf8 = Nullsv;
1997             }
1998             r->float_min_offset = data.offset_float_min;
1999             r->float_max_offset = data.offset_float_max;
2000             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2001                        && (!(data.flags & SF_FL_BEFORE_MEOL)
2002                            || (RExC_flags & PMf_MULTILINE)));
2003             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2004         }
2005         else {
2006           remove_float:
2007             r->float_substr = r->float_utf8 = Nullsv;
2008             SvREFCNT_dec(data.longest_float);
2009             longest_float_length = 0;
2010         }
2011
2012         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2013         if (longest_fixed_length
2014             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2015                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2016                     || (RExC_flags & PMf_MULTILINE)))) {
2017             int t;
2018
2019             if (SvUTF8(data.longest_fixed)) {
2020                 r->anchored_utf8 = data.longest_fixed;
2021                 r->anchored_substr = Nullsv;
2022             } else {
2023                 r->anchored_substr = data.longest_fixed;
2024                 r->anchored_utf8 = Nullsv;
2025             }
2026             r->anchored_offset = data.offset_fixed;
2027             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2028                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2029                      || (RExC_flags & PMf_MULTILINE)));
2030             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2031         }
2032         else {
2033             r->anchored_substr = r->anchored_utf8 = Nullsv;
2034             SvREFCNT_dec(data.longest_fixed);
2035             longest_fixed_length = 0;
2036         }
2037         if (r->regstclass
2038             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2039             r->regstclass = NULL;
2040         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2041             && stclass_flag
2042             && !(data.start_class->flags & ANYOF_EOS)
2043             && !cl_is_anything(data.start_class))
2044         {
2045             I32 n = add_data(pRExC_state, 1, "f");
2046
2047             New(1006, RExC_rx->data->data[n], 1,
2048                 struct regnode_charclass_class);
2049             StructCopy(data.start_class,
2050                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2051                        struct regnode_charclass_class);
2052             r->regstclass = (regnode*)RExC_rx->data->data[n];
2053             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2054             PL_regdata = r->data; /* for regprop() */
2055             DEBUG_r({ SV *sv = sv_newmortal();
2056                       regprop(sv, (regnode*)data.start_class);
2057                       PerlIO_printf(Perl_debug_log,
2058                                     "synthetic stclass `%s'.\n",
2059                                     SvPVX(sv));});
2060         }
2061
2062         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2063         if (longest_fixed_length > longest_float_length) {
2064             r->check_substr = r->anchored_substr;
2065             r->check_utf8 = r->anchored_utf8;
2066             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2067             if (r->reganch & ROPT_ANCH_SINGLE)
2068                 r->reganch |= ROPT_NOSCAN;
2069         }
2070         else {
2071             r->check_substr = r->float_substr;
2072             r->check_utf8 = r->float_utf8;
2073             r->check_offset_min = data.offset_float_min;
2074             r->check_offset_max = data.offset_float_max;
2075         }
2076         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2077            This should be changed ASAP!  */
2078         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2079             r->reganch |= RE_USE_INTUIT;
2080             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2081                 r->reganch |= RE_INTUIT_TAIL;
2082         }
2083     }
2084     else {
2085         /* Several toplevels. Best we can is to set minlen. */
2086         I32 fake;
2087         struct regnode_charclass_class ch_class;
2088         I32 last_close = 0;
2089         
2090         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2091         scan = r->program + 1;
2092         cl_init(pRExC_state, &ch_class);
2093         data.start_class = &ch_class;
2094         data.last_closep = &last_close;
2095         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2096         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2097                 = r->float_substr = r->float_utf8 = Nullsv;
2098         if (!(data.start_class->flags & ANYOF_EOS)
2099             && !cl_is_anything(data.start_class))
2100         {
2101             I32 n = add_data(pRExC_state, 1, "f");
2102
2103             New(1006, RExC_rx->data->data[n], 1,
2104                 struct regnode_charclass_class);
2105             StructCopy(data.start_class,
2106                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2107                        struct regnode_charclass_class);
2108             r->regstclass = (regnode*)RExC_rx->data->data[n];
2109             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2110             DEBUG_r({ SV* sv = sv_newmortal();
2111                       regprop(sv, (regnode*)data.start_class);
2112                       PerlIO_printf(Perl_debug_log,
2113                                     "synthetic stclass `%s'.\n",
2114                                     SvPVX(sv));});
2115         }
2116     }
2117
2118     r->minlen = minlen;
2119     if (RExC_seen & REG_SEEN_GPOS)
2120         r->reganch |= ROPT_GPOS_SEEN;
2121     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2122         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2123     if (RExC_seen & REG_SEEN_EVAL)
2124         r->reganch |= ROPT_EVAL_SEEN;
2125     if (RExC_seen & REG_SEEN_CANY)
2126         r->reganch |= ROPT_CANY_SEEN;
2127     Newz(1002, r->startp, RExC_npar, I32);
2128     Newz(1002, r->endp, RExC_npar, I32);
2129     PL_regdata = r->data; /* for regprop() */
2130     DEBUG_r(regdump(r));
2131     return(r);
2132 }
2133
2134 /*
2135  - reg - regular expression, i.e. main body or parenthesized thing
2136  *
2137  * Caller must absorb opening parenthesis.
2138  *
2139  * Combining parenthesis handling with the base level of regular expression
2140  * is a trifle forced, but the need to tie the tails of the branches to what
2141  * follows makes it hard to avoid.
2142  */
2143 STATIC regnode *
2144 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2145     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2146 {
2147     register regnode *ret;              /* Will be the head of the group. */
2148     register regnode *br;
2149     register regnode *lastbr;
2150     register regnode *ender = 0;
2151     register I32 parno = 0;
2152     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2153
2154     /* for (?g), (?gc), and (?o) warnings; warning
2155        about (?c) will warn about (?g) -- japhy    */
2156
2157     I32 wastedflags = 0x00,
2158         wasted_o    = 0x01,
2159         wasted_g    = 0x02,
2160         wasted_gc   = 0x02 | 0x04,
2161         wasted_c    = 0x04;
2162
2163     char * parse_start = RExC_parse; /* MJD */
2164     char *oregcomp_parse = RExC_parse;
2165     char c;
2166
2167     *flagp = 0;                         /* Tentatively. */
2168
2169
2170     /* Make an OPEN node, if parenthesized. */
2171     if (paren) {
2172         if (*RExC_parse == '?') { /* (?...) */
2173             U32 posflags = 0, negflags = 0;
2174             U32 *flagsp = &posflags;
2175             int logical = 0;
2176             char *seqstart = RExC_parse;
2177
2178             RExC_parse++;
2179             paren = *RExC_parse++;
2180             ret = NULL;                 /* For look-ahead/behind. */
2181             switch (paren) {
2182             case '<':           /* (?<...) */
2183                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2184                 if (*RExC_parse == '!')
2185                     paren = ',';
2186                 if (*RExC_parse != '=' && *RExC_parse != '!')
2187                     goto unknown;
2188                 RExC_parse++;
2189             case '=':           /* (?=...) */
2190             case '!':           /* (?!...) */
2191                 RExC_seen_zerolen++;
2192             case ':':           /* (?:...) */
2193             case '>':           /* (?>...) */
2194                 break;
2195             case '$':           /* (?$...) */
2196             case '@':           /* (?@...) */
2197                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2198                 break;
2199             case '#':           /* (?#...) */
2200                 while (*RExC_parse && *RExC_parse != ')')
2201                     RExC_parse++;
2202                 if (*RExC_parse != ')')
2203                     FAIL("Sequence (?#... not terminated");
2204                 nextchar(pRExC_state);
2205                 *flagp = TRYAGAIN;
2206                 return NULL;
2207             case 'p':           /* (?p...) */
2208                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2209                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2210                 /* FALL THROUGH*/
2211             case '?':           /* (??...) */
2212                 logical = 1;
2213                 if (*RExC_parse != '{')
2214                     goto unknown;
2215                 paren = *RExC_parse++;
2216                 /* FALL THROUGH */
2217             case '{':           /* (?{...}) */
2218             {
2219                 I32 count = 1, n = 0;
2220                 char c;
2221                 char *s = RExC_parse;
2222                 SV *sv;
2223                 OP_4tree *sop, *rop;
2224
2225                 RExC_seen_zerolen++;
2226                 RExC_seen |= REG_SEEN_EVAL;
2227                 while (count && (c = *RExC_parse)) {
2228                     if (c == '\\' && RExC_parse[1])
2229                         RExC_parse++;
2230                     else if (c == '{')
2231                         count++;
2232                     else if (c == '}')
2233                         count--;
2234                     RExC_parse++;
2235                 }
2236                 if (*RExC_parse != ')')
2237                 {
2238                     RExC_parse = s;             
2239                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2240                 }
2241                 if (!SIZE_ONLY) {
2242                     PAD *pad;
2243                 
2244                     if (RExC_parse - 1 - s)
2245                         sv = newSVpvn(s, RExC_parse - 1 - s);
2246                     else
2247                         sv = newSVpvn("", 0);
2248
2249                     ENTER;
2250                     Perl_save_re_context(aTHX);
2251                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2252                     sop->op_private |= OPpREFCOUNTED;
2253                     /* re_dup will OpREFCNT_inc */
2254                     OpREFCNT_set(sop, 1);
2255                     LEAVE;
2256
2257                     n = add_data(pRExC_state, 3, "nop");
2258                     RExC_rx->data->data[n] = (void*)rop;
2259                     RExC_rx->data->data[n+1] = (void*)sop;
2260                     RExC_rx->data->data[n+2] = (void*)pad;
2261                     SvREFCNT_dec(sv);
2262                 }
2263                 else {                                          /* First pass */
2264                     if (PL_reginterp_cnt < ++RExC_seen_evals
2265                         && IN_PERL_RUNTIME)
2266                         /* No compiled RE interpolated, has runtime
2267                            components ===> unsafe.  */
2268                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2269                     if (PL_tainting && PL_tainted)
2270                         FAIL("Eval-group in insecure regular expression");
2271                 }
2272                 
2273                 nextchar(pRExC_state);
2274                 if (logical) {
2275                     ret = reg_node(pRExC_state, LOGICAL);
2276                     if (!SIZE_ONLY)
2277                         ret->flags = 2;
2278                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2279                     /* deal with the length of this later - MJD */
2280                     return ret;
2281                 }
2282                 ret = reganode(pRExC_state, EVAL, n);
2283                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2284                 Set_Node_Offset(ret, parse_start);
2285                 return ret;
2286             }
2287             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2288             {
2289                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2290                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2291                         || RExC_parse[1] == '<'
2292                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2293                         I32 flag;
2294                         
2295                         ret = reg_node(pRExC_state, LOGICAL);
2296                         if (!SIZE_ONLY)
2297                             ret->flags = 1;
2298                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2299                         goto insert_if;
2300                     }
2301                 }
2302                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2303                     /* (?(1)...) */
2304                     parno = atoi(RExC_parse++);
2305
2306                     while (isDIGIT(*RExC_parse))
2307                         RExC_parse++;
2308                     ret = reganode(pRExC_state, GROUPP, parno);
2309                     
2310                     if ((c = *nextchar(pRExC_state)) != ')')
2311                         vFAIL("Switch condition not recognized");
2312                   insert_if:
2313                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2314                     br = regbranch(pRExC_state, &flags, 1);
2315                     if (br == NULL)
2316                         br = reganode(pRExC_state, LONGJMP, 0);
2317                     else
2318                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2319                     c = *nextchar(pRExC_state);
2320                     if (flags&HASWIDTH)
2321                         *flagp |= HASWIDTH;
2322                     if (c == '|') {
2323                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2324                         regbranch(pRExC_state, &flags, 1);
2325                         regtail(pRExC_state, ret, lastbr);
2326                         if (flags&HASWIDTH)
2327                             *flagp |= HASWIDTH;
2328                         c = *nextchar(pRExC_state);
2329                     }
2330                     else
2331                         lastbr = NULL;
2332                     if (c != ')')
2333                         vFAIL("Switch (?(condition)... contains too many branches");
2334                     ender = reg_node(pRExC_state, TAIL);
2335                     regtail(pRExC_state, br, ender);
2336                     if (lastbr) {
2337                         regtail(pRExC_state, lastbr, ender);
2338                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2339                     }
2340                     else
2341                         regtail(pRExC_state, ret, ender);
2342                     return ret;
2343                 }
2344                 else {
2345                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2346                 }
2347             }
2348             case 0:
2349                 RExC_parse--; /* for vFAIL to print correctly */
2350                 vFAIL("Sequence (? incomplete");
2351                 break;
2352             default:
2353                 --RExC_parse;
2354               parse_flags:      /* (?i) */
2355                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2356                     /* (?g), (?gc) and (?o) are useless here
2357                        and must be globally applied -- japhy */
2358
2359                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2360                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2362                             if (! (wastedflags & wflagbit) ) {
2363                                 wastedflags |= wflagbit;
2364                                 vWARN5(
2365                                     RExC_parse + 1,
2366                                     "Useless (%s%c) - %suse /%c modifier",
2367                                     flagsp == &negflags ? "?-" : "?",
2368                                     *RExC_parse,
2369                                     flagsp == &negflags ? "don't " : "",
2370                                     *RExC_parse
2371                                 );
2372                             }
2373                         }
2374                     }
2375                     else if (*RExC_parse == 'c') {
2376                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2377                             if (! (wastedflags & wasted_c) ) {
2378                                 wastedflags |= wasted_gc;
2379                                 vWARN3(
2380                                     RExC_parse + 1,
2381                                     "Useless (%sc) - %suse /gc modifier",
2382                                     flagsp == &negflags ? "?-" : "?",
2383                                     flagsp == &negflags ? "don't " : ""
2384                                 );
2385                             }
2386                         }
2387                     }
2388                     else { pmflag(flagsp, *RExC_parse); }
2389
2390                     ++RExC_parse;
2391                 }
2392                 if (*RExC_parse == '-') {
2393                     flagsp = &negflags;
2394                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2395                     ++RExC_parse;
2396                     goto parse_flags;
2397                 }
2398                 RExC_flags |= posflags;
2399                 RExC_flags &= ~negflags;
2400                 if (*RExC_parse == ':') {
2401                     RExC_parse++;
2402                     paren = ':';
2403                     break;
2404                 }               
2405               unknown:
2406                 if (*RExC_parse != ')') {
2407                     RExC_parse++;
2408                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2409                 }
2410                 nextchar(pRExC_state);
2411                 *flagp = TRYAGAIN;
2412                 return NULL;
2413             }
2414         }
2415         else {                  /* (...) */
2416             parno = RExC_npar;
2417             RExC_npar++;
2418             ret = reganode(pRExC_state, OPEN, parno);
2419             Set_Node_Length(ret, 1); /* MJD */
2420             Set_Node_Offset(ret, RExC_parse); /* MJD */
2421             open = 1;
2422         }
2423     }
2424     else                        /* ! paren */
2425         ret = NULL;
2426
2427     /* Pick up the branches, linking them together. */
2428     parse_start = RExC_parse;   /* MJD */
2429     br = regbranch(pRExC_state, &flags, 1);
2430     /*     branch_len = (paren != 0); */
2431     
2432     if (br == NULL)
2433         return(NULL);
2434     if (*RExC_parse == '|') {
2435         if (!SIZE_ONLY && RExC_extralen) {
2436             reginsert(pRExC_state, BRANCHJ, br);
2437         }
2438         else {                  /* MJD */
2439             reginsert(pRExC_state, BRANCH, br);
2440             Set_Node_Length(br, paren != 0);
2441             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2442         }
2443         have_branch = 1;
2444         if (SIZE_ONLY)
2445             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2446     }
2447     else if (paren == ':') {
2448         *flagp |= flags&SIMPLE;
2449     }
2450     if (open) {                         /* Starts with OPEN. */
2451         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2452     }
2453     else if (paren != '?')              /* Not Conditional */
2454         ret = br;
2455     *flagp |= flags & (SPSTART | HASWIDTH);
2456     lastbr = br;
2457     while (*RExC_parse == '|') {
2458         if (!SIZE_ONLY && RExC_extralen) {
2459             ender = reganode(pRExC_state, LONGJMP,0);
2460             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2461         }
2462         if (SIZE_ONLY)
2463             RExC_extralen += 2;         /* Account for LONGJMP. */
2464         nextchar(pRExC_state);
2465         br = regbranch(pRExC_state, &flags, 0);
2466         
2467         if (br == NULL)
2468             return(NULL);
2469         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2470         lastbr = br;
2471         if (flags&HASWIDTH)
2472             *flagp |= HASWIDTH;
2473         *flagp |= flags&SPSTART;
2474     }
2475
2476     if (have_branch || paren != ':') {
2477         /* Make a closing node, and hook it on the end. */
2478         switch (paren) {
2479         case ':':
2480             ender = reg_node(pRExC_state, TAIL);
2481             break;
2482         case 1:
2483             ender = reganode(pRExC_state, CLOSE, parno);
2484             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2485             Set_Node_Length(ender,1); /* MJD */
2486             break;
2487         case '<':
2488         case ',':
2489         case '=':
2490         case '!':
2491             *flagp &= ~HASWIDTH;
2492             /* FALL THROUGH */
2493         case '>':
2494             ender = reg_node(pRExC_state, SUCCEED);
2495             break;
2496         case 0:
2497             ender = reg_node(pRExC_state, END);
2498             break;
2499         }
2500         regtail(pRExC_state, lastbr, ender);
2501
2502         if (have_branch) {
2503             /* Hook the tails of the branches to the closing node. */
2504             for (br = ret; br != NULL; br = regnext(br)) {
2505                 regoptail(pRExC_state, br, ender);
2506             }
2507         }
2508     }
2509
2510     {
2511         char *p;
2512         static char parens[] = "=!<,>";
2513
2514         if (paren && (p = strchr(parens, paren))) {
2515             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2516             int flag = (p - parens) > 1;
2517
2518             if (paren == '>')
2519                 node = SUSPEND, flag = 0;
2520             reginsert(pRExC_state, node,ret);
2521             Set_Node_Cur_Length(ret);
2522             Set_Node_Offset(ret, parse_start + 1);
2523             ret->flags = flag;
2524             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2525         }
2526     }
2527
2528     /* Check for proper termination. */
2529     if (paren) {
2530         RExC_flags = oregflags;
2531         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2532             RExC_parse = oregcomp_parse;
2533             vFAIL("Unmatched (");
2534         }
2535     }
2536     else if (!paren && RExC_parse < RExC_end) {
2537         if (*RExC_parse == ')') {
2538             RExC_parse++;
2539             vFAIL("Unmatched )");
2540         }
2541         else
2542             FAIL("Junk on end of regexp");      /* "Can't happen". */
2543         /* NOTREACHED */
2544     }
2545
2546     return(ret);
2547 }
2548
2549 /*
2550  - regbranch - one alternative of an | operator
2551  *
2552  * Implements the concatenation operator.
2553  */
2554 STATIC regnode *
2555 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2556 {
2557     register regnode *ret;
2558     register regnode *chain = NULL;
2559     register regnode *latest;
2560     I32 flags = 0, c = 0;
2561
2562     if (first)
2563         ret = NULL;
2564     else {
2565         if (!SIZE_ONLY && RExC_extralen)
2566             ret = reganode(pRExC_state, BRANCHJ,0);
2567         else {
2568             ret = reg_node(pRExC_state, BRANCH);
2569             Set_Node_Length(ret, 1);
2570         }
2571     }
2572         
2573     if (!first && SIZE_ONLY)
2574         RExC_extralen += 1;                     /* BRANCHJ */
2575
2576     *flagp = WORST;                     /* Tentatively. */
2577
2578     RExC_parse--;
2579     nextchar(pRExC_state);
2580     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2581         flags &= ~TRYAGAIN;
2582         latest = regpiece(pRExC_state, &flags);
2583         if (latest == NULL) {
2584             if (flags & TRYAGAIN)
2585                 continue;
2586             return(NULL);
2587         }
2588         else if (ret == NULL)
2589             ret = latest;
2590         *flagp |= flags&HASWIDTH;
2591         if (chain == NULL)      /* First piece. */
2592             *flagp |= flags&SPSTART;
2593         else {
2594             RExC_naughty++;
2595             regtail(pRExC_state, chain, latest);
2596         }
2597         chain = latest;
2598         c++;
2599     }
2600     if (chain == NULL) {        /* Loop ran zero times. */
2601         chain = reg_node(pRExC_state, NOTHING);
2602         if (ret == NULL)
2603             ret = chain;
2604     }
2605     if (c == 1) {
2606         *flagp |= flags&SIMPLE;
2607     }
2608
2609     return(ret);
2610 }
2611
2612 /*
2613  - regpiece - something followed by possible [*+?]
2614  *
2615  * Note that the branching code sequences used for ? and the general cases
2616  * of * and + are somewhat optimized:  they use the same NOTHING node as
2617  * both the endmarker for their branch list and the body of the last branch.
2618  * It might seem that this node could be dispensed with entirely, but the
2619  * endmarker role is not redundant.
2620  */
2621 STATIC regnode *
2622 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2623 {
2624     register regnode *ret;
2625     register char op;
2626     register char *next;
2627     I32 flags;
2628     char *origparse = RExC_parse;
2629     char *maxpos;
2630     I32 min;
2631     I32 max = REG_INFTY;
2632     char *parse_start;
2633
2634     ret = regatom(pRExC_state, &flags);
2635     if (ret == NULL) {
2636         if (flags & TRYAGAIN)
2637             *flagp |= TRYAGAIN;
2638         return(NULL);
2639     }
2640
2641     op = *RExC_parse;
2642
2643     if (op == '{' && regcurly(RExC_parse)) {
2644         parse_start = RExC_parse; /* MJD */
2645         next = RExC_parse + 1;
2646         maxpos = Nullch;
2647         while (isDIGIT(*next) || *next == ',') {
2648             if (*next == ',') {
2649                 if (maxpos)
2650                     break;
2651                 else
2652                     maxpos = next;
2653             }
2654             next++;
2655         }
2656         if (*next == '}') {             /* got one */
2657             if (!maxpos)
2658                 maxpos = next;
2659             RExC_parse++;
2660             min = atoi(RExC_parse);
2661             if (*maxpos == ',')
2662                 maxpos++;
2663             else
2664                 maxpos = RExC_parse;
2665             max = atoi(maxpos);
2666             if (!max && *maxpos != '0')
2667                 max = REG_INFTY;                /* meaning "infinity" */
2668             else if (max >= REG_INFTY)
2669                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2670             RExC_parse = next;
2671             nextchar(pRExC_state);
2672
2673         do_curly:
2674             if ((flags&SIMPLE)) {
2675                 RExC_naughty += 2 + RExC_naughty / 2;
2676                 reginsert(pRExC_state, CURLY, ret);
2677                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2678                 Set_Node_Cur_Length(ret);
2679             }
2680             else {
2681                 regnode *w = reg_node(pRExC_state, WHILEM);
2682
2683                 w->flags = 0;
2684                 regtail(pRExC_state, ret, w);
2685                 if (!SIZE_ONLY && RExC_extralen) {
2686                     reginsert(pRExC_state, LONGJMP,ret);
2687                     reginsert(pRExC_state, NOTHING,ret);
2688                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2689                 }
2690                 reginsert(pRExC_state, CURLYX,ret);
2691                                 /* MJD hk */
2692                 Set_Node_Offset(ret, parse_start+1);
2693                 Set_Node_Length(ret, 
2694                                 op == '{' ? (RExC_parse - parse_start) : 1);
2695                 
2696                 if (!SIZE_ONLY && RExC_extralen)
2697                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2698                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2699                 if (SIZE_ONLY)
2700                     RExC_whilem_seen++, RExC_extralen += 3;
2701                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2702             }
2703             ret->flags = 0;
2704
2705             if (min > 0)
2706                 *flagp = WORST;
2707             if (max > 0)
2708                 *flagp |= HASWIDTH;
2709             if (max && max < min)
2710                 vFAIL("Can't do {n,m} with n > m");
2711             if (!SIZE_ONLY) {
2712                 ARG1_SET(ret, (U16)min);
2713                 ARG2_SET(ret, (U16)max);
2714             }
2715
2716             goto nest_check;
2717         }
2718     }
2719
2720     if (!ISMULT1(op)) {
2721         *flagp = flags;
2722         return(ret);
2723     }
2724
2725 #if 0                           /* Now runtime fix should be reliable. */
2726
2727     /* if this is reinstated, don't forget to put this back into perldiag:
2728
2729             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2730
2731            (F) The part of the regexp subject to either the * or + quantifier
2732            could match an empty string. The {#} shows in the regular
2733            expression about where the problem was discovered.
2734
2735     */
2736
2737     if (!(flags&HASWIDTH) && op != '?')
2738       vFAIL("Regexp *+ operand could be empty");
2739 #endif
2740
2741     parse_start = RExC_parse;
2742     nextchar(pRExC_state);
2743
2744     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2745
2746     if (op == '*' && (flags&SIMPLE)) {
2747         reginsert(pRExC_state, STAR, ret);
2748         ret->flags = 0;
2749         RExC_naughty += 4;
2750     }
2751     else if (op == '*') {
2752         min = 0;
2753         goto do_curly;
2754     }
2755     else if (op == '+' && (flags&SIMPLE)) {
2756         reginsert(pRExC_state, PLUS, ret);
2757         ret->flags = 0;
2758         RExC_naughty += 3;
2759     }
2760     else if (op == '+') {
2761         min = 1;
2762         goto do_curly;
2763     }
2764     else if (op == '?') {
2765         min = 0; max = 1;
2766         goto do_curly;
2767     }
2768   nest_check:
2769     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2770         vWARN3(RExC_parse,
2771                "%.*s matches null string many times",
2772                RExC_parse - origparse,
2773                origparse);
2774     }
2775
2776     if (*RExC_parse == '?') {
2777         nextchar(pRExC_state);
2778         reginsert(pRExC_state, MINMOD, ret);
2779         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2780     }
2781     if (ISMULT2(RExC_parse)) {
2782         RExC_parse++;
2783         vFAIL("Nested quantifiers");
2784     }
2785
2786     return(ret);
2787 }
2788
2789 /*
2790  - regatom - the lowest level
2791  *
2792  * Optimization:  gobbles an entire sequence of ordinary characters so that
2793  * it can turn them into a single node, which is smaller to store and
2794  * faster to run.  Backslashed characters are exceptions, each becoming a
2795  * separate node; the code is simpler that way and it's not worth fixing.
2796  *
2797  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2798 STATIC regnode *
2799 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2800 {
2801     register regnode *ret = 0;
2802     I32 flags;
2803     char *parse_start = RExC_parse;
2804
2805     *flagp = WORST;             /* Tentatively. */
2806
2807 tryagain:
2808     switch (*RExC_parse) {
2809     case '^':
2810         RExC_seen_zerolen++;
2811         nextchar(pRExC_state);
2812         if (RExC_flags & PMf_MULTILINE)
2813             ret = reg_node(pRExC_state, MBOL);
2814         else if (RExC_flags & PMf_SINGLELINE)
2815             ret = reg_node(pRExC_state, SBOL);
2816         else
2817             ret = reg_node(pRExC_state, BOL);
2818         Set_Node_Length(ret, 1); /* MJD */
2819         break;
2820     case '$':
2821         nextchar(pRExC_state);
2822         if (*RExC_parse)
2823             RExC_seen_zerolen++;
2824         if (RExC_flags & PMf_MULTILINE)
2825             ret = reg_node(pRExC_state, MEOL);
2826         else if (RExC_flags & PMf_SINGLELINE)
2827             ret = reg_node(pRExC_state, SEOL);
2828         else
2829             ret = reg_node(pRExC_state, EOL);
2830         Set_Node_Length(ret, 1); /* MJD */
2831         break;
2832     case '.':
2833         nextchar(pRExC_state);
2834         if (RExC_flags & PMf_SINGLELINE)
2835             ret = reg_node(pRExC_state, SANY);
2836         else
2837             ret = reg_node(pRExC_state, REG_ANY);
2838         *flagp |= HASWIDTH|SIMPLE;
2839         RExC_naughty++;
2840         Set_Node_Length(ret, 1); /* MJD */
2841         break;
2842     case '[':
2843     {
2844         char *oregcomp_parse = ++RExC_parse;
2845         ret = regclass(pRExC_state);
2846         if (*RExC_parse != ']') {
2847             RExC_parse = oregcomp_parse;
2848             vFAIL("Unmatched [");
2849         }
2850         nextchar(pRExC_state);
2851         *flagp |= HASWIDTH|SIMPLE;
2852         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2853         break;
2854     }
2855     case '(':
2856         nextchar(pRExC_state);
2857         ret = reg(pRExC_state, 1, &flags);
2858         if (ret == NULL) {
2859                 if (flags & TRYAGAIN) {
2860                     if (RExC_parse == RExC_end) {
2861                          /* Make parent create an empty node if needed. */
2862                         *flagp |= TRYAGAIN;
2863                         return(NULL);
2864                     }
2865                     goto tryagain;
2866                 }
2867                 return(NULL);
2868         }
2869         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2870         break;
2871     case '|':
2872     case ')':
2873         if (flags & TRYAGAIN) {
2874             *flagp |= TRYAGAIN;
2875             return NULL;
2876         }
2877         vFAIL("Internal urp");
2878                                 /* Supposed to be caught earlier. */
2879         break;
2880     case '{':
2881         if (!regcurly(RExC_parse)) {
2882             RExC_parse++;
2883             goto defchar;
2884         }
2885         /* FALL THROUGH */
2886     case '?':
2887     case '+':
2888     case '*':
2889         RExC_parse++;
2890         vFAIL("Quantifier follows nothing");
2891         break;
2892     case '\\':
2893         switch (*++RExC_parse) {
2894         case 'A':
2895             RExC_seen_zerolen++;
2896             ret = reg_node(pRExC_state, SBOL);
2897             *flagp |= SIMPLE;
2898             nextchar(pRExC_state);
2899             Set_Node_Length(ret, 2); /* MJD */
2900             break;
2901         case 'G':
2902             ret = reg_node(pRExC_state, GPOS);
2903             RExC_seen |= REG_SEEN_GPOS;
2904             *flagp |= SIMPLE;
2905             nextchar(pRExC_state);
2906             Set_Node_Length(ret, 2); /* MJD */
2907             break;
2908         case 'Z':
2909             ret = reg_node(pRExC_state, SEOL);
2910             *flagp |= SIMPLE;
2911             RExC_seen_zerolen++;                /* Do not optimize RE away */
2912             nextchar(pRExC_state);
2913             break;
2914         case 'z':
2915             ret = reg_node(pRExC_state, EOS);
2916             *flagp |= SIMPLE;
2917             RExC_seen_zerolen++;                /* Do not optimize RE away */
2918             nextchar(pRExC_state);
2919             Set_Node_Length(ret, 2); /* MJD */
2920             break;
2921         case 'C':
2922             ret = reg_node(pRExC_state, CANY);
2923             RExC_seen |= REG_SEEN_CANY;
2924             *flagp |= HASWIDTH|SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'X':
2929             ret = reg_node(pRExC_state, CLUMP);
2930             *flagp |= HASWIDTH;
2931             nextchar(pRExC_state);
2932             Set_Node_Length(ret, 2); /* MJD */
2933             break;
2934         case 'w':
2935             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2936             *flagp |= HASWIDTH|SIMPLE;
2937             nextchar(pRExC_state);
2938             Set_Node_Length(ret, 2); /* MJD */
2939             break;
2940         case 'W':
2941             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2942             *flagp |= HASWIDTH|SIMPLE;
2943             nextchar(pRExC_state);
2944             Set_Node_Length(ret, 2); /* MJD */
2945             break;
2946         case 'b':
2947             RExC_seen_zerolen++;
2948             RExC_seen |= REG_SEEN_LOOKBEHIND;
2949             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2950             *flagp |= SIMPLE;
2951             nextchar(pRExC_state);
2952             Set_Node_Length(ret, 2); /* MJD */
2953             break;
2954         case 'B':
2955             RExC_seen_zerolen++;
2956             RExC_seen |= REG_SEEN_LOOKBEHIND;
2957             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2958             *flagp |= SIMPLE;
2959             nextchar(pRExC_state);
2960             Set_Node_Length(ret, 2); /* MJD */
2961             break;
2962         case 's':
2963             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2964             *flagp |= HASWIDTH|SIMPLE;
2965             nextchar(pRExC_state);
2966             Set_Node_Length(ret, 2); /* MJD */
2967             break;
2968         case 'S':
2969             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2970             *flagp |= HASWIDTH|SIMPLE;
2971             nextchar(pRExC_state);
2972             Set_Node_Length(ret, 2); /* MJD */
2973             break;
2974         case 'd':
2975             ret = reg_node(pRExC_state, DIGIT);
2976             *flagp |= HASWIDTH|SIMPLE;
2977             nextchar(pRExC_state);
2978             Set_Node_Length(ret, 2); /* MJD */
2979             break;
2980         case 'D':
2981             ret = reg_node(pRExC_state, NDIGIT);
2982             *flagp |= HASWIDTH|SIMPLE;
2983             nextchar(pRExC_state);
2984             Set_Node_Length(ret, 2); /* MJD */
2985             break;
2986         case 'p':
2987         case 'P':
2988             {   
2989                 char* oldregxend = RExC_end;
2990                 char* parse_start = RExC_parse - 2;
2991
2992                 if (RExC_parse[1] == '{') {
2993                   /* a lovely hack--pretend we saw [\pX] instead */
2994                     RExC_end = strchr(RExC_parse, '}');
2995                     if (!RExC_end) {
2996                         U8 c = (U8)*RExC_parse;
2997                         RExC_parse += 2;
2998                         RExC_end = oldregxend;
2999                         vFAIL2("Missing right brace on \\%c{}", c);
3000                     }
3001                     RExC_end++;
3002                 }
3003                 else {
3004                     RExC_end = RExC_parse + 2;
3005                     if (RExC_end > oldregxend)
3006                         RExC_end = oldregxend;
3007                 }
3008                 RExC_parse--;
3009
3010                 ret = regclass(pRExC_state);
3011
3012                 RExC_end = oldregxend;
3013                 RExC_parse--;
3014
3015                 Set_Node_Offset(ret, parse_start + 2);
3016                 Set_Node_Cur_Length(ret);
3017                 nextchar(pRExC_state);
3018                 *flagp |= HASWIDTH|SIMPLE;
3019             }
3020             break;
3021         case 'n':
3022         case 'r':
3023         case 't':
3024         case 'f':
3025         case 'e':
3026         case 'a':
3027         case 'x':
3028         case 'c':
3029         case '0':
3030             goto defchar;
3031         case '1': case '2': case '3': case '4':
3032         case '5': case '6': case '7': case '8': case '9':
3033             {
3034                 I32 num = atoi(RExC_parse);
3035
3036                 if (num > 9 && num >= RExC_npar)
3037                     goto defchar;
3038                 else {
3039                     char * parse_start = RExC_parse - 1; /* MJD */
3040                     while (isDIGIT(*RExC_parse))
3041                         RExC_parse++;
3042
3043                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3044                         vFAIL("Reference to nonexistent group");
3045                     RExC_sawback = 1;
3046                     ret = reganode(pRExC_state,
3047                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3048                                    num);
3049                     *flagp |= HASWIDTH;
3050                     
3051                     /* override incorrect value set in reganode MJD */
3052                     Set_Node_Offset(ret, parse_start+1); 
3053                     Set_Node_Cur_Length(ret); /* MJD */
3054                     RExC_parse--;
3055                     nextchar(pRExC_state);
3056                 }
3057             }
3058             break;
3059         case '\0':
3060             if (RExC_parse >= RExC_end)
3061                 FAIL("Trailing \\");
3062             /* FALL THROUGH */
3063         default:
3064             /* Do not generate `unrecognized' warnings here, we fall
3065                back into the quick-grab loop below */
3066             parse_start--;
3067             goto defchar;
3068         }
3069         break;
3070
3071     case '#':
3072         if (RExC_flags & PMf_EXTENDED) {
3073             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3074             if (RExC_parse < RExC_end)
3075                 goto tryagain;
3076         }
3077         /* FALL THROUGH */
3078
3079     default: {
3080             register STRLEN len;
3081             register UV ender;
3082             register char *p;
3083             char *oldp, *s;
3084             STRLEN numlen;
3085             STRLEN foldlen;
3086             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3087
3088             parse_start = RExC_parse - 1;
3089
3090             RExC_parse++;
3091
3092         defchar:
3093             ender = 0;
3094             ret = reg_node(pRExC_state,
3095                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3096             s = STRING(ret);
3097             for (len = 0, p = RExC_parse - 1;
3098               len < 127 && p < RExC_end;
3099               len++)
3100             {
3101                 oldp = p;
3102
3103                 if (RExC_flags & PMf_EXTENDED)
3104                     p = regwhite(p, RExC_end);
3105                 switch (*p) {
3106                 case '^':
3107                 case '$':
3108                 case '.':
3109                 case '[':
3110                 case '(':
3111                 case ')':
3112                 case '|':
3113                     goto loopdone;
3114                 case '\\':
3115                     switch (*++p) {
3116                     case 'A':
3117                     case 'C':
3118                     case 'X':
3119                     case 'G':
3120                     case 'Z':
3121                     case 'z':
3122                     case 'w':
3123                     case 'W':
3124                     case 'b':
3125                     case 'B':
3126                     case 's':
3127                     case 'S':
3128                     case 'd':
3129                     case 'D':
3130                     case 'p':
3131                     case 'P':
3132                         --p;
3133                         goto loopdone;
3134                     case 'n':
3135                         ender = '\n';
3136                         p++;
3137                         break;
3138                     case 'r':
3139                         ender = '\r';
3140                         p++;
3141                         break;
3142                     case 't':
3143                         ender = '\t';
3144                         p++;
3145                         break;
3146                     case 'f':
3147                         ender = '\f';
3148                         p++;
3149                         break;
3150                     case 'e':
3151                           ender = ASCII_TO_NATIVE('\033');
3152                         p++;
3153                         break;
3154                     case 'a':
3155                           ender = ASCII_TO_NATIVE('\007');
3156                         p++;
3157                         break;
3158                     case 'x':
3159                         if (*++p == '{') {
3160                             char* e = strchr(p, '}');
3161         
3162                             if (!e) {
3163                                 RExC_parse = p + 1;
3164                                 vFAIL("Missing right brace on \\x{}");
3165                             }
3166                             else {
3167                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3168                                     | PERL_SCAN_DISALLOW_PREFIX;
3169                                 numlen = e - p - 1;
3170                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3171                                 if (ender > 0xff)
3172                                     RExC_utf8 = 1;
3173                                 p = e + 1;
3174                             }
3175                         }
3176                         else {
3177                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3178                             numlen = 2;
3179                             ender = grok_hex(p, &numlen, &flags, NULL);
3180                             p += numlen;
3181                         }
3182                         break;
3183                     case 'c':
3184                         p++;
3185                         ender = UCHARAT(p++);
3186                         ender = toCTRL(ender);
3187                         break;
3188                     case '0': case '1': case '2': case '3':case '4':
3189                     case '5': case '6': case '7': case '8':case '9':
3190                         if (*p == '0' ||
3191                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3192                             I32 flags = 0;
3193                             numlen = 3;
3194                             ender = grok_oct(p, &numlen, &flags, NULL);
3195                             p += numlen;
3196                         }
3197                         else {
3198                             --p;
3199                             goto loopdone;
3200                         }
3201                         break;
3202                     case '\0':
3203                         if (p >= RExC_end)
3204                             FAIL("Trailing \\");
3205                         /* FALL THROUGH */
3206                     default:
3207                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3208                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3209                         goto normal_default;
3210                     }
3211                     break;
3212                 default:
3213                   normal_default:
3214                     if (UTF8_IS_START(*p) && UTF) {
3215                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3216                                                &numlen, 0);
3217                         p += numlen;
3218                     }
3219                     else
3220                         ender = *p++;
3221                     break;
3222                 }
3223                 if (RExC_flags & PMf_EXTENDED)
3224                     p = regwhite(p, RExC_end);
3225                 if (UTF && FOLD) {
3226                     /* Prime the casefolded buffer. */
3227                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3228                 }
3229                 if (ISMULT2(p)) { /* Back off on ?+*. */
3230                     if (len)
3231                         p = oldp;
3232                     else if (UTF) {
3233                          STRLEN unilen;
3234
3235                          if (FOLD) {
3236                               /* Emit all the Unicode characters. */
3237                               for (foldbuf = tmpbuf;
3238                                    foldlen;
3239                                    foldlen -= numlen) {
3240                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3241                                    if (numlen > 0) {
3242                                         reguni(pRExC_state, ender, s, &unilen);
3243                                         s       += unilen;
3244                                         len     += unilen;
3245                                         /* In EBCDIC the numlen
3246                                          * and unilen can differ. */
3247                                         foldbuf += numlen;
3248                                         if (numlen >= foldlen)
3249                                              break;
3250                                    }
3251                                    else
3252                                         break; /* "Can't happen." */
3253                               }
3254                          }
3255                          else {
3256                               reguni(pRExC_state, ender, s, &unilen);
3257                               if (unilen > 0) {
3258                                    s   += unilen;
3259                                    len += unilen;
3260                               }
3261                          }
3262                     }
3263                     else {
3264                         len++;
3265                         REGC((char)ender, s++);
3266                     }
3267                     break;
3268                 }
3269                 if (UTF) {
3270                      STRLEN unilen;
3271
3272                      if (FOLD) {
3273                           /* Emit all the Unicode characters. */
3274                           for (foldbuf = tmpbuf;
3275                                foldlen;
3276                                foldlen -= numlen) {
3277                                ender = utf8_to_uvchr(foldbuf, &numlen);
3278                                if (numlen > 0) {
3279                                     reguni(pRExC_state, ender, s, &unilen);
3280                                     len     += unilen;
3281                                     s       += unilen;
3282                                     /* In EBCDIC the numlen
3283                                      * and unilen can differ. */
3284                                     foldbuf += numlen;
3285                                     if (numlen >= foldlen)
3286                                          break;
3287                                }
3288                                else
3289                                     break;
3290                           }
3291                      }
3292                      else {
3293                           reguni(pRExC_state, ender, s, &unilen);
3294                           if (unilen > 0) {
3295                                s   += unilen;
3296                                len += unilen;
3297                           }
3298                      }
3299                      len--;
3300                 }
3301                 else
3302                     REGC((char)ender, s++);
3303             }
3304         loopdone:
3305             RExC_parse = p - 1;
3306             Set_Node_Cur_Length(ret); /* MJD */
3307             nextchar(pRExC_state);
3308             {
3309                 /* len is STRLEN which is unsigned, need to copy to signed */
3310                 IV iv = len;
3311                 if (iv < 0)
3312                     vFAIL("Internal disaster");
3313             }
3314             if (len > 0)
3315                 *flagp |= HASWIDTH;
3316             if (len == 1 && UNI_IS_INVARIANT(ender))
3317                 *flagp |= SIMPLE;
3318             if (!SIZE_ONLY)
3319                 STR_LEN(ret) = len;
3320             if (SIZE_ONLY)
3321                 RExC_size += STR_SZ(len);
3322             else
3323                 RExC_emit += STR_SZ(len);
3324         }
3325         break;
3326     }
3327
3328     /* If the encoding pragma is in effect recode the text of
3329      * any EXACT-kind nodes. */
3330     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331         STRLEN oldlen = STR_LEN(ret);
3332         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3333
3334         if (RExC_utf8)
3335             SvUTF8_on(sv);
3336         if (sv_utf8_downgrade(sv, TRUE)) {
3337             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3338             STRLEN newlen = SvCUR(sv);
3339
3340             if (SvUTF8(sv))
3341                 RExC_utf8 = 1;
3342             if (!SIZE_ONLY) {
3343                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344                                       (int)oldlen, STRING(ret),
3345                                       (int)newlen, s));
3346                 Copy(s, STRING(ret), newlen, char);
3347                 STR_LEN(ret) += newlen - oldlen;
3348                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3349             } else
3350                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3351         }
3352     }
3353
3354     return(ret);
3355 }
3356
3357 STATIC char *
3358 S_regwhite(pTHX_ char *p, char *e)
3359 {
3360     while (p < e) {
3361         if (isSPACE(*p))
3362             ++p;
3363         else if (*p == '#') {
3364             do {
3365                 p++;
3366             } while (p < e && *p != '\n');
3367         }
3368         else
3369             break;
3370     }
3371     return p;
3372 }
3373
3374 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375    Character classes ([:foo:]) can also be negated ([:^foo:]).
3376    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378    but trigger failures because they are currently unimplemented. */
3379
3380 #define POSIXCC_DONE(c)   ((c) == ':')
3381 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3383
3384 STATIC I32
3385 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3386 {
3387     char *posixcc = 0;
3388     I32 namedclass = OOB_NAMEDCLASS;
3389
3390     if (value == '[' && RExC_parse + 1 < RExC_end &&
3391         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392         POSIXCC(UCHARAT(RExC_parse))) {
3393         char  c = UCHARAT(RExC_parse);
3394         char* s = RExC_parse++;
3395         
3396         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3397             RExC_parse++;
3398         if (RExC_parse == RExC_end)
3399             /* Grandfather lone [:, [=, [. */
3400             RExC_parse = s;
3401         else {
3402             char* t = RExC_parse++; /* skip over the c */
3403
3404             if (UCHARAT(RExC_parse) == ']') {
3405                 RExC_parse++; /* skip over the ending ] */
3406                 posixcc = s + 1;
3407                 if (*s == ':') {
3408                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3409                     I32 skip = 5; /* the most common skip */
3410
3411                     switch (*posixcc) {
3412                     case 'a':
3413                         if (strnEQ(posixcc, "alnum", 5))
3414                             namedclass =
3415                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3416                         else if (strnEQ(posixcc, "alpha", 5))
3417                             namedclass =
3418                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3419                         else if (strnEQ(posixcc, "ascii", 5))
3420                             namedclass =
3421                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3422                         break;
3423                     case 'b':
3424                         if (strnEQ(posixcc, "blank", 5))
3425                             namedclass =
3426                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3427                         break;
3428                     case 'c':
3429                         if (strnEQ(posixcc, "cntrl", 5))
3430                             namedclass =
3431                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3432                         break;
3433                     case 'd':
3434                         if (strnEQ(posixcc, "digit", 5))
3435                             namedclass =
3436                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3437                         break;
3438                     case 'g':
3439                         if (strnEQ(posixcc, "graph", 5))
3440                             namedclass =
3441                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3442                         break;
3443                     case 'l':
3444                         if (strnEQ(posixcc, "lower", 5))
3445                             namedclass =
3446                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3447                         break;
3448                     case 'p':
3449                         if (strnEQ(posixcc, "print", 5))
3450                             namedclass =
3451                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3452                         else if (strnEQ(posixcc, "punct", 5))
3453                             namedclass =
3454                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3455                         break;
3456                     case 's':
3457                         if (strnEQ(posixcc, "space", 5))
3458                             namedclass =
3459                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3460                         break;
3461                     case 'u':
3462                         if (strnEQ(posixcc, "upper", 5))
3463                             namedclass =
3464                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3465                         break;
3466                     case 'w': /* this is not POSIX, this is the Perl \w */
3467                         if (strnEQ(posixcc, "word", 4)) {
3468                             namedclass =
3469                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3470                             skip = 4;
3471                         }
3472                         break;
3473                     case 'x':
3474                         if (strnEQ(posixcc, "xdigit", 6)) {
3475                             namedclass =
3476                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3477                             skip = 6;
3478                         }
3479                         break;
3480                     }
3481                     if (namedclass == OOB_NAMEDCLASS ||
3482                         posixcc[skip] != ':' ||
3483                         posixcc[skip+1] != ']')
3484                     {
3485                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3486                                       t - s - 1, s + 1);
3487                     }
3488                 } else if (!SIZE_ONLY) {
3489                     /* [[=foo=]] and [[.foo.]] are still future. */
3490
3491                     /* adjust RExC_parse so the warning shows after
3492                        the class closes */
3493                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3494                         RExC_parse++;
3495                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3496                 }
3497             } else {
3498                 /* Maternal grandfather:
3499                  * "[:" ending in ":" but not in ":]" */
3500                 RExC_parse = s;
3501             }
3502         }
3503     }
3504
3505     return namedclass;
3506 }
3507
3508 STATIC void
3509 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3510 {
3511     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3512         char *s = RExC_parse;
3513         char  c = *s++;
3514
3515         while(*s && isALNUM(*s))
3516             s++;
3517         if (*s && c == *s && s[1] == ']') {
3518             if (ckWARN(WARN_REGEXP))
3519                 vWARN3(s+2,
3520                         "POSIX syntax [%c %c] belongs inside character classes",
3521                         c, c);
3522
3523             /* [[=foo=]] and [[.foo.]] are still future. */
3524             if (POSIXCC_NOTYET(c)) {
3525                 /* adjust RExC_parse so the error shows after
3526                    the class closes */
3527                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3528                     ;
3529                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3530             }
3531         }
3532     }
3533 }
3534
3535 STATIC regnode *
3536 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3537 {
3538     register UV value;
3539     register UV nextvalue;
3540     register IV prevvalue = OOB_UNICODE;
3541     register IV range = 0;
3542     register regnode *ret;
3543     STRLEN numlen;
3544     IV namedclass;
3545     char *rangebegin = 0;
3546     bool need_class = 0;
3547     SV *listsv = Nullsv;
3548     register char *e;
3549     UV n;
3550     bool optimize_invert   = TRUE;
3551     AV* unicode_alternate  = 0;
3552 #ifdef EBCDIC
3553     UV literal_endpoint = 0;
3554 #endif
3555
3556     ret = reganode(pRExC_state, ANYOF, 0);
3557
3558     if (!SIZE_ONLY)
3559         ANYOF_FLAGS(ret) = 0;
3560
3561     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3562         RExC_naughty++;
3563         RExC_parse++;
3564         if (!SIZE_ONLY)
3565             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3566     }
3567
3568     if (SIZE_ONLY)
3569         RExC_size += ANYOF_SKIP;
3570     else {
3571         RExC_emit += ANYOF_SKIP;
3572         if (FOLD)
3573             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3574         if (LOC)
3575             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3576         ANYOF_BITMAP_ZERO(ret);
3577         listsv = newSVpvn("# comment\n", 10);
3578     }
3579
3580     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3581
3582     if (!SIZE_ONLY && POSIXCC(nextvalue))
3583         checkposixcc(pRExC_state);
3584
3585     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3586     if (UCHARAT(RExC_parse) == ']')
3587         goto charclassloop;
3588
3589     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3590
3591     charclassloop:
3592
3593         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3594
3595         if (!range)
3596             rangebegin = RExC_parse;
3597         if (UTF) {
3598             value = utf8n_to_uvchr((U8*)RExC_parse,
3599                                    RExC_end - RExC_parse,
3600                                    &numlen, 0);
3601             RExC_parse += numlen;
3602         }
3603         else
3604             value = UCHARAT(RExC_parse++);
3605         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606         if (value == '[' && POSIXCC(nextvalue))
3607             namedclass = regpposixcc(pRExC_state, value);
3608         else if (value == '\\') {
3609             if (UTF) {
3610                 value = utf8n_to_uvchr((U8*)RExC_parse,
3611                                    RExC_end - RExC_parse,
3612                                    &numlen, 0);
3613                 RExC_parse += numlen;
3614             }
3615             else
3616                 value = UCHARAT(RExC_parse++);
3617             /* Some compilers cannot handle switching on 64-bit integer
3618              * values, therefore value cannot be an UV.  Yes, this will
3619              * be a problem later if we want switch on Unicode.
3620              * A similar issue a little bit later when switching on
3621              * namedclass. --jhi */
3622             switch ((I32)value) {
3623             case 'w':   namedclass = ANYOF_ALNUM;       break;
3624             case 'W':   namedclass = ANYOF_NALNUM;      break;
3625             case 's':   namedclass = ANYOF_SPACE;       break;
3626             case 'S':   namedclass = ANYOF_NSPACE;      break;
3627             case 'd':   namedclass = ANYOF_DIGIT;       break;
3628             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3629             case 'p':
3630             case 'P':
3631                 if (RExC_parse >= RExC_end)
3632                     vFAIL2("Empty \\%c{}", (U8)value);
3633                 if (*RExC_parse == '{') {
3634                     U8 c = (U8)value;
3635                     e = strchr(RExC_parse++, '}');
3636                     if (!e)
3637                         vFAIL2("Missing right brace on \\%c{}", c);
3638                     while (isSPACE(UCHARAT(RExC_parse)))
3639                         RExC_parse++;
3640                     if (e == RExC_parse)
3641                         vFAIL2("Empty \\%c{}", c);
3642                     n = e - RExC_parse;
3643                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3644                         n--;
3645                 }
3646                 else {
3647                     e = RExC_parse;
3648                     n = 1;
3649                 }
3650                 if (!SIZE_ONLY) {
3651                     if (UCHARAT(RExC_parse) == '^') {
3652                          RExC_parse++;
3653                          n--;
3654                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3655                          while (isSPACE(UCHARAT(RExC_parse))) {
3656                               RExC_parse++;
3657                               n--;
3658                          }
3659                     }
3660                     if (value == 'p')
3661                          Perl_sv_catpvf(aTHX_ listsv,
3662                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3663                     else
3664                          Perl_sv_catpvf(aTHX_ listsv,
3665                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3666                 }
3667                 RExC_parse = e + 1;
3668                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3669                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3670                 break;
3671             case 'n':   value = '\n';                   break;
3672             case 'r':   value = '\r';                   break;
3673             case 't':   value = '\t';                   break;
3674             case 'f':   value = '\f';                   break;
3675             case 'b':   value = '\b';                   break;
3676             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3677             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3678             case 'x':
3679                 if (*RExC_parse == '{') {
3680                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3681                         | PERL_SCAN_DISALLOW_PREFIX;
3682                     e = strchr(RExC_parse++, '}');
3683                     if (!e)
3684                         vFAIL("Missing right brace on \\x{}");
3685
3686                     numlen = e - RExC_parse;
3687                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3688                     RExC_parse = e + 1;
3689                 }
3690                 else {
3691                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3692                     numlen = 2;
3693                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3694                     RExC_parse += numlen;
3695                 }
3696                 break;
3697             case 'c':
3698                 value = UCHARAT(RExC_parse++);
3699                 value = toCTRL(value);
3700                 break;
3701             case '0': case '1': case '2': case '3': case '4':
3702             case '5': case '6': case '7': case '8': case '9':
3703             {
3704                 I32 flags = 0;
3705                 numlen = 3;
3706                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3707                 RExC_parse += numlen;
3708                 break;
3709             }
3710             default:
3711                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3712                     vWARN2(RExC_parse,
3713                            "Unrecognized escape \\%c in character class passed through",
3714                            (int)value);
3715                 break;
3716             }
3717         } /* end of \blah */
3718 #ifdef EBCDIC
3719         else
3720             literal_endpoint++;
3721 #endif
3722
3723         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3724
3725             if (!SIZE_ONLY && !need_class)
3726                 ANYOF_CLASS_ZERO(ret);
3727
3728             need_class = 1;
3729
3730             /* a bad range like a-\d, a-[:digit:] ? */
3731             if (range) {
3732                 if (!SIZE_ONLY) {
3733                     if (ckWARN(WARN_REGEXP))
3734                         vWARN4(RExC_parse,
3735                                "False [] range \"%*.*s\"",
3736                                RExC_parse - rangebegin,
3737                                RExC_parse - rangebegin,
3738                                rangebegin);
3739                     if (prevvalue < 256) {
3740                         ANYOF_BITMAP_SET(ret, prevvalue);
3741                         ANYOF_BITMAP_SET(ret, '-');
3742                     }
3743                     else {
3744                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3745                         Perl_sv_catpvf(aTHX_ listsv,
3746                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3747                     }
3748                 }
3749
3750                 range = 0; /* this was not a true range */
3751             }
3752
3753             if (!SIZE_ONLY) {
3754                 if (namedclass > OOB_NAMEDCLASS)
3755                     optimize_invert = FALSE;
3756                 /* Possible truncation here but in some 64-bit environments
3757                  * the compiler gets heartburn about switch on 64-bit values.
3758                  * A similar issue a little earlier when switching on value.
3759                  * --jhi */
3760                 switch ((I32)namedclass) {
3761                 case ANYOF_ALNUM:
3762                     if (LOC)
3763                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3764                     else {
3765                         for (value = 0; value < 256; value++)
3766                             if (isALNUM(value))
3767                                 ANYOF_BITMAP_SET(ret, value);
3768                     }
3769                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3770                     break;
3771                 case ANYOF_NALNUM:
3772                     if (LOC)
3773                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3774                     else {
3775                         for (value = 0; value < 256; value++)
3776                             if (!isALNUM(value))
3777                                 ANYOF_BITMAP_SET(ret, value);
3778                     }
3779                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3780                     break;
3781                 case ANYOF_ALNUMC:
3782                     if (LOC)
3783                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3784                     else {
3785                         for (value = 0; value < 256; value++)
3786                             if (isALNUMC(value))
3787                                 ANYOF_BITMAP_SET(ret, value);
3788                     }
3789                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3790                     break;
3791                 case ANYOF_NALNUMC:
3792                     if (LOC)
3793                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3794                     else {
3795                         for (value = 0; value < 256; value++)
3796                             if (!isALNUMC(value))
3797                                 ANYOF_BITMAP_SET(ret, value);
3798                     }
3799                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3800                     break;
3801                 case ANYOF_ALPHA:
3802                     if (LOC)
3803                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3804                     else {
3805                         for (value = 0; value < 256; value++)
3806                             if (isALPHA(value))
3807                                 ANYOF_BITMAP_SET(ret, value);
3808                     }
3809                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3810                     break;
3811                 case ANYOF_NALPHA:
3812                     if (LOC)
3813                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3814                     else {
3815                         for (value = 0; value < 256; value++)
3816                             if (!isALPHA(value))
3817                                 ANYOF_BITMAP_SET(ret, value);
3818                     }
3819                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3820                     break;
3821                 case ANYOF_ASCII:
3822                     if (LOC)
3823                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3824                     else {
3825 #ifndef EBCDIC
3826                         for (value = 0; value < 128; value++)
3827                             ANYOF_BITMAP_SET(ret, value);
3828 #else  /* EBCDIC */
3829                         for (value = 0; value < 256; value++) {
3830                             if (isASCII(value))
3831                                 ANYOF_BITMAP_SET(ret, value);
3832                         }
3833 #endif /* EBCDIC */
3834                     }
3835                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3836                     break;
3837                 case ANYOF_NASCII:
3838                     if (LOC)
3839                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3840                     else {
3841 #ifndef EBCDIC
3842                         for (value = 128; value < 256; value++)
3843                             ANYOF_BITMAP_SET(ret, value);
3844 #else  /* EBCDIC */
3845                         for (value = 0; value < 256; value++) {
3846                             if (!isASCII(value))
3847                                 ANYOF_BITMAP_SET(ret, value);
3848                         }
3849 #endif /* EBCDIC */
3850                     }
3851                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3852                     break;
3853                 case ANYOF_BLANK:
3854                     if (LOC)
3855                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3856                     else {
3857                         for (value = 0; value < 256; value++)
3858                             if (isBLANK(value))
3859                                 ANYOF_BITMAP_SET(ret, value);
3860                     }
3861                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3862                     break;
3863                 case ANYOF_NBLANK:
3864                     if (LOC)
3865                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3866                     else {
3867                         for (value = 0; value < 256; value++)
3868                             if (!isBLANK(value))
3869                                 ANYOF_BITMAP_SET(ret, value);
3870                     }
3871                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3872                     break;
3873                 case ANYOF_CNTRL:
3874                     if (LOC)
3875                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3876                     else {
3877                         for (value = 0; value < 256; value++)
3878                             if (isCNTRL(value))
3879                                 ANYOF_BITMAP_SET(ret, value);
3880                     }
3881                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3882                     break;
3883                 case ANYOF_NCNTRL:
3884                     if (LOC)
3885                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3886                     else {
3887                         for (value = 0; value < 256; value++)
3888                             if (!isCNTRL(value))
3889                                 ANYOF_BITMAP_SET(ret, value);
3890                     }
3891                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3892                     break;
3893                 case ANYOF_DIGIT:
3894                     if (LOC)
3895                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3896                     else {
3897                         /* consecutive digits assumed */
3898                         for (value = '0'; value <= '9'; value++)
3899                             ANYOF_BITMAP_SET(ret, value);
3900                     }
3901                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3902                     break;
3903                 case ANYOF_NDIGIT:
3904                     if (LOC)
3905                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3906                     else {
3907                         /* consecutive digits assumed */
3908                         for (value = 0; value < '0'; value++)
3909                             ANYOF_BITMAP_SET(ret, value);
3910                         for (value = '9' + 1; value < 256; value++)
3911                             ANYOF_BITMAP_SET(ret, value);
3912                     }
3913                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3914                     break;
3915                 case ANYOF_GRAPH:
3916                     if (LOC)
3917                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3918                     else {
3919                         for (value = 0; value < 256; value++)
3920                             if (isGRAPH(value))
3921                                 ANYOF_BITMAP_SET(ret, value);
3922                     }
3923                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3924                     break;
3925                 case ANYOF_NGRAPH:
3926                     if (LOC)
3927                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3928                     else {
3929                         for (value = 0; value < 256; value++)
3930                             if (!isGRAPH(value))
3931                                 ANYOF_BITMAP_SET(ret, value);
3932                     }
3933                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3934                     break;
3935                 case ANYOF_LOWER:
3936                     if (LOC)
3937                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3938                     else {
3939                         for (value = 0; value < 256; value++)
3940                             if (isLOWER(value))
3941                                 ANYOF_BITMAP_SET(ret, value);
3942                     }
3943                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3944                     break;
3945                 case ANYOF_NLOWER:
3946                     if (LOC)
3947                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3948                     else {
3949                         for (value = 0; value < 256; value++)
3950                             if (!isLOWER(value))
3951                                 ANYOF_BITMAP_SET(ret, value);
3952                     }
3953                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3954                     break;
3955                 case ANYOF_PRINT:
3956                     if (LOC)
3957                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3958                     else {
3959                         for (value = 0; value < 256; value++)
3960                             if (isPRINT(value))
3961                                 ANYOF_BITMAP_SET(ret, value);
3962                     }
3963                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3964                     break;
3965                 case ANYOF_NPRINT:
3966                     if (LOC)
3967                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3968                     else {
3969                         for (value = 0; value < 256; value++)
3970                             if (!isPRINT(value))
3971                                 ANYOF_BITMAP_SET(ret, value);
3972                     }
3973                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3974                     break;
3975                 case ANYOF_PSXSPC:
3976                     if (LOC)
3977                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3978                     else {
3979                         for (value = 0; value < 256; value++)
3980                             if (isPSXSPC(value))
3981                                 ANYOF_BITMAP_SET(ret, value);
3982                     }
3983                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3984                     break;
3985                 case ANYOF_NPSXSPC:
3986                     if (LOC)
3987                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3988                     else {
3989                         for (value = 0; value < 256; value++)
3990                             if (!isPSXSPC(value))
3991                                 ANYOF_BITMAP_SET(ret, value);
3992                     }
3993                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3994                     break;
3995                 case ANYOF_PUNCT:
3996                     if (LOC)
3997                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3998                     else {
3999                         for (value = 0; value < 256; value++)
4000                             if (isPUNCT(value))
4001                                 ANYOF_BITMAP_SET(ret, value);
4002                     }
4003                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4004                     break;
4005                 case ANYOF_NPUNCT:
4006                     if (LOC)
4007                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4008                     else {
4009                         for (value = 0; value < 256; value++)
4010                             if (!isPUNCT(value))
4011                                 ANYOF_BITMAP_SET(ret, value);
4012                     }
4013                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4014                     break;
4015                 case ANYOF_SPACE:
4016                     if (LOC)
4017                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4018                     else {
4019                         for (value = 0; value < 256; value++)
4020                             if (isSPACE(value))
4021                                 ANYOF_BITMAP_SET(ret, value);
4022                     }
4023                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4024                     break;
4025                 case ANYOF_NSPACE:
4026                     if (LOC)
4027                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4028                     else {
4029                         for (value = 0; value < 256; value++)
4030                             if (!isSPACE(value))
4031                                 ANYOF_BITMAP_SET(ret, value);
4032                     }
4033                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4034                     break;
4035                 case ANYOF_UPPER:
4036                     if (LOC)
4037                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4038                     else {
4039                         for (value = 0; value < 256; value++)
4040                             if (isUPPER(value))
4041                                 ANYOF_BITMAP_SET(ret, value);
4042                     }
4043                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4044                     break;
4045                 case ANYOF_NUPPER:
4046                     if (LOC)
4047                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4048                     else {
4049                         for (value = 0; value < 256; value++)
4050                             if (!isUPPER(value))
4051                                 ANYOF_BITMAP_SET(ret, value);
4052                     }
4053                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4054                     break;
4055                 case ANYOF_XDIGIT:
4056                     if (LOC)
4057                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4058                     else {
4059                         for (value = 0; value < 256; value++)
4060                             if (isXDIGIT(value))
4061                                 ANYOF_BITMAP_SET(ret, value);
4062                     }
4063                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4064                     break;
4065                 case ANYOF_NXDIGIT:
4066                     if (LOC)
4067                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4068                     else {
4069                         for (value = 0; value < 256; value++)
4070                             if (!isXDIGIT(value))
4071                                 ANYOF_BITMAP_SET(ret, value);
4072                     }
4073                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4074                     break;
4075                 case ANYOF_MAX:
4076                     /* this is to handle \p and \P */
4077                     break;
4078                 default:
4079                     vFAIL("Invalid [::] class");
4080                     break;
4081                 }
4082                 if (LOC)
4083                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4084                 continue;
4085             }
4086         } /* end of namedclass \blah */
4087
4088         if (range) {
4089             if (prevvalue > (IV)value) /* b-a */ {
4090                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4091                               RExC_parse - rangebegin,
4092                               RExC_parse - rangebegin,
4093                               rangebegin);
4094                 range = 0; /* not a valid range */
4095             }
4096         }
4097         else {
4098             prevvalue = value; /* save the beginning of the range */
4099             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4100                 RExC_parse[1] != ']') {
4101                 RExC_parse++;
4102
4103                 /* a bad range like \w-, [:word:]- ? */
4104                 if (namedclass > OOB_NAMEDCLASS) {
4105                     if (ckWARN(WARN_REGEXP))
4106                         vWARN4(RExC_parse,
4107                                "False [] range \"%*.*s\"",
4108                                RExC_parse - rangebegin,
4109                                RExC_parse - rangebegin,
4110                                rangebegin);
4111                     if (!SIZE_ONLY)
4112                         ANYOF_BITMAP_SET(ret, '-');
4113                 } else
4114                     range = 1;  /* yeah, it's a range! */
4115                 continue;       /* but do it the next time */
4116             }
4117         }
4118
4119         /* now is the next time */
4120         if (!SIZE_ONLY) {
4121             IV i;
4122
4123             if (prevvalue < 256) {
4124                 IV ceilvalue = value < 256 ? value : 255;
4125
4126 #ifdef EBCDIC
4127                 /* In EBCDIC [\x89-\x91] should include
4128                  * the \x8e but [i-j] should not. */
4129                 if (literal_endpoint == 2 &&
4130                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4131                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4132                 {
4133                     if (isLOWER(prevvalue)) {
4134                         for (i = prevvalue; i <= ceilvalue; i++)
4135                             if (isLOWER(i))
4136                                 ANYOF_BITMAP_SET(ret, i);
4137                     } else {
4138                         for (i = prevvalue; i <= ceilvalue; i++)
4139                             if (isUPPER(i))
4140                                 ANYOF_BITMAP_SET(ret, i);
4141                     }
4142                 }
4143                 else
4144 #endif
4145                       for (i = prevvalue; i <= ceilvalue; i++)
4146                           ANYOF_BITMAP_SET(ret, i);
4147           }
4148           if (value > 255 || UTF) {
4149                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4150                 UV natvalue      = NATIVE_TO_UNI(value);
4151
4152                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4153                 if (prevnatvalue < natvalue) { /* what about > ? */
4154                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4155                                    prevnatvalue, natvalue);
4156                 }
4157                 else if (prevnatvalue == natvalue) {
4158                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4159                     if (FOLD) {
4160                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4161                          STRLEN foldlen;
4162                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4163
4164                          /* If folding and foldable and a single
4165                           * character, insert also the folded version
4166                           * to the charclass. */
4167                          if (f != value) {
4168                               if (foldlen == (STRLEN)UNISKIP(f))
4169                                   Perl_sv_catpvf(aTHX_ listsv,
4170                                                  "%04"UVxf"\n", f);
4171                               else {
4172                                   /* Any multicharacter foldings
4173                                    * require the following transform:
4174                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4175                                    * where E folds into "pq" and F folds
4176                                    * into "rst", all other characters
4177                                    * fold to single characters.  We save
4178                                    * away these multicharacter foldings,
4179                                    * to be later saved as part of the
4180                                    * additional "s" data. */
4181                                   SV *sv;
4182
4183                                   if (!unicode_alternate)
4184                                       unicode_alternate = newAV();
4185                                   sv = newSVpvn((char*)foldbuf, foldlen);
4186                                   SvUTF8_on(sv);
4187                                   av_push(unicode_alternate, sv);
4188                               }
4189                          }
4190
4191                          /* If folding and the value is one of the Greek
4192                           * sigmas insert a few more sigmas to make the
4193                           * folding rules of the sigmas to work right.
4194                           * Note that not all the possible combinations
4195                           * are handled here: some of them are handled
4196                           * by the standard folding rules, and some of
4197                           * them (literal or EXACTF cases) are handled
4198                           * during runtime in regexec.c:S_find_byclass(). */
4199                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4200                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4201                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4202                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4203                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4204                          }
4205                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4206                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4207                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4208                     }
4209                 }
4210             }
4211 #ifdef EBCDIC
4212             literal_endpoint = 0;
4213 #endif
4214         }
4215
4216         range = 0; /* this range (if it was one) is done now */
4217     }
4218
4219     if (need_class) {
4220         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4221         if (SIZE_ONLY)
4222             RExC_size += ANYOF_CLASS_ADD_SKIP;
4223         else
4224             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4225     }
4226
4227     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4228     if (!SIZE_ONLY &&
4229          /* If the only flag is folding (plus possibly inversion). */
4230         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4231        ) {
4232         for (value = 0; value < 256; ++value) {
4233             if (ANYOF_BITMAP_TEST(ret, value)) {
4234                 UV fold = PL_fold[value];
4235
4236                 if (fold != value)
4237                     ANYOF_BITMAP_SET(ret, fold);
4238             }
4239         }
4240         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4241     }
4242
4243     /* optimize inverted simple patterns (e.g. [^a-z]) */
4244     if (!SIZE_ONLY && optimize_invert &&
4245         /* If the only flag is inversion. */
4246         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4247         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4248             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4249         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4250     }
4251
4252     if (!SIZE_ONLY) {
4253         AV *av = newAV();
4254         SV *rv;
4255
4256         /* The 0th element stores the character class description
4257          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4258          * to initialize the appropriate swash (which gets stored in
4259          * the 1st element), and also useful for dumping the regnode.
4260          * The 2nd element stores the multicharacter foldings,
4261          * used later (regexec.c:S_reginclass()). */
4262         av_store(av, 0, listsv);
4263         av_store(av, 1, NULL);
4264         av_store(av, 2, (SV*)unicode_alternate);
4265         rv = newRV_noinc((SV*)av);
4266         n = add_data(pRExC_state, 1, "s");
4267         RExC_rx->data->data[n] = (void*)rv;
4268         ARG_SET(ret, n);
4269     }
4270
4271     return ret;
4272 }
4273
4274 STATIC char*
4275 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4276 {
4277     char* retval = RExC_parse++;
4278
4279     for (;;) {
4280         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4281                 RExC_parse[2] == '#') {
4282             while (*RExC_parse != ')') {
4283                 if (RExC_parse == RExC_end)
4284                     FAIL("Sequence (?#... not terminated");
4285                 RExC_parse++;
4286             }
4287             RExC_parse++;
4288             continue;
4289         }
4290         if (RExC_flags & PMf_EXTENDED) {
4291             if (isSPACE(*RExC_parse)) {
4292                 RExC_parse++;
4293                 continue;
4294             }
4295             else if (*RExC_parse == '#') {
4296                 while (RExC_parse < RExC_end)
4297                     if (*RExC_parse++ == '\n') break;
4298                 continue;
4299             }
4300         }
4301         return retval;
4302     }
4303 }
4304
4305 /*
4306 - reg_node - emit a node
4307 */
4308 STATIC regnode *                        /* Location. */
4309 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4310 {
4311     register regnode *ret;
4312     register regnode *ptr;
4313
4314     ret = RExC_emit;
4315     if (SIZE_ONLY) {
4316         SIZE_ALIGN(RExC_size);
4317         RExC_size += 1;
4318         return(ret);
4319     }
4320
4321     NODE_ALIGN_FILL(ret);
4322     ptr = ret;
4323     FILL_ADVANCE_NODE(ptr, op);
4324     if (RExC_offsets) {         /* MJD */
4325         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4326               "reg_node", __LINE__, 
4327               reg_name[op],
4328               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4329               ? "Overwriting end of array!\n" : "OK",
4330               RExC_emit - RExC_emit_start,
4331               RExC_parse - RExC_start,
4332               RExC_offsets[0])); 
4333         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4334     }
4335             
4336     RExC_emit = ptr;
4337
4338     return(ret);
4339 }
4340
4341 /*
4342 - reganode - emit a node with an argument
4343 */
4344 STATIC regnode *                        /* Location. */
4345 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4346 {
4347     register regnode *ret;
4348     register regnode *ptr;
4349
4350     ret = RExC_emit;
4351     if (SIZE_ONLY) {
4352         SIZE_ALIGN(RExC_size);
4353         RExC_size += 2;
4354         return(ret);
4355     }
4356
4357     NODE_ALIGN_FILL(ret);
4358     ptr = ret;
4359     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4360     if (RExC_offsets) {         /* MJD */
4361         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4362               "reganode",
4363               __LINE__,
4364               reg_name[op],
4365               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4366               "Overwriting end of array!\n" : "OK",
4367               RExC_emit - RExC_emit_start,
4368               RExC_parse - RExC_start,
4369               RExC_offsets[0])); 
4370         Set_Cur_Node_Offset;
4371     }
4372             
4373     RExC_emit = ptr;
4374
4375     return(ret);
4376 }
4377
4378 /*
4379 - reguni - emit (if appropriate) a Unicode character
4380 */
4381 STATIC void
4382 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4383 {
4384     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4385 }
4386
4387 /*
4388 - reginsert - insert an operator in front of already-emitted operand
4389 *
4390 * Means relocating the operand.
4391 */
4392 STATIC void
4393 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4394 {
4395     register regnode *src;
4396     register regnode *dst;
4397     register regnode *place;
4398     register int offset = regarglen[(U8)op];
4399
4400 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4401
4402     if (SIZE_ONLY) {
4403         RExC_size += NODE_STEP_REGNODE + offset;
4404         return;
4405     }
4406
4407     src = RExC_emit;
4408     RExC_emit += NODE_STEP_REGNODE + offset;
4409     dst = RExC_emit;
4410     while (src > opnd) {
4411         StructCopy(--src, --dst, regnode);
4412         if (RExC_offsets) {     /* MJD 20010112 */
4413             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4414                   "reg_insert",
4415                   __LINE__,
4416                   reg_name[op],
4417                   dst - RExC_emit_start > RExC_offsets[0] 
4418                   ? "Overwriting end of array!\n" : "OK",
4419                   src - RExC_emit_start,
4420                   dst - RExC_emit_start,
4421                   RExC_offsets[0])); 
4422             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4423             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4424         }
4425     }
4426     
4427
4428     place = opnd;               /* Op node, where operand used to be. */
4429     if (RExC_offsets) {         /* MJD */
4430         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4431               "reginsert",
4432               __LINE__,
4433               reg_name[op],
4434               place - RExC_emit_start > RExC_offsets[0] 
4435               ? "Overwriting end of array!\n" : "OK",
4436               place - RExC_emit_start,
4437               RExC_parse - RExC_start,
4438               RExC_offsets[0])); 
4439         Set_Node_Offset(place, RExC_parse);
4440         Set_Node_Length(place, 1);
4441     }
4442     src = NEXTOPER(place);
4443     FILL_ADVANCE_NODE(place, op);
4444     Zero(src, offset, regnode);
4445 }
4446
4447 /*
4448 - regtail - set the next-pointer at the end of a node chain of p to val.
4449 */
4450 STATIC void
4451 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4452 {
4453     register regnode *scan;
4454     register regnode *temp;
4455
4456     if (SIZE_ONLY)
4457         return;
4458
4459     /* Find last node. */
4460     scan = p;
4461     for (;;) {
4462         temp = regnext(scan);
4463         if (temp == NULL)
4464             break;
4465         scan = temp;
4466     }
4467
4468     if (reg_off_by_arg[OP(scan)]) {
4469         ARG_SET(scan, val - scan);
4470     }
4471     else {
4472         NEXT_OFF(scan) = val - scan;
4473     }
4474 }
4475
4476 /*
4477 - regoptail - regtail on operand of first argument; nop if operandless
4478 */
4479 STATIC void
4480 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4481 {
4482     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4483     if (p == NULL || SIZE_ONLY)
4484         return;
4485     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4486         regtail(pRExC_state, NEXTOPER(p), val);
4487     }
4488     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4489         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4490     }
4491     else
4492         return;
4493 }
4494
4495 /*
4496  - regcurly - a little FSA that accepts {\d+,?\d*}
4497  */
4498 STATIC I32
4499 S_regcurly(pTHX_ register char *s)
4500 {
4501     if (*s++ != '{')
4502         return FALSE;
4503     if (!isDIGIT(*s))
4504         return FALSE;
4505     while (isDIGIT(*s))
4506         s++;
4507     if (*s == ',')
4508         s++;
4509     while (isDIGIT(*s))
4510         s++;
4511     if (*s != '}')
4512         return FALSE;
4513     return TRUE;
4514 }
4515
4516
4517 #ifdef DEBUGGING
4518
4519 STATIC regnode *
4520 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4521 {
4522     register U8 op = EXACT;     /* Arbitrary non-END op. */
4523     register regnode *next;
4524
4525     while (op != END && (!last || node < last)) {
4526         /* While that wasn't END last time... */
4527
4528         NODE_ALIGN(node);
4529         op = OP(node);
4530         if (op == CLOSE)
4531             l--;        
4532         next = regnext(node);
4533         /* Where, what. */
4534         if (OP(node) == OPTIMIZED)
4535             goto after_print;
4536         regprop(sv, node);
4537         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4538                       (int)(2*l + 1), "", SvPVX(sv));
4539         if (next == NULL)               /* Next ptr. */
4540             PerlIO_printf(Perl_debug_log, "(0)");
4541         else
4542             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4543         (void)PerlIO_putc(Perl_debug_log, '\n');
4544       after_print:
4545         if (PL_regkind[(U8)op] == BRANCHJ) {
4546             register regnode *nnode = (OP(next) == LONGJMP
4547                                        ? regnext(next)
4548                                        : next);
4549             if (last && nnode > last)
4550                 nnode = last;
4551             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4552         }
4553         else if (PL_regkind[(U8)op] == BRANCH) {
4554             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4555         }
4556         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4557             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4558                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4559         }
4560         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4561             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4562                              next, sv, l + 1);
4563         }
4564         else if ( op == PLUS || op == STAR) {
4565             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4566         }
4567         else if (op == ANYOF) {
4568             /* arglen 1 + class block */
4569             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4570                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4571             node = NEXTOPER(node);
4572         }
4573         else if (PL_regkind[(U8)op] == EXACT) {
4574             /* Literal string, where present. */
4575             node += NODE_SZ_STR(node) - 1;
4576             node = NEXTOPER(node);
4577         }
4578         else {
4579             node = NEXTOPER(node);
4580             node += regarglen[(U8)op];
4581         }
4582         if (op == CURLYX || op == OPEN)
4583             l++;
4584         else if (op == WHILEM)
4585             l--;
4586     }
4587     return node;
4588 }
4589
4590 #endif  /* DEBUGGING */
4591
4592 /*
4593  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4594  */
4595 void
4596 Perl_regdump(pTHX_ regexp *r)
4597 {
4598 #ifdef DEBUGGING
4599     SV *sv = sv_newmortal();
4600
4601     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4602
4603     /* Header fields of interest. */
4604     if (r->anchored_substr)
4605         PerlIO_printf(Perl_debug_log,
4606                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4607                       PL_colors[0],
4608                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4609                       SvPVX(r->anchored_substr),
4610                       PL_colors[1],
4611                       SvTAIL(r->anchored_substr) ? "$" : "",
4612                       (IV)r->anchored_offset);
4613     else if (r->anchored_utf8)
4614         PerlIO_printf(Perl_debug_log,
4615                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4616                       PL_colors[0],
4617                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4618                       SvPVX(r->anchored_utf8),
4619                       PL_colors[1],
4620                       SvTAIL(r->anchored_utf8) ? "$" : "",
4621                       (IV)r->anchored_offset);
4622     if (r->float_substr)
4623         PerlIO_printf(Perl_debug_log,
4624                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4625                       PL_colors[0],
4626                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4627                       SvPVX(r->float_substr),
4628                       PL_colors[1],
4629                       SvTAIL(r->float_substr) ? "$" : "",
4630                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4631     else if (r->float_utf8)
4632         PerlIO_printf(Perl_debug_log,
4633                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4634                       PL_colors[0],
4635                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4636                       SvPVX(r->float_utf8),
4637                       PL_colors[1],
4638                       SvTAIL(r->float_utf8) ? "$" : "",
4639                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4640     if (r->check_substr || r->check_utf8)
4641         PerlIO_printf(Perl_debug_log,
4642                       r->check_substr == r->float_substr
4643                       && r->check_utf8 == r->float_utf8
4644                       ? "(checking floating" : "(checking anchored");
4645     if (r->reganch & ROPT_NOSCAN)
4646         PerlIO_printf(Perl_debug_log, " noscan");
4647     if (r->reganch & ROPT_CHECK_ALL)
4648         PerlIO_printf(Perl_debug_log, " isall");
4649     if (r->check_substr || r->check_utf8)
4650         PerlIO_printf(Perl_debug_log, ") ");
4651
4652     if (r->regstclass) {
4653         regprop(sv, r->regstclass);
4654         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4655     }
4656     if (r->reganch & ROPT_ANCH) {
4657         PerlIO_printf(Perl_debug_log, "anchored");
4658         if (r->reganch & ROPT_ANCH_BOL)
4659             PerlIO_printf(Perl_debug_log, "(BOL)");
4660         if (r->reganch & ROPT_ANCH_MBOL)
4661             PerlIO_printf(Perl_debug_log, "(MBOL)");
4662         if (r->reganch & ROPT_ANCH_SBOL)
4663             PerlIO_printf(Perl_debug_log, "(SBOL)");
4664         if (r->reganch & ROPT_ANCH_GPOS)
4665             PerlIO_printf(Perl_debug_log, "(GPOS)");
4666         PerlIO_putc(Perl_debug_log, ' ');
4667     }
4668     if (r->reganch & ROPT_GPOS_SEEN)
4669         PerlIO_printf(Perl_debug_log, "GPOS ");
4670     if (r->reganch & ROPT_SKIP)
4671         PerlIO_printf(Perl_debug_log, "plus ");
4672     if (r->reganch & ROPT_IMPLICIT)
4673         PerlIO_printf(Perl_debug_log, "implicit ");
4674     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4675     if (r->reganch & ROPT_EVAL_SEEN)
4676         PerlIO_printf(Perl_debug_log, "with eval ");
4677     PerlIO_printf(Perl_debug_log, "\n");
4678     if (r->offsets) {
4679       U32 i;
4680       U32 len = r->offsets[0];
4681       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4682       for (i = 1; i <= len; i++)
4683         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4684                       (UV)r->offsets[i*2-1], 
4685                       (UV)r->offsets[i*2]);
4686       PerlIO_printf(Perl_debug_log, "\n");
4687     }
4688 #endif  /* DEBUGGING */
4689 }
4690
4691 #ifdef DEBUGGING
4692
4693 STATIC void
4694 S_put_byte(pTHX_ SV *sv, int c)
4695 {
4696     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4697         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4698     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4699         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4700     else
4701         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4702 }
4703
4704 #endif  /* DEBUGGING */
4705
4706 /*
4707 - regprop - printable representation of opcode
4708 */
4709 void
4710 Perl_regprop(pTHX_ SV *sv, regnode *o)
4711 {
4712 #ifdef DEBUGGING
4713     register int k;
4714
4715     sv_setpvn(sv, "", 0);
4716     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4717         /* It would be nice to FAIL() here, but this may be called from
4718            regexec.c, and it would be hard to supply pRExC_state. */
4719         Perl_croak(aTHX_ "Corrupted regexp opcode");
4720     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4721
4722     k = PL_regkind[(U8)OP(o)];
4723
4724     if (k == EXACT) {
4725         SV *dsv = sv_2mortal(newSVpvn("", 0));
4726         /* Using is_utf8_string() is a crude hack but it may
4727          * be the best for now since we have no flag "this EXACTish
4728          * node was UTF-8" --jhi */
4729         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4730         char *s    = do_utf8 ?
4731           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4732                          UNI_DISPLAY_REGEX) :
4733           STRING(o);
4734         int len = do_utf8 ?
4735           strlen(s) :
4736           STR_LEN(o);
4737         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4738                        PL_colors[0],
4739                        len, s,
4740                        PL_colors[1]);
4741     }
4742     else if (k == CURLY) {
4743         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4744             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4745         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4746     }
4747     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4748         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4749     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4750         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4751     else if (k == LOGICAL)
4752         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4753     else if (k == ANYOF) {
4754         int i, rangestart = -1;
4755         U8 flags = ANYOF_FLAGS(o);
4756         const char * const anyofs[] = { /* Should be synchronized with
4757                                          * ANYOF_ #xdefines in regcomp.h */
4758             "\\w",
4759             "\\W",
4760             "\\s",
4761             "\\S",
4762             "\\d",
4763             "\\D",
4764             "[:alnum:]",
4765             "[:^alnum:]",
4766             "[:alpha:]",
4767             "[:^alpha:]",
4768             "[:ascii:]",
4769             "[:^ascii:]",
4770             "[:ctrl:]",
4771             "[:^ctrl:]",
4772             "[:graph:]",
4773             "[:^graph:]",
4774             "[:lower:]",
4775             "[:^lower:]",
4776             "[:print:]",
4777             "[:^print:]",
4778             "[:punct:]",
4779             "[:^punct:]",
4780             "[:upper:]",
4781             "[:^upper:]",
4782             "[:xdigit:]",
4783             "[:^xdigit:]",
4784             "[:space:]",
4785             "[:^space:]",
4786             "[:blank:]",
4787             "[:^blank:]"
4788         };
4789
4790         if (flags & ANYOF_LOCALE)
4791             sv_catpv(sv, "{loc}");
4792         if (flags & ANYOF_FOLD)
4793             sv_catpv(sv, "{i}");
4794         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4795         if (flags & ANYOF_INVERT)
4796             sv_catpv(sv, "^");
4797         for (i = 0; i <= 256; i++) {
4798             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4799                 if (rangestart == -1)
4800                     rangestart = i;
4801             } else if (rangestart != -1) {
4802                 if (i <= rangestart + 3)
4803                     for (; rangestart < i; rangestart++)
4804                         put_byte(sv, rangestart);
4805                 else {
4806                     put_byte(sv, rangestart);
4807                     sv_catpv(sv, "-");
4808                     put_byte(sv, i - 1);
4809                 }
4810                 rangestart = -1;
4811             }
4812         }
4813
4814         if (o->flags & ANYOF_CLASS)
4815             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4816                 if (ANYOF_CLASS_TEST(o,i))
4817                     sv_catpv(sv, anyofs[i]);
4818
4819         if (flags & ANYOF_UNICODE)
4820             sv_catpv(sv, "{unicode}");
4821         else if (flags & ANYOF_UNICODE_ALL)
4822             sv_catpv(sv, "{unicode_all}");
4823
4824         {
4825             SV *lv;
4826             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4827         
4828             if (lv) {
4829                 if (sw) {
4830                     U8 s[UTF8_MAXLEN+1];
4831                 
4832                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4833                         U8 *e = uvchr_to_utf8(s, i);
4834                         
4835                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4836                             if (rangestart == -1)
4837                                 rangestart = i;
4838                         } else if (rangestart != -1) {
4839                             U8 *p;
4840                         
4841                             if (i <= rangestart + 3)
4842                                 for (; rangestart < i; rangestart++) {
4843                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4844                                         put_byte(sv, *p);
4845                                 }
4846                             else {
4847                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4848                                     put_byte(sv, *p);
4849                                 sv_catpv(sv, "-");
4850                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4851                                         put_byte(sv, *p);
4852                                 }
4853                                 rangestart = -1;
4854                             }
4855                         }
4856                         
4857                     sv_catpv(sv, "..."); /* et cetera */
4858                 }
4859
4860                 {
4861                     char *s = savepv(SvPVX(lv));
4862                     char *origs = s;
4863                 
4864                     while(*s && *s != '\n') s++;
4865                 
4866                     if (*s == '\n') {
4867                         char *t = ++s;
4868                         
4869                         while (*s) {
4870                             if (*s == '\n')
4871                                 *s = ' ';
4872                             s++;
4873                         }
4874                         if (s[-1] == ' ')
4875                             s[-1] = 0;
4876                         
4877                         sv_catpv(sv, t);
4878                     }
4879                 
4880                     Safefree(origs);
4881                 }
4882             }
4883         }
4884
4885         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4886     }
4887     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4888         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4889 #endif  /* DEBUGGING */
4890 }
4891
4892 SV *
4893 Perl_re_intuit_string(pTHX_ regexp *prog)
4894 {                               /* Assume that RE_INTUIT is set */
4895     DEBUG_r(
4896         {   STRLEN n_a;
4897             char *s = SvPV(prog->check_substr
4898                       ? prog->check_substr : prog->check_utf8, n_a);
4899
4900             if (!PL_colorset) reginitcolors();
4901             PerlIO_printf(Perl_debug_log,
4902                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4903                       PL_colors[4],
4904                       prog->check_substr ? "" : "utf8 ",
4905                       PL_colors[5],PL_colors[0],
4906                       s,
4907                       PL_colors[1],
4908                       (strlen(s) > 60 ? "..." : ""));
4909         } );
4910
4911     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4912 }
4913
4914 void
4915 Perl_pregfree(pTHX_ struct regexp *r)
4916 {
4917 #ifdef DEBUGGING
4918     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4919 #endif
4920
4921     if (!r || (--r->refcnt > 0))
4922         return;
4923     DEBUG_r({
4924          int len;
4925          char *s;
4926
4927          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4928                 r->prelen, 60, UNI_DISPLAY_REGEX)
4929             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4930          len = SvCUR(dsv);
4931          if (!PL_colorset)
4932               reginitcolors();
4933          PerlIO_printf(Perl_debug_log,
4934                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4935                        PL_colors[4],PL_colors[5],PL_colors[0],
4936                        len, len, s,
4937                        PL_colors[1],
4938                        len > 60 ? "..." : "");
4939     });
4940
4941     if (r->precomp)
4942         Safefree(r->precomp);
4943     if (r->offsets)             /* 20010421 MJD */
4944         Safefree(r->offsets);
4945     if (RX_MATCH_COPIED(r))
4946         Safefree(r->subbeg);
4947     if (r->substrs) {
4948         if (r->anchored_substr)
4949             SvREFCNT_dec(r->anchored_substr);
4950         if (r->anchored_utf8)
4951             SvREFCNT_dec(r->anchored_utf8);
4952         if (r->float_substr)
4953             SvREFCNT_dec(r->float_substr);
4954         if (r->float_utf8)
4955             SvREFCNT_dec(r->float_utf8);
4956         Safefree(r->substrs);
4957     }
4958     if (r->data) {
4959         int n = r->data->count;
4960         PAD* new_comppad = NULL;
4961         PAD* old_comppad;
4962         PADOFFSET refcnt;
4963
4964         while (--n >= 0) {
4965           /* If you add a ->what type here, update the comment in regcomp.h */
4966             switch (r->data->what[n]) {
4967             case 's':
4968                 SvREFCNT_dec((SV*)r->data->data[n]);
4969                 break;
4970             case 'f':
4971                 Safefree(r->data->data[n]);
4972                 break;
4973             case 'p':
4974                 new_comppad = (AV*)r->data->data[n];
4975                 break;
4976             case 'o':
4977                 if (new_comppad == NULL)
4978                     Perl_croak(aTHX_ "panic: pregfree comppad");
4979                 PAD_SAVE_LOCAL(old_comppad,
4980                     /* Watch out for global destruction's random ordering. */
4981                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4982                                 new_comppad : Null(PAD *)
4983                 );
4984                 OP_REFCNT_LOCK;
4985                 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4986                 OP_REFCNT_UNLOCK;
4987                 if (!refcnt)
4988                     op_free((OP_4tree*)r->data->data[n]);
4989
4990                 PAD_RESTORE_LOCAL(old_comppad);
4991                 SvREFCNT_dec((SV*)new_comppad);
4992                 new_comppad = NULL;
4993                 break;
4994             case 'n':
4995                 break;
4996             default:
4997                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4998             }
4999         }
5000         Safefree(r->data->what);
5001         Safefree(r->data);
5002     }
5003     Safefree(r->startp);
5004     Safefree(r->endp);
5005     Safefree(r);
5006 }
5007
5008 /*
5009  - regnext - dig the "next" pointer out of a node
5010  *
5011  * [Note, when REGALIGN is defined there are two places in regmatch()
5012  * that bypass this code for speed.]
5013  */
5014 regnode *
5015 Perl_regnext(pTHX_ register regnode *p)
5016 {
5017     register I32 offset;
5018
5019     if (p == &PL_regdummy)
5020         return(NULL);
5021
5022     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5023     if (offset == 0)
5024         return(NULL);
5025
5026     return(p+offset);
5027 }
5028
5029 STATIC void     
5030 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5031 {
5032     va_list args;
5033     STRLEN l1 = strlen(pat1);
5034     STRLEN l2 = strlen(pat2);
5035     char buf[512];
5036     SV *msv;
5037     char *message;
5038
5039     if (l1 > 510)
5040         l1 = 510;
5041     if (l1 + l2 > 510)
5042         l2 = 510 - l1;
5043     Copy(pat1, buf, l1 , char);
5044     Copy(pat2, buf + l1, l2 , char);
5045     buf[l1 + l2] = '\n';
5046     buf[l1 + l2 + 1] = '\0';
5047 #ifdef I_STDARG
5048     /* ANSI variant takes additional second argument */
5049     va_start(args, pat2);
5050 #else
5051     va_start(args);
5052 #endif
5053     msv = vmess(buf, &args);
5054     va_end(args);
5055     message = SvPV(msv,l1);
5056     if (l1 > 512)
5057         l1 = 512;
5058     Copy(message, buf, l1 , char);
5059     buf[l1-1] = '\0';                   /* Overwrite \n */
5060     Perl_croak(aTHX_ "%s", buf);
5061 }
5062
5063 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5064
5065 void
5066 Perl_save_re_context(pTHX)
5067 {
5068     SAVEI32(PL_reg_flags);              /* from regexec.c */
5069     SAVEPPTR(PL_bostr);
5070     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5071     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5072     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5073     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5074     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5075     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5076     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5077     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5078     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5079     PL_reg_start_tmp = 0;
5080     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5081     PL_reg_start_tmpl = 0;
5082     SAVEVPTR(PL_regdata);
5083     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5084     SAVEI32(PL_regnarrate);             /* from regexec.c */
5085     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5086     SAVEINT(PL_regindent);              /* from regexec.c */
5087     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5088     SAVEVPTR(PL_curcop);
5089     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5090     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5091     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5092     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5093     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5094     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5095     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5096     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5097     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5098     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5099     PL_reg_oldsaved = Nullch;
5100     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5101     PL_reg_oldsavedlen = 0;
5102     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5103     PL_reg_maxiter = 0;
5104     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5105     PL_reg_leftiter = 0;
5106     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5107     PL_reg_poscache = Nullch;
5108     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5109     PL_reg_poscache_size = 0;
5110     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5111     SAVEI32(PL_regnpar);                /* () count. */
5112     SAVEI32(PL_regsize);                /* from regexec.c */
5113
5114     {
5115         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5116         U32 i;
5117         GV *mgv;
5118         REGEXP *rx;
5119         char digits[TYPE_CHARS(long)];
5120
5121         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5122             for (i = 1; i <= rx->nparens; i++) {
5123                 sprintf(digits, "%lu", (long)i);
5124                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5125                     save_scalar(mgv);
5126             }
5127         }
5128     }
5129
5130 #ifdef DEBUGGING
5131     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5132 #endif
5133 }
5134
5135 static void
5136 clear_re(pTHX_ void *r)
5137 {
5138     ReREFCNT_dec((regexp *)r);
5139 }
5140