This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add comment to top of reentr.c and fix typos in other files
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  */
12
13 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
14  * confused with the original package (see point 3 below).  Thanks, Henry!
15  */
16
17 /* Additional note: this code is very heavily munged from Henry's version
18  * in places.  In some spots I've traded clarity for efficiency, so don't
19  * blame Henry for some of the lack of readability.
20  */
21
22 /* The names of the functions have been changed from regcomp and
23  * regexec to  pregcomp and pregexec in order to avoid conflicts
24  * with the POSIX routines of the same names.
25 */
26
27 #ifdef PERL_EXT_RE_BUILD
28 /* need to replace pregcomp et al, so enable that */
29 #  ifndef PERL_IN_XSUB_RE
30 #    define PERL_IN_XSUB_RE
31 #  endif
32 /* need access to debugger hooks */
33 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
34 #    define DEBUGGING
35 #  endif
36 #endif
37
38 #ifdef PERL_IN_XSUB_RE
39 /* We *really* need to overwrite these symbols: */
40 #  define Perl_pregcomp my_regcomp
41 #  define Perl_regdump my_regdump
42 #  define Perl_regprop my_regprop
43 #  define Perl_pregfree my_regfree
44 #  define Perl_re_intuit_string my_re_intuit_string
45 /* *These* symbols are masked to allow static link. */
46 #  define Perl_regnext my_regnext
47 #  define Perl_save_re_context my_save_re_context
48 #  define Perl_reginitcolors my_reginitcolors
49
50 #  define PERL_NO_GET_CONTEXT
51 #endif
52
53 /*SUPPRESS 112*/
54 /*
55  * pregcomp and pregexec -- regsub and regerror are not used in perl
56  *
57  *      Copyright (c) 1986 by University of Toronto.
58  *      Written by Henry Spencer.  Not derived from licensed software.
59  *
60  *      Permission is granted to anyone to use this software for any
61  *      purpose on any computer system, and to redistribute it freely,
62  *      subject to the following restrictions:
63  *
64  *      1. The author is not responsible for the consequences of use of
65  *              this software, no matter how awful, even if they arise
66  *              from defects in it.
67  *
68  *      2. The origin of this software must not be misrepresented, either
69  *              by explicit claim or by omission.
70  *
71  *      3. Altered versions must be plainly marked as such, and must not
72  *              be misrepresented as being the original software.
73  *
74  *
75  ****    Alterations to Henry's code are...
76  ****
77  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
78  ****    2000, 2001, 2002, 2003, by Larry Wall and others
79  ****
80  ****    You may distribute under the terms of either the GNU General Public
81  ****    License or the Artistic License, as specified in the README file.
82
83  *
84  * Beware that some of this code is subtly aware of the way operator
85  * precedence is structured in regular expressions.  Serious changes in
86  * regular-expression syntax might require a total rethink.
87  */
88 #include "EXTERN.h"
89 #define PERL_IN_REGCOMP_C
90 #include "perl.h"
91
92 #ifndef PERL_IN_XSUB_RE
93 #  include "INTERN.h"
94 #endif
95
96 #define REG_COMP_C
97 #include "regcomp.h"
98
99 #ifdef op
100 #undef op
101 #endif /* op */
102
103 #ifdef MSDOS
104 #  if defined(BUGGY_MSC6)
105  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
106 #    pragma optimize("a",off)
107  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
108 #    pragma optimize("w",on )
109 #  endif /* BUGGY_MSC6 */
110 #endif /* MSDOS */
111
112 #ifndef STATIC
113 #define STATIC  static
114 #endif
115
116 typedef struct RExC_state_t {
117     U32         flags;                  /* are we folding, multilining? */
118     char        *precomp;               /* uncompiled string. */
119     regexp      *rx;
120     char        *start;                 /* Start of input for compile */
121     char        *end;                   /* End of input for compile */
122     char        *parse;                 /* Input-scan pointer. */
123     I32         whilem_seen;            /* number of WHILEM in this expr */
124     regnode     *emit_start;            /* Start of emitted-code area */
125     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
126     I32         naughty;                /* How bad is this pattern? */
127     I32         sawback;                /* Did we see \1, ...? */
128     U32         seen;
129     I32         size;                   /* Code size. */
130     I32         npar;                   /* () count. */
131     I32         extralen;
132     I32         seen_zerolen;
133     I32         seen_evals;
134     I32         utf8;
135 #if ADD_TO_REGEXEC
136     char        *starttry;              /* -Dr: where regtry was called. */
137 #define RExC_starttry   (pRExC_state->starttry)
138 #endif
139 } RExC_state_t;
140
141 #define RExC_flags      (pRExC_state->flags)
142 #define RExC_precomp    (pRExC_state->precomp)
143 #define RExC_rx         (pRExC_state->rx)
144 #define RExC_start      (pRExC_state->start)
145 #define RExC_end        (pRExC_state->end)
146 #define RExC_parse      (pRExC_state->parse)
147 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
148 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
149 #define RExC_emit       (pRExC_state->emit)
150 #define RExC_emit_start (pRExC_state->emit_start)
151 #define RExC_naughty    (pRExC_state->naughty)
152 #define RExC_sawback    (pRExC_state->sawback)
153 #define RExC_seen       (pRExC_state->seen)
154 #define RExC_size       (pRExC_state->size)
155 #define RExC_npar       (pRExC_state->npar)
156 #define RExC_extralen   (pRExC_state->extralen)
157 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
158 #define RExC_seen_evals (pRExC_state->seen_evals)
159 #define RExC_utf8       (pRExC_state->utf8)
160
161 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
162 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
163         ((*s) == '{' && regcurly(s)))
164
165 #ifdef SPSTART
166 #undef SPSTART          /* dratted cpp namespace... */
167 #endif
168 /*
169  * Flags to be passed up and down.
170  */
171 #define WORST           0       /* Worst case. */
172 #define HASWIDTH        0x1     /* Known to match non-null strings. */
173 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
174 #define SPSTART         0x4     /* Starts with * or +. */
175 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
176
177 /* Length of a variant. */
178
179 typedef struct scan_data_t {
180     I32 len_min;
181     I32 len_delta;
182     I32 pos_min;
183     I32 pos_delta;
184     SV *last_found;
185     I32 last_end;                       /* min value, <0 unless valid. */
186     I32 last_start_min;
187     I32 last_start_max;
188     SV **longest;                       /* Either &l_fixed, or &l_float. */
189     SV *longest_fixed;
190     I32 offset_fixed;
191     SV *longest_float;
192     I32 offset_float_min;
193     I32 offset_float_max;
194     I32 flags;
195     I32 whilem_c;
196     I32 *last_closep;
197     struct regnode_charclass_class *start_class;
198 } scan_data_t;
199
200 /*
201  * Forward declarations for pregcomp()'s friends.
202  */
203
204 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205                                       0, 0, 0, 0, 0, 0};
206
207 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
208 #define SF_BEFORE_SEOL          0x1
209 #define SF_BEFORE_MEOL          0x2
210 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
212
213 #ifdef NO_UNARY_PLUS
214 #  define SF_FIX_SHIFT_EOL      (0+2)
215 #  define SF_FL_SHIFT_EOL               (0+4)
216 #else
217 #  define SF_FIX_SHIFT_EOL      (+2)
218 #  define SF_FL_SHIFT_EOL               (+4)
219 #endif
220
221 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
223
224 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
226 #define SF_IS_INF               0x40
227 #define SF_HAS_PAR              0x80
228 #define SF_IN_PAR               0x100
229 #define SF_HAS_EVAL             0x200
230 #define SCF_DO_SUBSTR           0x400
231 #define SCF_DO_STCLASS_AND      0x0800
232 #define SCF_DO_STCLASS_OR       0x1000
233 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
234 #define SCF_WHILEM_VISITED_POS  0x2000
235
236 #define UTF (RExC_utf8 != 0)
237 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
238 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
239
240 #define OOB_UNICODE             12345678
241 #define OOB_NAMEDCLASS          -1
242
243 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
244 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
245
246
247 /* length of regex to show in messages that don't mark a position within */
248 #define RegexLengthToShowInErrorMessages 127
249
250 /*
251  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
252  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
253  * op/pragma/warn/regcomp.
254  */
255 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
256 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
257
258 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
259
260 /*
261  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
262  * arg. Show regex, up to a maximum length. If it's too long, chop and add
263  * "...".
264  */
265 #define FAIL(msg) STMT_START {                                          \
266     char *ellipses = "";                                                \
267     IV len = RExC_end - RExC_precomp;                                   \
268                                                                         \
269     if (!SIZE_ONLY)                                                     \
270         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
271     if (len > RegexLengthToShowInErrorMessages) {                       \
272         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
273         len = RegexLengthToShowInErrorMessages - 10;                    \
274         ellipses = "...";                                               \
275     }                                                                   \
276     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
277             msg, (int)len, RExC_precomp, ellipses);                     \
278 } STMT_END
279
280 /*
281  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282  * args. Show regex, up to a maximum length. If it's too long, chop and add
283  * "...".
284  */
285 #define FAIL2(pat,msg) STMT_START {                                     \
286     char *ellipses = "";                                                \
287     IV len = RExC_end - RExC_precomp;                                   \
288                                                                         \
289     if (!SIZE_ONLY)                                                     \
290         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
291     if (len > RegexLengthToShowInErrorMessages) {                       \
292         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
293         len = RegexLengthToShowInErrorMessages - 10;                    \
294         ellipses = "...";                                               \
295     }                                                                   \
296     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
297             msg, (int)len, RExC_precomp, ellipses);                     \
298 } STMT_END
299
300
301 /*
302  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
303  */
304 #define Simple_vFAIL(m) STMT_START {                                    \
305     IV offset = RExC_parse - RExC_precomp;                              \
306     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
307             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
308 } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m) STMT_START {                           \
314     if (!SIZE_ONLY)                                     \
315         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
316     Simple_vFAIL(m);                                    \
317 } STMT_END
318
319 /*
320  * Like Simple_vFAIL(), but accepts two arguments.
321  */
322 #define Simple_vFAIL2(m,a1) STMT_START {                        \
323     IV offset = RExC_parse - RExC_precomp;                      \
324     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
325             (int)offset, RExC_precomp, RExC_precomp + offset);  \
326 } STMT_END
327
328 /*
329  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
330  */
331 #define vFAIL2(m,a1) STMT_START {                       \
332     if (!SIZE_ONLY)                                     \
333         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
334     Simple_vFAIL2(m, a1);                               \
335 } STMT_END
336
337
338 /*
339  * Like Simple_vFAIL(), but accepts three arguments.
340  */
341 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
342     IV offset = RExC_parse - RExC_precomp;                      \
343     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
344             (int)offset, RExC_precomp, RExC_precomp + offset);  \
345 } STMT_END
346
347 /*
348  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
349  */
350 #define vFAIL3(m,a1,a2) STMT_START {                    \
351     if (!SIZE_ONLY)                                     \
352         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
353     Simple_vFAIL3(m, a1, a2);                           \
354 } STMT_END
355
356 /*
357  * Like Simple_vFAIL(), but accepts four arguments.
358  */
359 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
360     IV offset = RExC_parse - RExC_precomp;                      \
361     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
362             (int)offset, RExC_precomp, RExC_precomp + offset);  \
363 } STMT_END
364
365 /*
366  * Like Simple_vFAIL(), but accepts five arguments.
367  */
368 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
369     IV offset = RExC_parse - RExC_precomp;                      \
370     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
371             (int)offset, RExC_precomp, RExC_precomp + offset);  \
372 } STMT_END
373
374
375 #define vWARN(loc,m) STMT_START {                                       \
376     IV offset = loc - RExC_precomp;                                     \
377     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
378             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
379 } STMT_END
380
381 #define vWARNdep(loc,m) STMT_START {                                    \
382     IV offset = loc - RExC_precomp;                                     \
383     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
384             "%s" REPORT_LOCATION,                                       \
385             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
386 } STMT_END
387
388
389 #define vWARN2(loc, m, a1) STMT_START {                                 \
390     IV offset = loc - RExC_precomp;                                     \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
393 } STMT_END
394
395 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
396     IV offset = loc - RExC_precomp;                                     \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
399 } STMT_END
400
401 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
402     IV offset = loc - RExC_precomp;                                     \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
408     IV offset = loc - RExC_precomp;                                     \
409     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
410             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
411 } STMT_END
412
413
414 /* Allow for side effects in s */
415 #define REGC(c,s) STMT_START {                  \
416     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
417 } STMT_END
418
419 /* Macros for recording node offsets.   20001227 mjd@plover.com 
420  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
421  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
422  * Element 0 holds the number n.
423  */
424
425 #define MJD_OFFSET_DEBUG(x)
426 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
427
428
429 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
430     if (! SIZE_ONLY) {                                                  \
431         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
432                 __LINE__, (node), (byte)));                             \
433         if((node) < 0) {                                                \
434             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
435         } else {                                                        \
436             RExC_offsets[2*(node)-1] = (byte);                          \
437         }                                                               \
438     }                                                                   \
439 } STMT_END
440
441 #define Set_Node_Offset(node,byte) \
442     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
443 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
444
445 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
446     if (! SIZE_ONLY) {                                                  \
447         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
448                 __LINE__, (node), (len)));                              \
449         if((node) < 0) {                                                \
450             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
451         } else {                                                        \
452             RExC_offsets[2*(node)] = (len);                             \
453         }                                                               \
454     }                                                                   \
455 } STMT_END
456
457 #define Set_Node_Length(node,len) \
458     Set_Node_Length_To_R((node)-RExC_emit_start, len)
459 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
460 #define Set_Node_Cur_Length(node) \
461     Set_Node_Length(node, RExC_parse - parse_start)
462
463 /* Get offsets and lengths */
464 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
465 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
466
467 static void clear_re(pTHX_ void *r);
468
469 /* Mark that we cannot extend a found fixed substring at this point.
470    Updata the longest found anchored substring and the longest found
471    floating substrings if needed. */
472
473 STATIC void
474 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
475 {
476     STRLEN l = CHR_SVLEN(data->last_found);
477     STRLEN old_l = CHR_SVLEN(*data->longest);
478
479     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
480         SvSetMagicSV(*data->longest, data->last_found);
481         if (*data->longest == data->longest_fixed) {
482             data->offset_fixed = l ? data->last_start_min : data->pos_min;
483             if (data->flags & SF_BEFORE_EOL)
484                 data->flags
485                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
486             else
487                 data->flags &= ~SF_FIX_BEFORE_EOL;
488         }
489         else {
490             data->offset_float_min = l ? data->last_start_min : data->pos_min;
491             data->offset_float_max = (l
492                                       ? data->last_start_max
493                                       : data->pos_min + data->pos_delta);
494             if ((U32)data->offset_float_max > (U32)I32_MAX)
495                 data->offset_float_max = I32_MAX;
496             if (data->flags & SF_BEFORE_EOL)
497                 data->flags
498                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
499             else
500                 data->flags &= ~SF_FL_BEFORE_EOL;
501         }
502     }
503     SvCUR_set(data->last_found, 0);
504     {
505         SV * sv = data->last_found;
506         MAGIC *mg =
507             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
508         if (mg && mg->mg_len > 0)
509             mg->mg_len = 0;
510     }
511     data->last_end = -1;
512     data->flags &= ~SF_BEFORE_EOL;
513 }
514
515 /* Can match anything (initialization) */
516 STATIC void
517 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
518 {
519     ANYOF_CLASS_ZERO(cl);
520     ANYOF_BITMAP_SETALL(cl);
521     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
522     if (LOC)
523         cl->flags |= ANYOF_LOCALE;
524 }
525
526 /* Can match anything (initialization) */
527 STATIC int
528 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
529 {
530     int value;
531
532     for (value = 0; value <= ANYOF_MAX; value += 2)
533         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
534             return 1;
535     if (!(cl->flags & ANYOF_UNICODE_ALL))
536         return 0;
537     if (!ANYOF_BITMAP_TESTALLSET(cl))
538         return 0;
539     return 1;
540 }
541
542 /* Can match anything (initialization) */
543 STATIC void
544 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
545 {
546     Zero(cl, 1, struct regnode_charclass_class);
547     cl->type = ANYOF;
548     cl_anything(pRExC_state, cl);
549 }
550
551 STATIC void
552 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
553 {
554     Zero(cl, 1, struct regnode_charclass_class);
555     cl->type = ANYOF;
556     cl_anything(pRExC_state, cl);
557     if (LOC)
558         cl->flags |= ANYOF_LOCALE;
559 }
560
561 /* 'And' a given class with another one.  Can create false positives */
562 /* We assume that cl is not inverted */
563 STATIC void
564 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
565          struct regnode_charclass_class *and_with)
566 {
567     if (!(and_with->flags & ANYOF_CLASS)
568         && !(cl->flags & ANYOF_CLASS)
569         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
570         && !(and_with->flags & ANYOF_FOLD)
571         && !(cl->flags & ANYOF_FOLD)) {
572         int i;
573
574         if (and_with->flags & ANYOF_INVERT)
575             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576                 cl->bitmap[i] &= ~and_with->bitmap[i];
577         else
578             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
579                 cl->bitmap[i] &= and_with->bitmap[i];
580     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
581     if (!(and_with->flags & ANYOF_EOS))
582         cl->flags &= ~ANYOF_EOS;
583
584     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
585         !(and_with->flags & ANYOF_INVERT)) {
586         cl->flags &= ~ANYOF_UNICODE_ALL;
587         cl->flags |= ANYOF_UNICODE;
588         ARG_SET(cl, ARG(and_with));
589     }
590     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
591         !(and_with->flags & ANYOF_INVERT))
592         cl->flags &= ~ANYOF_UNICODE_ALL;
593     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
594         !(and_with->flags & ANYOF_INVERT))
595         cl->flags &= ~ANYOF_UNICODE;
596 }
597
598 /* 'OR' a given class with another one.  Can create false positives */
599 /* We assume that cl is not inverted */
600 STATIC void
601 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
602 {
603     if (or_with->flags & ANYOF_INVERT) {
604         /* We do not use
605          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
606          *   <= (B1 | !B2) | (CL1 | !CL2)
607          * which is wasteful if CL2 is small, but we ignore CL2:
608          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
609          * XXXX Can we handle case-fold?  Unclear:
610          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
611          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
612          */
613         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
614              && !(or_with->flags & ANYOF_FOLD)
615              && !(cl->flags & ANYOF_FOLD) ) {
616             int i;
617
618             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
619                 cl->bitmap[i] |= ~or_with->bitmap[i];
620         } /* XXXX: logic is complicated otherwise */
621         else {
622             cl_anything(pRExC_state, cl);
623         }
624     } else {
625         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
626         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
627              && (!(or_with->flags & ANYOF_FOLD)
628                  || (cl->flags & ANYOF_FOLD)) ) {
629             int i;
630
631             /* OR char bitmap and class bitmap separately */
632             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
633                 cl->bitmap[i] |= or_with->bitmap[i];
634             if (or_with->flags & ANYOF_CLASS) {
635                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
636                     cl->classflags[i] |= or_with->classflags[i];
637                 cl->flags |= ANYOF_CLASS;
638             }
639         }
640         else { /* XXXX: logic is complicated, leave it along for a moment. */
641             cl_anything(pRExC_state, cl);
642         }
643     }
644     if (or_with->flags & ANYOF_EOS)
645         cl->flags |= ANYOF_EOS;
646
647     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
648         ARG(cl) != ARG(or_with)) {
649         cl->flags |= ANYOF_UNICODE_ALL;
650         cl->flags &= ~ANYOF_UNICODE;
651     }
652     if (or_with->flags & ANYOF_UNICODE_ALL) {
653         cl->flags |= ANYOF_UNICODE_ALL;
654         cl->flags &= ~ANYOF_UNICODE;
655     }
656 }
657
658 /*
659  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
660  * These need to be revisited when a newer toolchain becomes available.
661  */
662 #if defined(__sparc64__) && defined(__GNUC__)
663 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
664 #       undef  SPARC64_GCC_WORKAROUND
665 #       define SPARC64_GCC_WORKAROUND 1
666 #   endif
667 #endif
668
669 /* REx optimizer.  Converts nodes into quickier variants "in place".
670    Finds fixed substrings.  */
671
672 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673    to the position after last scanned or to NULL. */
674
675 STATIC I32
676 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
677                         /* scanp: Start here (read-write). */
678                         /* deltap: Write maxlen-minlen here. */
679                         /* last: Stop before this one. */
680 {
681     I32 min = 0, pars = 0, code;
682     regnode *scan = *scanp, *next;
683     I32 delta = 0;
684     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
685     int is_inf_internal = 0;            /* The studied chunk is infinite */
686     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687     scan_data_t data_fake;
688     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
689
690     while (scan && OP(scan) != END && scan < last) {
691         /* Peephole optimizer: */
692
693         if (PL_regkind[(U8)OP(scan)] == EXACT) {
694             /* Merge several consecutive EXACTish nodes into one. */
695             regnode *n = regnext(scan);
696             U32 stringok = 1;
697 #ifdef DEBUGGING
698             regnode *stop = scan;
699 #endif
700
701             next = scan + NODE_SZ_STR(scan);
702             /* Skip NOTHING, merge EXACT*. */
703             while (n &&
704                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
705                      (stringok && (OP(n) == OP(scan))))
706                    && NEXT_OFF(n)
707                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708                 if (OP(n) == TAIL || n > next)
709                     stringok = 0;
710                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
711                     NEXT_OFF(scan) += NEXT_OFF(n);
712                     next = n + NODE_STEP_REGNODE;
713 #ifdef DEBUGGING
714                     if (stringok)
715                         stop = n;
716 #endif
717                     n = regnext(n);
718                 }
719                 else if (stringok) {
720                     int oldl = STR_LEN(scan);
721                     regnode *nnext = regnext(n);
722
723                     if (oldl + STR_LEN(n) > U8_MAX)
724                         break;
725                     NEXT_OFF(scan) += NEXT_OFF(n);
726                     STR_LEN(scan) += STR_LEN(n);
727                     next = n + NODE_SZ_STR(n);
728                     /* Now we can overwrite *n : */
729                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
730 #ifdef DEBUGGING
731                     stop = next - 1;
732 #endif
733                     n = nnext;
734                 }
735             }
736
737             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
738 /*
739   Two problematic code points in Unicode casefolding of EXACT nodes:
740
741    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
742    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
743
744    which casefold to
745
746    Unicode                      UTF-8
747
748    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
749    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
750
751    This means that in case-insensitive matching (or "loose matching",
752    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
753    length of the above casefolded versions) can match a target string
754    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
755    This would rather mess up the minimum length computation.
756
757    What we'll do is to look for the tail four bytes, and then peek
758    at the preceding two bytes to see whether we need to decrease
759    the minimum length by four (six minus two).
760
761    Thanks to the design of UTF-8, there cannot be false matches:
762    A sequence of valid UTF-8 bytes cannot be a subsequence of
763    another valid sequence of UTF-8 bytes.
764
765 */
766                  char *s0 = STRING(scan), *s, *t;
767                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
768                  char *t0 = "\xcc\x88\xcc\x81";
769                  char *t1 = t0 + 3;
770                  
771                  for (s = s0 + 2;
772                       s < s2 && (t = ninstr(s, s1, t0, t1));
773                       s = t + 4) {
774                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
775                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
776                            min -= 4;
777                  }
778             }
779
780 #ifdef DEBUGGING
781             /* Allow dumping */
782             n = scan + NODE_SZ_STR(scan);
783             while (n <= stop) {
784                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
785                     OP(n) = OPTIMIZED;
786                     NEXT_OFF(n) = 0;
787                 }
788                 n++;
789             }
790 #endif
791         }
792         /* Follow the next-chain of the current node and optimize
793            away all the NOTHINGs from it.  */
794         if (OP(scan) != CURLYX) {
795             int max = (reg_off_by_arg[OP(scan)]
796                        ? I32_MAX
797                        /* I32 may be smaller than U16 on CRAYs! */
798                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
799             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
800             int noff;
801             regnode *n = scan;
802         
803             /* Skip NOTHING and LONGJMP. */
804             while ((n = regnext(n))
805                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
806                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
807                    && off + noff < max)
808                 off += noff;
809             if (reg_off_by_arg[OP(scan)])
810                 ARG(scan) = off;
811             else
812                 NEXT_OFF(scan) = off;
813         }
814         /* The principal pseudo-switch.  Cannot be a switch, since we
815            look into several different things.  */
816         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
817                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
818             next = regnext(scan);
819             code = OP(scan);
820         
821             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
822                 I32 max1 = 0, min1 = I32_MAX, num = 0;
823                 struct regnode_charclass_class accum;
824                 
825                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
826                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
827                 if (flags & SCF_DO_STCLASS)
828                     cl_init_zero(pRExC_state, &accum);
829                 while (OP(scan) == code) {
830                     I32 deltanext, minnext, f = 0, fake;
831                     struct regnode_charclass_class this_class;
832
833                     num++;
834                     data_fake.flags = 0;
835                     if (data) {         
836                         data_fake.whilem_c = data->whilem_c;
837                         data_fake.last_closep = data->last_closep;
838                     }
839                     else
840                         data_fake.last_closep = &fake;
841                     next = regnext(scan);
842                     scan = NEXTOPER(scan);
843                     if (code != BRANCH)
844                         scan = NEXTOPER(scan);
845                     if (flags & SCF_DO_STCLASS) {
846                         cl_init(pRExC_state, &this_class);
847                         data_fake.start_class = &this_class;
848                         f = SCF_DO_STCLASS_AND;
849                     }           
850                     if (flags & SCF_WHILEM_VISITED_POS)
851                         f |= SCF_WHILEM_VISITED_POS;
852                     /* we suppose the run is continuous, last=next...*/
853                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
854                                           next, &data_fake, f);
855                     if (min1 > minnext)
856                         min1 = minnext;
857                     if (max1 < minnext + deltanext)
858                         max1 = minnext + deltanext;
859                     if (deltanext == I32_MAX)
860                         is_inf = is_inf_internal = 1;
861                     scan = next;
862                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
863                         pars++;
864                     if (data && (data_fake.flags & SF_HAS_EVAL))
865                         data->flags |= SF_HAS_EVAL;
866                     if (data)
867                         data->whilem_c = data_fake.whilem_c;
868                     if (flags & SCF_DO_STCLASS)
869                         cl_or(pRExC_state, &accum, &this_class);
870                     if (code == SUSPEND)
871                         break;
872                 }
873                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
874                     min1 = 0;
875                 if (flags & SCF_DO_SUBSTR) {
876                     data->pos_min += min1;
877                     data->pos_delta += max1 - min1;
878                     if (max1 != min1 || is_inf)
879                         data->longest = &(data->longest_float);
880                 }
881                 min += min1;
882                 delta += max1 - min1;
883                 if (flags & SCF_DO_STCLASS_OR) {
884                     cl_or(pRExC_state, data->start_class, &accum);
885                     if (min1) {
886                         cl_and(data->start_class, &and_with);
887                         flags &= ~SCF_DO_STCLASS;
888                     }
889                 }
890                 else if (flags & SCF_DO_STCLASS_AND) {
891                     if (min1) {
892                         cl_and(data->start_class, &accum);
893                         flags &= ~SCF_DO_STCLASS;
894                     }
895                     else {
896                         /* Switch to OR mode: cache the old value of
897                          * data->start_class */
898                         StructCopy(data->start_class, &and_with,
899                                    struct regnode_charclass_class);
900                         flags &= ~SCF_DO_STCLASS_AND;
901                         StructCopy(&accum, data->start_class,
902                                    struct regnode_charclass_class);
903                         flags |= SCF_DO_STCLASS_OR;
904                         data->start_class->flags |= ANYOF_EOS;
905                     }
906                 }
907             }
908             else if (code == BRANCHJ)   /* single branch is optimized. */
909                 scan = NEXTOPER(NEXTOPER(scan));
910             else                        /* single branch is optimized. */
911                 scan = NEXTOPER(scan);
912             continue;
913         }
914         else if (OP(scan) == EXACT) {
915             I32 l = STR_LEN(scan);
916             UV uc = *((U8*)STRING(scan));
917             if (UTF) {
918                 U8 *s = (U8*)STRING(scan);
919                 l = utf8_length(s, s + l);
920                 uc = utf8_to_uvchr(s, NULL);
921             }
922             min += l;
923             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
924                 /* The code below prefers earlier match for fixed
925                    offset, later match for variable offset.  */
926                 if (data->last_end == -1) { /* Update the start info. */
927                     data->last_start_min = data->pos_min;
928                     data->last_start_max = is_inf
929                         ? I32_MAX : data->pos_min + data->pos_delta;
930                 }
931                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
932                 {
933                     SV * sv = data->last_found;
934                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
935                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
936                     if (mg && mg->mg_len >= 0)
937                         mg->mg_len += utf8_length((U8*)STRING(scan),
938                                                   (U8*)STRING(scan)+STR_LEN(scan));
939                 }
940                 if (UTF)
941                     SvUTF8_on(data->last_found);
942                 data->last_end = data->pos_min + l;
943                 data->pos_min += l; /* As in the first entry. */
944                 data->flags &= ~SF_BEFORE_EOL;
945             }
946             if (flags & SCF_DO_STCLASS_AND) {
947                 /* Check whether it is compatible with what we know already! */
948                 int compat = 1;
949
950                 if (uc >= 0x100 ||
951                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
952                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
953                     && (!(data->start_class->flags & ANYOF_FOLD)
954                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
955                     )
956                     compat = 0;
957                 ANYOF_CLASS_ZERO(data->start_class);
958                 ANYOF_BITMAP_ZERO(data->start_class);
959                 if (compat)
960                     ANYOF_BITMAP_SET(data->start_class, uc);
961                 data->start_class->flags &= ~ANYOF_EOS;
962                 if (uc < 0x100)
963                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
964             }
965             else if (flags & SCF_DO_STCLASS_OR) {
966                 /* false positive possible if the class is case-folded */
967                 if (uc < 0x100)
968                     ANYOF_BITMAP_SET(data->start_class, uc);
969                 else
970                     data->start_class->flags |= ANYOF_UNICODE_ALL;
971                 data->start_class->flags &= ~ANYOF_EOS;
972                 cl_and(data->start_class, &and_with);
973             }
974             flags &= ~SCF_DO_STCLASS;
975         }
976         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
977             I32 l = STR_LEN(scan);
978             UV uc = *((U8*)STRING(scan));
979
980             /* Search for fixed substrings supports EXACT only. */
981             if (flags & SCF_DO_SUBSTR)
982                 scan_commit(pRExC_state, data);
983             if (UTF) {
984                 U8 *s = (U8 *)STRING(scan);
985                 l = utf8_length(s, s + l);
986                 uc = utf8_to_uvchr(s, NULL);
987             }
988             min += l;
989             if (data && (flags & SCF_DO_SUBSTR))
990                 data->pos_min += l;
991             if (flags & SCF_DO_STCLASS_AND) {
992                 /* Check whether it is compatible with what we know already! */
993                 int compat = 1;
994
995                 if (uc >= 0x100 ||
996                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
997                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
998                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
999                     compat = 0;
1000                 ANYOF_CLASS_ZERO(data->start_class);
1001                 ANYOF_BITMAP_ZERO(data->start_class);
1002                 if (compat) {
1003                     ANYOF_BITMAP_SET(data->start_class, uc);
1004                     data->start_class->flags &= ~ANYOF_EOS;
1005                     data->start_class->flags |= ANYOF_FOLD;
1006                     if (OP(scan) == EXACTFL)
1007                         data->start_class->flags |= ANYOF_LOCALE;
1008                 }
1009             }
1010             else if (flags & SCF_DO_STCLASS_OR) {
1011                 if (data->start_class->flags & ANYOF_FOLD) {
1012                     /* false positive possible if the class is case-folded.
1013                        Assume that the locale settings are the same... */
1014                     if (uc < 0x100)
1015                         ANYOF_BITMAP_SET(data->start_class, uc);
1016                     data->start_class->flags &= ~ANYOF_EOS;
1017                 }
1018                 cl_and(data->start_class, &and_with);
1019             }
1020             flags &= ~SCF_DO_STCLASS;
1021         }
1022         else if (strchr((char*)PL_varies,OP(scan))) {
1023             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1024             I32 f = flags, pos_before = 0;
1025             regnode *oscan = scan;
1026             struct regnode_charclass_class this_class;
1027             struct regnode_charclass_class *oclass = NULL;
1028             I32 next_is_eval = 0;
1029
1030             switch (PL_regkind[(U8)OP(scan)]) {
1031             case WHILEM:                /* End of (?:...)* . */
1032                 scan = NEXTOPER(scan);
1033                 goto finish;
1034             case PLUS:
1035                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1036                     next = NEXTOPER(scan);
1037                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1038                         mincount = 1;
1039                         maxcount = REG_INFTY;
1040                         next = regnext(scan);
1041                         scan = NEXTOPER(scan);
1042                         goto do_curly;
1043                     }
1044                 }
1045                 if (flags & SCF_DO_SUBSTR)
1046                     data->pos_min++;
1047                 min++;
1048                 /* Fall through. */
1049             case STAR:
1050                 if (flags & SCF_DO_STCLASS) {
1051                     mincount = 0;
1052                     maxcount = REG_INFTY;
1053                     next = regnext(scan);
1054                     scan = NEXTOPER(scan);
1055                     goto do_curly;
1056                 }
1057                 is_inf = is_inf_internal = 1;
1058                 scan = regnext(scan);
1059                 if (flags & SCF_DO_SUBSTR) {
1060                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1061                     data->longest = &(data->longest_float);
1062                 }
1063                 goto optimize_curly_tail;
1064             case CURLY:
1065                 mincount = ARG1(scan);
1066                 maxcount = ARG2(scan);
1067                 next = regnext(scan);
1068                 if (OP(scan) == CURLYX) {
1069                     I32 lp = (data ? *(data->last_closep) : 0);
1070
1071                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1072                 }
1073                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1074                 next_is_eval = (OP(scan) == EVAL);
1075               do_curly:
1076                 if (flags & SCF_DO_SUBSTR) {
1077                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1078                     pos_before = data->pos_min;
1079                 }
1080                 if (data) {
1081                     fl = data->flags;
1082                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1083                     if (is_inf)
1084                         data->flags |= SF_IS_INF;
1085                 }
1086                 if (flags & SCF_DO_STCLASS) {
1087                     cl_init(pRExC_state, &this_class);
1088                     oclass = data->start_class;
1089                     data->start_class = &this_class;
1090                     f |= SCF_DO_STCLASS_AND;
1091                     f &= ~SCF_DO_STCLASS_OR;
1092                 }
1093                 /* These are the cases when once a subexpression
1094                    fails at a particular position, it cannot succeed
1095                    even after backtracking at the enclosing scope.
1096                 
1097                    XXXX what if minimal match and we are at the
1098                         initial run of {n,m}? */
1099                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1100                     f &= ~SCF_WHILEM_VISITED_POS;
1101
1102                 /* This will finish on WHILEM, setting scan, or on NULL: */
1103                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1104                                       mincount == 0
1105                                         ? (f & ~SCF_DO_SUBSTR) : f);
1106
1107                 if (flags & SCF_DO_STCLASS)
1108                     data->start_class = oclass;
1109                 if (mincount == 0 || minnext == 0) {
1110                     if (flags & SCF_DO_STCLASS_OR) {
1111                         cl_or(pRExC_state, data->start_class, &this_class);
1112                     }
1113                     else if (flags & SCF_DO_STCLASS_AND) {
1114                         /* Switch to OR mode: cache the old value of
1115                          * data->start_class */
1116                         StructCopy(data->start_class, &and_with,
1117                                    struct regnode_charclass_class);
1118                         flags &= ~SCF_DO_STCLASS_AND;
1119                         StructCopy(&this_class, data->start_class,
1120                                    struct regnode_charclass_class);
1121                         flags |= SCF_DO_STCLASS_OR;
1122                         data->start_class->flags |= ANYOF_EOS;
1123                     }
1124                 } else {                /* Non-zero len */
1125                     if (flags & SCF_DO_STCLASS_OR) {
1126                         cl_or(pRExC_state, data->start_class, &this_class);
1127                         cl_and(data->start_class, &and_with);
1128                     }
1129                     else if (flags & SCF_DO_STCLASS_AND)
1130                         cl_and(data->start_class, &this_class);
1131                     flags &= ~SCF_DO_STCLASS;
1132                 }
1133                 if (!scan)              /* It was not CURLYX, but CURLY. */
1134                     scan = next;
1135                 if (ckWARN(WARN_REGEXP)
1136                        /* ? quantifier ok, except for (?{ ... }) */
1137                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1138                     && (minnext == 0) && (deltanext == 0)
1139                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1140                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1141                 {
1142                     vWARN(RExC_parse,
1143                           "Quantifier unexpected on zero-length expression");
1144                 }
1145
1146                 min += minnext * mincount;
1147                 is_inf_internal |= ((maxcount == REG_INFTY
1148                                      && (minnext + deltanext) > 0)
1149                                     || deltanext == I32_MAX);
1150                 is_inf |= is_inf_internal;
1151                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1152
1153                 /* Try powerful optimization CURLYX => CURLYN. */
1154                 if (  OP(oscan) == CURLYX && data
1155                       && data->flags & SF_IN_PAR
1156                       && !(data->flags & SF_HAS_EVAL)
1157                       && !deltanext && minnext == 1 ) {
1158                     /* Try to optimize to CURLYN.  */
1159                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1160                     regnode *nxt1 = nxt;
1161 #ifdef DEBUGGING
1162                     regnode *nxt2;
1163 #endif
1164
1165                     /* Skip open. */
1166                     nxt = regnext(nxt);
1167                     if (!strchr((char*)PL_simple,OP(nxt))
1168                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1169                              && STR_LEN(nxt) == 1))
1170                         goto nogo;
1171 #ifdef DEBUGGING
1172                     nxt2 = nxt;
1173 #endif
1174                     nxt = regnext(nxt);
1175                     if (OP(nxt) != CLOSE)
1176                         goto nogo;
1177                     /* Now we know that nxt2 is the only contents: */
1178                     oscan->flags = (U8)ARG(nxt);
1179                     OP(oscan) = CURLYN;
1180                     OP(nxt1) = NOTHING; /* was OPEN. */
1181 #ifdef DEBUGGING
1182                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1183                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1184                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1185                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1186                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1187                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1188 #endif
1189                 }
1190               nogo:
1191
1192                 /* Try optimization CURLYX => CURLYM. */
1193                 if (  OP(oscan) == CURLYX && data
1194                       && !(data->flags & SF_HAS_PAR)
1195                       && !(data->flags & SF_HAS_EVAL)
1196                       && !deltanext     /* atom is fixed width */
1197                       && minnext != 0   /* CURLYM can't handle zero width */
1198                 ) {
1199                     /* XXXX How to optimize if data == 0? */
1200                     /* Optimize to a simpler form.  */
1201                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1202                     regnode *nxt2;
1203
1204                     OP(oscan) = CURLYM;
1205                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1206                             && (OP(nxt2) != WHILEM))
1207                         nxt = nxt2;
1208                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1209                     /* Need to optimize away parenths. */
1210                     if (data->flags & SF_IN_PAR) {
1211                         /* Set the parenth number.  */
1212                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1213
1214                         if (OP(nxt) != CLOSE)
1215                             FAIL("Panic opt close");
1216                         oscan->flags = (U8)ARG(nxt);
1217                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1218                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1219 #ifdef DEBUGGING
1220                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1221                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1222                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1223                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1224 #endif
1225 #if 0
1226                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1227                             regnode *nnxt = regnext(nxt1);
1228                         
1229                             if (nnxt == nxt) {
1230                                 if (reg_off_by_arg[OP(nxt1)])
1231                                     ARG_SET(nxt1, nxt2 - nxt1);
1232                                 else if (nxt2 - nxt1 < U16_MAX)
1233                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1234                                 else
1235                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1236                             }
1237                             nxt1 = nnxt;
1238                         }
1239 #endif
1240                         /* Optimize again: */
1241                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1242                                     NULL, 0);
1243                     }
1244                     else
1245                         oscan->flags = 0;
1246                 }
1247                 else if ((OP(oscan) == CURLYX)
1248                          && (flags & SCF_WHILEM_VISITED_POS)
1249                          /* See the comment on a similar expression above.
1250                             However, this time it not a subexpression
1251                             we care about, but the expression itself. */
1252                          && (maxcount == REG_INFTY)
1253                          && data && ++data->whilem_c < 16) {
1254                     /* This stays as CURLYX, we can put the count/of pair. */
1255                     /* Find WHILEM (as in regexec.c) */
1256                     regnode *nxt = oscan + NEXT_OFF(oscan);
1257
1258                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1259                         nxt += ARG(nxt);
1260                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1261                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1262                 }
1263                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1264                     pars++;
1265                 if (flags & SCF_DO_SUBSTR) {
1266                     SV *last_str = Nullsv;
1267                     int counted = mincount != 0;
1268
1269                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1270 #if defined(SPARC64_GCC_WORKAROUND)
1271                         I32 b = 0;
1272                         STRLEN l = 0;
1273                         char *s = NULL;
1274                         I32 old = 0;
1275
1276                         if (pos_before >= data->last_start_min)
1277                             b = pos_before;
1278                         else
1279                             b = data->last_start_min;
1280
1281                         l = 0;
1282                         s = SvPV(data->last_found, l);
1283                         old = b - data->last_start_min;
1284
1285 #else
1286                         I32 b = pos_before >= data->last_start_min
1287                             ? pos_before : data->last_start_min;
1288                         STRLEN l;
1289                         char *s = SvPV(data->last_found, l);
1290                         I32 old = b - data->last_start_min;
1291 #endif
1292
1293                         if (UTF)
1294                             old = utf8_hop((U8*)s, old) - (U8*)s;
1295                         
1296                         l -= old;
1297                         /* Get the added string: */
1298                         last_str = newSVpvn(s  + old, l);
1299                         if (UTF)
1300                             SvUTF8_on(last_str);
1301                         if (deltanext == 0 && pos_before == b) {
1302                             /* What was added is a constant string */
1303                             if (mincount > 1) {
1304                                 SvGROW(last_str, (mincount * l) + 1);
1305                                 repeatcpy(SvPVX(last_str) + l,
1306                                           SvPVX(last_str), l, mincount - 1);
1307                                 SvCUR(last_str) *= mincount;
1308                                 /* Add additional parts. */
1309                                 SvCUR_set(data->last_found,
1310                                           SvCUR(data->last_found) - l);
1311                                 sv_catsv(data->last_found, last_str);
1312                                 {
1313                                     SV * sv = data->last_found;
1314                                     MAGIC *mg =
1315                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1316                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1317                                     if (mg && mg->mg_len >= 0)
1318                                         mg->mg_len += CHR_SVLEN(last_str);
1319                                 }
1320                                 data->last_end += l * (mincount - 1);
1321                             }
1322                         } else {
1323                             /* start offset must point into the last copy */
1324                             data->last_start_min += minnext * (mincount - 1);
1325                             data->last_start_max += is_inf ? I32_MAX
1326                                 : (maxcount - 1) * (minnext + data->pos_delta);
1327                         }
1328                     }
1329                     /* It is counted once already... */
1330                     data->pos_min += minnext * (mincount - counted);
1331                     data->pos_delta += - counted * deltanext +
1332                         (minnext + deltanext) * maxcount - minnext * mincount;
1333                     if (mincount != maxcount) {
1334                          /* Cannot extend fixed substrings found inside
1335                             the group.  */
1336                         scan_commit(pRExC_state,data);
1337                         if (mincount && last_str) {
1338                             sv_setsv(data->last_found, last_str);
1339                             data->last_end = data->pos_min;
1340                             data->last_start_min =
1341                                 data->pos_min - CHR_SVLEN(last_str);
1342                             data->last_start_max = is_inf
1343                                 ? I32_MAX
1344                                 : data->pos_min + data->pos_delta
1345                                 - CHR_SVLEN(last_str);
1346                         }
1347                         data->longest = &(data->longest_float);
1348                     }
1349                     SvREFCNT_dec(last_str);
1350                 }
1351                 if (data && (fl & SF_HAS_EVAL))
1352                     data->flags |= SF_HAS_EVAL;
1353               optimize_curly_tail:
1354                 if (OP(oscan) != CURLYX) {
1355                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1356                            && NEXT_OFF(next))
1357                         NEXT_OFF(oscan) += NEXT_OFF(next);
1358                 }
1359                 continue;
1360             default:                    /* REF and CLUMP only? */
1361                 if (flags & SCF_DO_SUBSTR) {
1362                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1363                     data->longest = &(data->longest_float);
1364                 }
1365                 is_inf = is_inf_internal = 1;
1366                 if (flags & SCF_DO_STCLASS_OR)
1367                     cl_anything(pRExC_state, data->start_class);
1368                 flags &= ~SCF_DO_STCLASS;
1369                 break;
1370             }
1371         }
1372         else if (strchr((char*)PL_simple,OP(scan))) {
1373             int value = 0;
1374
1375             if (flags & SCF_DO_SUBSTR) {
1376                 scan_commit(pRExC_state,data);
1377                 data->pos_min++;
1378             }
1379             min++;
1380             if (flags & SCF_DO_STCLASS) {
1381                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1382
1383                 /* Some of the logic below assumes that switching
1384                    locale on will only add false positives. */
1385                 switch (PL_regkind[(U8)OP(scan)]) {
1386                 case SANY:
1387                 default:
1388                   do_default:
1389                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1390                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1391                         cl_anything(pRExC_state, data->start_class);
1392                     break;
1393                 case REG_ANY:
1394                     if (OP(scan) == SANY)
1395                         goto do_default;
1396                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1397                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1398                                  || (data->start_class->flags & ANYOF_CLASS));
1399                         cl_anything(pRExC_state, data->start_class);
1400                     }
1401                     if (flags & SCF_DO_STCLASS_AND || !value)
1402                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1403                     break;
1404                 case ANYOF:
1405                     if (flags & SCF_DO_STCLASS_AND)
1406                         cl_and(data->start_class,
1407                                (struct regnode_charclass_class*)scan);
1408                     else
1409                         cl_or(pRExC_state, data->start_class,
1410                               (struct regnode_charclass_class*)scan);
1411                     break;
1412                 case ALNUM:
1413                     if (flags & SCF_DO_STCLASS_AND) {
1414                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416                             for (value = 0; value < 256; value++)
1417                                 if (!isALNUM(value))
1418                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1419                         }
1420                     }
1421                     else {
1422                         if (data->start_class->flags & ANYOF_LOCALE)
1423                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1424                         else {
1425                             for (value = 0; value < 256; value++)
1426                                 if (isALNUM(value))
1427                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1428                         }
1429                     }
1430                     break;
1431                 case ALNUML:
1432                     if (flags & SCF_DO_STCLASS_AND) {
1433                         if (data->start_class->flags & ANYOF_LOCALE)
1434                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1435                     }
1436                     else {
1437                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1438                         data->start_class->flags |= ANYOF_LOCALE;
1439                     }
1440                     break;
1441                 case NALNUM:
1442                     if (flags & SCF_DO_STCLASS_AND) {
1443                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1444                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445                             for (value = 0; value < 256; value++)
1446                                 if (isALNUM(value))
1447                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1448                         }
1449                     }
1450                     else {
1451                         if (data->start_class->flags & ANYOF_LOCALE)
1452                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1453                         else {
1454                             for (value = 0; value < 256; value++)
1455                                 if (!isALNUM(value))
1456                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1457                         }
1458                     }
1459                     break;
1460                 case NALNUML:
1461                     if (flags & SCF_DO_STCLASS_AND) {
1462                         if (data->start_class->flags & ANYOF_LOCALE)
1463                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1464                     }
1465                     else {
1466                         data->start_class->flags |= ANYOF_LOCALE;
1467                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1468                     }
1469                     break;
1470                 case SPACE:
1471                     if (flags & SCF_DO_STCLASS_AND) {
1472                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1473                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474                             for (value = 0; value < 256; value++)
1475                                 if (!isSPACE(value))
1476                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1477                         }
1478                     }
1479                     else {
1480                         if (data->start_class->flags & ANYOF_LOCALE)
1481                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1482                         else {
1483                             for (value = 0; value < 256; value++)
1484                                 if (isSPACE(value))
1485                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1486                         }
1487                     }
1488                     break;
1489                 case SPACEL:
1490                     if (flags & SCF_DO_STCLASS_AND) {
1491                         if (data->start_class->flags & ANYOF_LOCALE)
1492                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1493                     }
1494                     else {
1495                         data->start_class->flags |= ANYOF_LOCALE;
1496                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1497                     }
1498                     break;
1499                 case NSPACE:
1500                     if (flags & SCF_DO_STCLASS_AND) {
1501                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1502                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503                             for (value = 0; value < 256; value++)
1504                                 if (isSPACE(value))
1505                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1506                         }
1507                     }
1508                     else {
1509                         if (data->start_class->flags & ANYOF_LOCALE)
1510                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511                         else {
1512                             for (value = 0; value < 256; value++)
1513                                 if (!isSPACE(value))
1514                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1515                         }
1516                     }
1517                     break;
1518                 case NSPACEL:
1519                     if (flags & SCF_DO_STCLASS_AND) {
1520                         if (data->start_class->flags & ANYOF_LOCALE) {
1521                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1522                             for (value = 0; value < 256; value++)
1523                                 if (!isSPACE(value))
1524                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1525                         }
1526                     }
1527                     else {
1528                         data->start_class->flags |= ANYOF_LOCALE;
1529                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1530                     }
1531                     break;
1532                 case DIGIT:
1533                     if (flags & SCF_DO_STCLASS_AND) {
1534                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1535                         for (value = 0; value < 256; value++)
1536                             if (!isDIGIT(value))
1537                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1538                     }
1539                     else {
1540                         if (data->start_class->flags & ANYOF_LOCALE)
1541                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1542                         else {
1543                             for (value = 0; value < 256; value++)
1544                                 if (isDIGIT(value))
1545                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1546                         }
1547                     }
1548                     break;
1549                 case NDIGIT:
1550                     if (flags & SCF_DO_STCLASS_AND) {
1551                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1552                         for (value = 0; value < 256; value++)
1553                             if (isDIGIT(value))
1554                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1555                     }
1556                     else {
1557                         if (data->start_class->flags & ANYOF_LOCALE)
1558                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1559                         else {
1560                             for (value = 0; value < 256; value++)
1561                                 if (!isDIGIT(value))
1562                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1563                         }
1564                     }
1565                     break;
1566                 }
1567                 if (flags & SCF_DO_STCLASS_OR)
1568                     cl_and(data->start_class, &and_with);
1569                 flags &= ~SCF_DO_STCLASS;
1570             }
1571         }
1572         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1573             data->flags |= (OP(scan) == MEOL
1574                             ? SF_BEFORE_MEOL
1575                             : SF_BEFORE_SEOL);
1576         }
1577         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1578                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1579                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1580                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1581             /* Lookahead/lookbehind */
1582             I32 deltanext, minnext, fake = 0;
1583             regnode *nscan;
1584             struct regnode_charclass_class intrnl;
1585             int f = 0;
1586
1587             data_fake.flags = 0;
1588             if (data) {         
1589                 data_fake.whilem_c = data->whilem_c;
1590                 data_fake.last_closep = data->last_closep;
1591             }
1592             else
1593                 data_fake.last_closep = &fake;
1594             if ( flags & SCF_DO_STCLASS && !scan->flags
1595                  && OP(scan) == IFMATCH ) { /* Lookahead */
1596                 cl_init(pRExC_state, &intrnl);
1597                 data_fake.start_class = &intrnl;
1598                 f |= SCF_DO_STCLASS_AND;
1599             }
1600             if (flags & SCF_WHILEM_VISITED_POS)
1601                 f |= SCF_WHILEM_VISITED_POS;
1602             next = regnext(scan);
1603             nscan = NEXTOPER(NEXTOPER(scan));
1604             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1605             if (scan->flags) {
1606                 if (deltanext) {
1607                     vFAIL("Variable length lookbehind not implemented");
1608                 }
1609                 else if (minnext > U8_MAX) {
1610                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1611                 }
1612                 scan->flags = (U8)minnext;
1613             }
1614             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1615                 pars++;
1616             if (data && (data_fake.flags & SF_HAS_EVAL))
1617                 data->flags |= SF_HAS_EVAL;
1618             if (data)
1619                 data->whilem_c = data_fake.whilem_c;
1620             if (f & SCF_DO_STCLASS_AND) {
1621                 int was = (data->start_class->flags & ANYOF_EOS);
1622
1623                 cl_and(data->start_class, &intrnl);
1624                 if (was)
1625                     data->start_class->flags |= ANYOF_EOS;
1626             }
1627         }
1628         else if (OP(scan) == OPEN) {
1629             pars++;
1630         }
1631         else if (OP(scan) == CLOSE) {
1632             if ((I32)ARG(scan) == is_par) {
1633                 next = regnext(scan);
1634
1635                 if ( next && (OP(next) != WHILEM) && next < last)
1636                     is_par = 0;         /* Disable optimization */
1637             }
1638             if (data)
1639                 *(data->last_closep) = ARG(scan);
1640         }
1641         else if (OP(scan) == EVAL) {
1642                 if (data)
1643                     data->flags |= SF_HAS_EVAL;
1644         }
1645         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1646                 if (flags & SCF_DO_SUBSTR) {
1647                     scan_commit(pRExC_state,data);
1648                     data->longest = &(data->longest_float);
1649                 }
1650                 is_inf = is_inf_internal = 1;
1651                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1652                     cl_anything(pRExC_state, data->start_class);
1653                 flags &= ~SCF_DO_STCLASS;
1654         }
1655         /* Else: zero-length, ignore. */
1656         scan = regnext(scan);
1657     }
1658
1659   finish:
1660     *scanp = scan;
1661     *deltap = is_inf_internal ? I32_MAX : delta;
1662     if (flags & SCF_DO_SUBSTR && is_inf)
1663         data->pos_delta = I32_MAX - data->pos_min;
1664     if (is_par > U8_MAX)
1665         is_par = 0;
1666     if (is_par && pars==1 && data) {
1667         data->flags |= SF_IN_PAR;
1668         data->flags &= ~SF_HAS_PAR;
1669     }
1670     else if (pars && data) {
1671         data->flags |= SF_HAS_PAR;
1672         data->flags &= ~SF_IN_PAR;
1673     }
1674     if (flags & SCF_DO_STCLASS_OR)
1675         cl_and(data->start_class, &and_with);
1676     return min;
1677 }
1678
1679 STATIC I32
1680 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1681 {
1682     if (RExC_rx->data) {
1683         Renewc(RExC_rx->data,
1684                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1685                char, struct reg_data);
1686         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1687         RExC_rx->data->count += n;
1688     }
1689     else {
1690         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1691              char, struct reg_data);
1692         New(1208, RExC_rx->data->what, n, U8);
1693         RExC_rx->data->count = n;
1694     }
1695     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1696     return RExC_rx->data->count - n;
1697 }
1698
1699 void
1700 Perl_reginitcolors(pTHX)
1701 {
1702     int i = 0;
1703     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1704         
1705     if (s) {
1706         PL_colors[0] = s = savepv(s);
1707         while (++i < 6) {
1708             s = strchr(s, '\t');
1709             if (s) {
1710                 *s = '\0';
1711                 PL_colors[i] = ++s;
1712             }
1713             else
1714                 PL_colors[i] = s = "";
1715         }
1716     } else {
1717         while (i < 6)
1718             PL_colors[i++] = "";
1719     }
1720     PL_colorset = 1;
1721 }
1722
1723
1724 /*
1725  - pregcomp - compile a regular expression into internal code
1726  *
1727  * We can't allocate space until we know how big the compiled form will be,
1728  * but we can't compile it (and thus know how big it is) until we've got a
1729  * place to put the code.  So we cheat:  we compile it twice, once with code
1730  * generation turned off and size counting turned on, and once "for real".
1731  * This also means that we don't allocate space until we are sure that the
1732  * thing really will compile successfully, and we never have to move the
1733  * code and thus invalidate pointers into it.  (Note that it has to be in
1734  * one piece because free() must be able to free it all.) [NB: not true in perl]
1735  *
1736  * Beware that the optimization-preparation code in here knows about some
1737  * of the structure of the compiled regexp.  [I'll say.]
1738  */
1739 regexp *
1740 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1741 {
1742     register regexp *r;
1743     regnode *scan;
1744     regnode *first;
1745     I32 flags;
1746     I32 minlen = 0;
1747     I32 sawplus = 0;
1748     I32 sawopen = 0;
1749     scan_data_t data;
1750     RExC_state_t RExC_state;
1751     RExC_state_t *pRExC_state = &RExC_state;
1752
1753     if (exp == NULL)
1754         FAIL("NULL regexp argument");
1755
1756     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1757
1758     RExC_precomp = exp;
1759     DEBUG_r({
1760          if (!PL_colorset) reginitcolors();
1761          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1762                        PL_colors[4],PL_colors[5],PL_colors[0],
1763                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1764     });
1765     RExC_flags = pm->op_pmflags;
1766     RExC_sawback = 0;
1767
1768     RExC_seen = 0;
1769     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1770     RExC_seen_evals = 0;
1771     RExC_extralen = 0;
1772
1773     /* First pass: determine size, legality. */
1774     RExC_parse = exp;
1775     RExC_start = exp;
1776     RExC_end = xend;
1777     RExC_naughty = 0;
1778     RExC_npar = 1;
1779     RExC_size = 0L;
1780     RExC_emit = &PL_regdummy;
1781     RExC_whilem_seen = 0;
1782 #if 0 /* REGC() is (currently) a NOP at the first pass.
1783        * Clever compilers notice this and complain. --jhi */
1784     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1785 #endif
1786     if (reg(pRExC_state, 0, &flags) == NULL) {
1787         RExC_precomp = Nullch;
1788         return(NULL);
1789     }
1790     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1791
1792     /* Small enough for pointer-storage convention?
1793        If extralen==0, this means that we will not need long jumps. */
1794     if (RExC_size >= 0x10000L && RExC_extralen)
1795         RExC_size += RExC_extralen;
1796     else
1797         RExC_extralen = 0;
1798     if (RExC_whilem_seen > 15)
1799         RExC_whilem_seen = 15;
1800
1801     /* Allocate space and initialize. */
1802     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1803          char, regexp);
1804     if (r == NULL)
1805         FAIL("Regexp out of space");
1806
1807 #ifdef DEBUGGING
1808     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1809     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1810 #endif
1811     r->refcnt = 1;
1812     r->prelen = xend - exp;
1813     r->precomp = savepvn(RExC_precomp, r->prelen);
1814     r->subbeg = NULL;
1815 #ifdef PERL_COPY_ON_WRITE
1816     r->saved_copy = Nullsv;
1817 #endif
1818     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1819     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1820
1821     r->substrs = 0;                     /* Useful during FAIL. */
1822     r->startp = 0;                      /* Useful during FAIL. */
1823     r->endp = 0;                        /* Useful during FAIL. */
1824
1825     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1826     if (r->offsets) {
1827       r->offsets[0] = RExC_size; 
1828     }
1829     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1830                           "%s %"UVuf" bytes for offset annotations.\n", 
1831                           r->offsets ? "Got" : "Couldn't get", 
1832                           (UV)((2*RExC_size+1) * sizeof(U32))));
1833
1834     RExC_rx = r;
1835
1836     /* Second pass: emit code. */
1837     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1838     RExC_parse = exp;
1839     RExC_end = xend;
1840     RExC_naughty = 0;
1841     RExC_npar = 1;
1842     RExC_emit_start = r->program;
1843     RExC_emit = r->program;
1844     /* Store the count of eval-groups for security checks: */
1845     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1846     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1847     r->data = 0;
1848     if (reg(pRExC_state, 0, &flags) == NULL)
1849         return(NULL);
1850
1851     /* Dig out information for optimizations. */
1852     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1853     pm->op_pmflags = RExC_flags;
1854     if (UTF)
1855         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1856     r->regstclass = NULL;
1857     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1858         r->reganch |= ROPT_NAUGHTY;
1859     scan = r->program + 1;              /* First BRANCH. */
1860
1861     /* XXXX To minimize changes to RE engine we always allocate
1862        3-units-long substrs field. */
1863     Newz(1004, r->substrs, 1, struct reg_substr_data);
1864
1865     StructCopy(&zero_scan_data, &data, scan_data_t);
1866     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1867     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1868         I32 fake;
1869         STRLEN longest_float_length, longest_fixed_length;
1870         struct regnode_charclass_class ch_class;
1871         int stclass_flag;
1872         I32 last_close = 0;
1873
1874         first = scan;
1875         /* Skip introductions and multiplicators >= 1. */
1876         while ((OP(first) == OPEN && (sawopen = 1)) ||
1877                /* An OR of *one* alternative - should not happen now. */
1878             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1879             (OP(first) == PLUS) ||
1880             (OP(first) == MINMOD) ||
1881                /* An {n,m} with n>0 */
1882             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1883                 if (OP(first) == PLUS)
1884                     sawplus = 1;
1885                 else
1886                     first += regarglen[(U8)OP(first)];
1887                 first = NEXTOPER(first);
1888         }
1889
1890         /* Starting-point info. */
1891       again:
1892         if (PL_regkind[(U8)OP(first)] == EXACT) {
1893             if (OP(first) == EXACT)
1894                 ;       /* Empty, get anchored substr later. */
1895             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1896                 r->regstclass = first;
1897         }
1898         else if (strchr((char*)PL_simple,OP(first)))
1899             r->regstclass = first;
1900         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1901                  PL_regkind[(U8)OP(first)] == NBOUND)
1902             r->regstclass = first;
1903         else if (PL_regkind[(U8)OP(first)] == BOL) {
1904             r->reganch |= (OP(first) == MBOL
1905                            ? ROPT_ANCH_MBOL
1906                            : (OP(first) == SBOL
1907                               ? ROPT_ANCH_SBOL
1908                               : ROPT_ANCH_BOL));
1909             first = NEXTOPER(first);
1910             goto again;
1911         }
1912         else if (OP(first) == GPOS) {
1913             r->reganch |= ROPT_ANCH_GPOS;
1914             first = NEXTOPER(first);
1915             goto again;
1916         }
1917         else if (!sawopen && (OP(first) == STAR &&
1918             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1919             !(r->reganch & ROPT_ANCH) )
1920         {
1921             /* turn .* into ^.* with an implied $*=1 */
1922             int type = OP(NEXTOPER(first));
1923
1924             if (type == REG_ANY)
1925                 type = ROPT_ANCH_MBOL;
1926             else
1927                 type = ROPT_ANCH_SBOL;
1928
1929             r->reganch |= type | ROPT_IMPLICIT;
1930             first = NEXTOPER(first);
1931             goto again;
1932         }
1933         if (sawplus && (!sawopen || !RExC_sawback)
1934             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1935             /* x+ must match at the 1st pos of run of x's */
1936             r->reganch |= ROPT_SKIP;
1937
1938         /* Scan is after the zeroth branch, first is atomic matcher. */
1939         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1940                               (IV)(first - scan + 1)));
1941         /*
1942         * If there's something expensive in the r.e., find the
1943         * longest literal string that must appear and make it the
1944         * regmust.  Resolve ties in favor of later strings, since
1945         * the regstart check works with the beginning of the r.e.
1946         * and avoiding duplication strengthens checking.  Not a
1947         * strong reason, but sufficient in the absence of others.
1948         * [Now we resolve ties in favor of the earlier string if
1949         * it happens that c_offset_min has been invalidated, since the
1950         * earlier string may buy us something the later one won't.]
1951         */
1952         minlen = 0;
1953
1954         data.longest_fixed = newSVpvn("",0);
1955         data.longest_float = newSVpvn("",0);
1956         data.last_found = newSVpvn("",0);
1957         data.longest = &(data.longest_fixed);
1958         first = scan;
1959         if (!r->regstclass) {
1960             cl_init(pRExC_state, &ch_class);
1961             data.start_class = &ch_class;
1962             stclass_flag = SCF_DO_STCLASS_AND;
1963         } else                          /* XXXX Check for BOUND? */
1964             stclass_flag = 0;
1965         data.last_closep = &last_close;
1966
1967         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1968                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1969         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1970              && data.last_start_min == 0 && data.last_end > 0
1971              && !RExC_seen_zerolen
1972              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1973             r->reganch |= ROPT_CHECK_ALL;
1974         scan_commit(pRExC_state, &data);
1975         SvREFCNT_dec(data.last_found);
1976
1977         longest_float_length = CHR_SVLEN(data.longest_float);
1978         if (longest_float_length
1979             || (data.flags & SF_FL_BEFORE_EOL
1980                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1981                     || (RExC_flags & PMf_MULTILINE)))) {
1982             int t;
1983
1984             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1985                 && data.offset_fixed == data.offset_float_min
1986                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1987                     goto remove_float;          /* As in (a)+. */
1988
1989             if (SvUTF8(data.longest_float)) {
1990                 r->float_utf8 = data.longest_float;
1991                 r->float_substr = Nullsv;
1992             } else {
1993                 r->float_substr = data.longest_float;
1994                 r->float_utf8 = Nullsv;
1995             }
1996             r->float_min_offset = data.offset_float_min;
1997             r->float_max_offset = data.offset_float_max;
1998             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1999                        && (!(data.flags & SF_FL_BEFORE_MEOL)
2000                            || (RExC_flags & PMf_MULTILINE)));
2001             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2002         }
2003         else {
2004           remove_float:
2005             r->float_substr = r->float_utf8 = Nullsv;
2006             SvREFCNT_dec(data.longest_float);
2007             longest_float_length = 0;
2008         }
2009
2010         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2011         if (longest_fixed_length
2012             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2013                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2014                     || (RExC_flags & PMf_MULTILINE)))) {
2015             int t;
2016
2017             if (SvUTF8(data.longest_fixed)) {
2018                 r->anchored_utf8 = data.longest_fixed;
2019                 r->anchored_substr = Nullsv;
2020             } else {
2021                 r->anchored_substr = data.longest_fixed;
2022                 r->anchored_utf8 = Nullsv;
2023             }
2024             r->anchored_offset = data.offset_fixed;
2025             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2026                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2027                      || (RExC_flags & PMf_MULTILINE)));
2028             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2029         }
2030         else {
2031             r->anchored_substr = r->anchored_utf8 = Nullsv;
2032             SvREFCNT_dec(data.longest_fixed);
2033             longest_fixed_length = 0;
2034         }
2035         if (r->regstclass
2036             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2037             r->regstclass = NULL;
2038         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2039             && stclass_flag
2040             && !(data.start_class->flags & ANYOF_EOS)
2041             && !cl_is_anything(data.start_class))
2042         {
2043             I32 n = add_data(pRExC_state, 1, "f");
2044
2045             New(1006, RExC_rx->data->data[n], 1,
2046                 struct regnode_charclass_class);
2047             StructCopy(data.start_class,
2048                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2049                        struct regnode_charclass_class);
2050             r->regstclass = (regnode*)RExC_rx->data->data[n];
2051             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2052             PL_regdata = r->data; /* for regprop() */
2053             DEBUG_r({ SV *sv = sv_newmortal();
2054                       regprop(sv, (regnode*)data.start_class);
2055                       PerlIO_printf(Perl_debug_log,
2056                                     "synthetic stclass `%s'.\n",
2057                                     SvPVX(sv));});
2058         }
2059
2060         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2061         if (longest_fixed_length > longest_float_length) {
2062             r->check_substr = r->anchored_substr;
2063             r->check_utf8 = r->anchored_utf8;
2064             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2065             if (r->reganch & ROPT_ANCH_SINGLE)
2066                 r->reganch |= ROPT_NOSCAN;
2067         }
2068         else {
2069             r->check_substr = r->float_substr;
2070             r->check_utf8 = r->float_utf8;
2071             r->check_offset_min = data.offset_float_min;
2072             r->check_offset_max = data.offset_float_max;
2073         }
2074         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2075            This should be changed ASAP!  */
2076         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2077             r->reganch |= RE_USE_INTUIT;
2078             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2079                 r->reganch |= RE_INTUIT_TAIL;
2080         }
2081     }
2082     else {
2083         /* Several toplevels. Best we can is to set minlen. */
2084         I32 fake;
2085         struct regnode_charclass_class ch_class;
2086         I32 last_close = 0;
2087         
2088         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2089         scan = r->program + 1;
2090         cl_init(pRExC_state, &ch_class);
2091         data.start_class = &ch_class;
2092         data.last_closep = &last_close;
2093         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2094         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2095                 = r->float_substr = r->float_utf8 = Nullsv;
2096         if (!(data.start_class->flags & ANYOF_EOS)
2097             && !cl_is_anything(data.start_class))
2098         {
2099             I32 n = add_data(pRExC_state, 1, "f");
2100
2101             New(1006, RExC_rx->data->data[n], 1,
2102                 struct regnode_charclass_class);
2103             StructCopy(data.start_class,
2104                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2105                        struct regnode_charclass_class);
2106             r->regstclass = (regnode*)RExC_rx->data->data[n];
2107             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2108             DEBUG_r({ SV* sv = sv_newmortal();
2109                       regprop(sv, (regnode*)data.start_class);
2110                       PerlIO_printf(Perl_debug_log,
2111                                     "synthetic stclass `%s'.\n",
2112                                     SvPVX(sv));});
2113         }
2114     }
2115
2116     r->minlen = minlen;
2117     if (RExC_seen & REG_SEEN_GPOS)
2118         r->reganch |= ROPT_GPOS_SEEN;
2119     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2120         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2121     if (RExC_seen & REG_SEEN_EVAL)
2122         r->reganch |= ROPT_EVAL_SEEN;
2123     if (RExC_seen & REG_SEEN_CANY)
2124         r->reganch |= ROPT_CANY_SEEN;
2125     Newz(1002, r->startp, RExC_npar, I32);
2126     Newz(1002, r->endp, RExC_npar, I32);
2127     PL_regdata = r->data; /* for regprop() */
2128     DEBUG_r(regdump(r));
2129     return(r);
2130 }
2131
2132 /*
2133  - reg - regular expression, i.e. main body or parenthesized thing
2134  *
2135  * Caller must absorb opening parenthesis.
2136  *
2137  * Combining parenthesis handling with the base level of regular expression
2138  * is a trifle forced, but the need to tie the tails of the branches to what
2139  * follows makes it hard to avoid.
2140  */
2141 STATIC regnode *
2142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2143     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2144 {
2145     register regnode *ret;              /* Will be the head of the group. */
2146     register regnode *br;
2147     register regnode *lastbr;
2148     register regnode *ender = 0;
2149     register I32 parno = 0;
2150     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2151
2152     /* for (?g), (?gc), and (?o) warnings; warning
2153        about (?c) will warn about (?g) -- japhy    */
2154
2155     I32 wastedflags = 0x00,
2156         wasted_o    = 0x01,
2157         wasted_g    = 0x02,
2158         wasted_gc   = 0x02 | 0x04,
2159         wasted_c    = 0x04;
2160
2161     char * parse_start = RExC_parse; /* MJD */
2162     char *oregcomp_parse = RExC_parse;
2163     char c;
2164
2165     *flagp = 0;                         /* Tentatively. */
2166
2167
2168     /* Make an OPEN node, if parenthesized. */
2169     if (paren) {
2170         if (*RExC_parse == '?') { /* (?...) */
2171             U32 posflags = 0, negflags = 0;
2172             U32 *flagsp = &posflags;
2173             int logical = 0;
2174             char *seqstart = RExC_parse;
2175
2176             RExC_parse++;
2177             paren = *RExC_parse++;
2178             ret = NULL;                 /* For look-ahead/behind. */
2179             switch (paren) {
2180             case '<':           /* (?<...) */
2181                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2182                 if (*RExC_parse == '!')
2183                     paren = ',';
2184                 if (*RExC_parse != '=' && *RExC_parse != '!')
2185                     goto unknown;
2186                 RExC_parse++;
2187             case '=':           /* (?=...) */
2188             case '!':           /* (?!...) */
2189                 RExC_seen_zerolen++;
2190             case ':':           /* (?:...) */
2191             case '>':           /* (?>...) */
2192                 break;
2193             case '$':           /* (?$...) */
2194             case '@':           /* (?@...) */
2195                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2196                 break;
2197             case '#':           /* (?#...) */
2198                 while (*RExC_parse && *RExC_parse != ')')
2199                     RExC_parse++;
2200                 if (*RExC_parse != ')')
2201                     FAIL("Sequence (?#... not terminated");
2202                 nextchar(pRExC_state);
2203                 *flagp = TRYAGAIN;
2204                 return NULL;
2205             case 'p':           /* (?p...) */
2206                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2207                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2208                 /* FALL THROUGH*/
2209             case '?':           /* (??...) */
2210                 logical = 1;
2211                 if (*RExC_parse != '{')
2212                     goto unknown;
2213                 paren = *RExC_parse++;
2214                 /* FALL THROUGH */
2215             case '{':           /* (?{...}) */
2216             {
2217                 I32 count = 1, n = 0;
2218                 char c;
2219                 char *s = RExC_parse;
2220                 SV *sv;
2221                 OP_4tree *sop, *rop;
2222
2223                 RExC_seen_zerolen++;
2224                 RExC_seen |= REG_SEEN_EVAL;
2225                 while (count && (c = *RExC_parse)) {
2226                     if (c == '\\' && RExC_parse[1])
2227                         RExC_parse++;
2228                     else if (c == '{')
2229                         count++;
2230                     else if (c == '}')
2231                         count--;
2232                     RExC_parse++;
2233                 }
2234                 if (*RExC_parse != ')')
2235                 {
2236                     RExC_parse = s;             
2237                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2238                 }
2239                 if (!SIZE_ONLY) {
2240                     PAD *pad;
2241                 
2242                     if (RExC_parse - 1 - s)
2243                         sv = newSVpvn(s, RExC_parse - 1 - s);
2244                     else
2245                         sv = newSVpvn("", 0);
2246
2247                     ENTER;
2248                     Perl_save_re_context(aTHX);
2249                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2250                     sop->op_private |= OPpREFCOUNTED;
2251                     /* re_dup will OpREFCNT_inc */
2252                     OpREFCNT_set(sop, 1);
2253                     LEAVE;
2254
2255                     n = add_data(pRExC_state, 3, "nop");
2256                     RExC_rx->data->data[n] = (void*)rop;
2257                     RExC_rx->data->data[n+1] = (void*)sop;
2258                     RExC_rx->data->data[n+2] = (void*)pad;
2259                     SvREFCNT_dec(sv);
2260                 }
2261                 else {                                          /* First pass */
2262                     if (PL_reginterp_cnt < ++RExC_seen_evals
2263                         && IN_PERL_RUNTIME)
2264                         /* No compiled RE interpolated, has runtime
2265                            components ===> unsafe.  */
2266                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2267                     if (PL_tainting && PL_tainted)
2268                         FAIL("Eval-group in insecure regular expression");
2269                     if (IN_PERL_COMPILETIME)
2270                         PL_cv_has_eval = 1;
2271                 }
2272
2273                 nextchar(pRExC_state);
2274                 if (logical) {
2275                     ret = reg_node(pRExC_state, LOGICAL);
2276                     if (!SIZE_ONLY)
2277                         ret->flags = 2;
2278                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2279                     /* deal with the length of this later - MJD */
2280                     return ret;
2281                 }
2282                 ret = reganode(pRExC_state, EVAL, n);
2283                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2284                 Set_Node_Offset(ret, parse_start);
2285                 return ret;
2286             }
2287             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2288             {
2289                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2290                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2291                         || RExC_parse[1] == '<'
2292                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2293                         I32 flag;
2294                         
2295                         ret = reg_node(pRExC_state, LOGICAL);
2296                         if (!SIZE_ONLY)
2297                             ret->flags = 1;
2298                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2299                         goto insert_if;
2300                     }
2301                 }
2302                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2303                     /* (?(1)...) */
2304                     parno = atoi(RExC_parse++);
2305
2306                     while (isDIGIT(*RExC_parse))
2307                         RExC_parse++;
2308                     ret = reganode(pRExC_state, GROUPP, parno);
2309                     
2310                     if ((c = *nextchar(pRExC_state)) != ')')
2311                         vFAIL("Switch condition not recognized");
2312                   insert_if:
2313                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2314                     br = regbranch(pRExC_state, &flags, 1);
2315                     if (br == NULL)
2316                         br = reganode(pRExC_state, LONGJMP, 0);
2317                     else
2318                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2319                     c = *nextchar(pRExC_state);
2320                     if (flags&HASWIDTH)
2321                         *flagp |= HASWIDTH;
2322                     if (c == '|') {
2323                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2324                         regbranch(pRExC_state, &flags, 1);
2325                         regtail(pRExC_state, ret, lastbr);
2326                         if (flags&HASWIDTH)
2327                             *flagp |= HASWIDTH;
2328                         c = *nextchar(pRExC_state);
2329                     }
2330                     else
2331                         lastbr = NULL;
2332                     if (c != ')')
2333                         vFAIL("Switch (?(condition)... contains too many branches");
2334                     ender = reg_node(pRExC_state, TAIL);
2335                     regtail(pRExC_state, br, ender);
2336                     if (lastbr) {
2337                         regtail(pRExC_state, lastbr, ender);
2338                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2339                     }
2340                     else
2341                         regtail(pRExC_state, ret, ender);
2342                     return ret;
2343                 }
2344                 else {
2345                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2346                 }
2347             }
2348             case 0:
2349                 RExC_parse--; /* for vFAIL to print correctly */
2350                 vFAIL("Sequence (? incomplete");
2351                 break;
2352             default:
2353                 --RExC_parse;
2354               parse_flags:      /* (?i) */
2355                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2356                     /* (?g), (?gc) and (?o) are useless here
2357                        and must be globally applied -- japhy */
2358
2359                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2360                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2362                             if (! (wastedflags & wflagbit) ) {
2363                                 wastedflags |= wflagbit;
2364                                 vWARN5(
2365                                     RExC_parse + 1,
2366                                     "Useless (%s%c) - %suse /%c modifier",
2367                                     flagsp == &negflags ? "?-" : "?",
2368                                     *RExC_parse,
2369                                     flagsp == &negflags ? "don't " : "",
2370                                     *RExC_parse
2371                                 );
2372                             }
2373                         }
2374                     }
2375                     else if (*RExC_parse == 'c') {
2376                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2377                             if (! (wastedflags & wasted_c) ) {
2378                                 wastedflags |= wasted_gc;
2379                                 vWARN3(
2380                                     RExC_parse + 1,
2381                                     "Useless (%sc) - %suse /gc modifier",
2382                                     flagsp == &negflags ? "?-" : "?",
2383                                     flagsp == &negflags ? "don't " : ""
2384                                 );
2385                             }
2386                         }
2387                     }
2388                     else { pmflag(flagsp, *RExC_parse); }
2389
2390                     ++RExC_parse;
2391                 }
2392                 if (*RExC_parse == '-') {
2393                     flagsp = &negflags;
2394                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2395                     ++RExC_parse;
2396                     goto parse_flags;
2397                 }
2398                 RExC_flags |= posflags;
2399                 RExC_flags &= ~negflags;
2400                 if (*RExC_parse == ':') {
2401                     RExC_parse++;
2402                     paren = ':';
2403                     break;
2404                 }               
2405               unknown:
2406                 if (*RExC_parse != ')') {
2407                     RExC_parse++;
2408                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2409                 }
2410                 nextchar(pRExC_state);
2411                 *flagp = TRYAGAIN;
2412                 return NULL;
2413             }
2414         }
2415         else {                  /* (...) */
2416             parno = RExC_npar;
2417             RExC_npar++;
2418             ret = reganode(pRExC_state, OPEN, parno);
2419             Set_Node_Length(ret, 1); /* MJD */
2420             Set_Node_Offset(ret, RExC_parse); /* MJD */
2421             open = 1;
2422         }
2423     }
2424     else                        /* ! paren */
2425         ret = NULL;
2426
2427     /* Pick up the branches, linking them together. */
2428     parse_start = RExC_parse;   /* MJD */
2429     br = regbranch(pRExC_state, &flags, 1);
2430     /*     branch_len = (paren != 0); */
2431     
2432     if (br == NULL)
2433         return(NULL);
2434     if (*RExC_parse == '|') {
2435         if (!SIZE_ONLY && RExC_extralen) {
2436             reginsert(pRExC_state, BRANCHJ, br);
2437         }
2438         else {                  /* MJD */
2439             reginsert(pRExC_state, BRANCH, br);
2440             Set_Node_Length(br, paren != 0);
2441             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2442         }
2443         have_branch = 1;
2444         if (SIZE_ONLY)
2445             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2446     }
2447     else if (paren == ':') {
2448         *flagp |= flags&SIMPLE;
2449     }
2450     if (open) {                         /* Starts with OPEN. */
2451         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2452     }
2453     else if (paren != '?')              /* Not Conditional */
2454         ret = br;
2455     *flagp |= flags & (SPSTART | HASWIDTH);
2456     lastbr = br;
2457     while (*RExC_parse == '|') {
2458         if (!SIZE_ONLY && RExC_extralen) {
2459             ender = reganode(pRExC_state, LONGJMP,0);
2460             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2461         }
2462         if (SIZE_ONLY)
2463             RExC_extralen += 2;         /* Account for LONGJMP. */
2464         nextchar(pRExC_state);
2465         br = regbranch(pRExC_state, &flags, 0);
2466         
2467         if (br == NULL)
2468             return(NULL);
2469         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2470         lastbr = br;
2471         if (flags&HASWIDTH)
2472             *flagp |= HASWIDTH;
2473         *flagp |= flags&SPSTART;
2474     }
2475
2476     if (have_branch || paren != ':') {
2477         /* Make a closing node, and hook it on the end. */
2478         switch (paren) {
2479         case ':':
2480             ender = reg_node(pRExC_state, TAIL);
2481             break;
2482         case 1:
2483             ender = reganode(pRExC_state, CLOSE, parno);
2484             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2485             Set_Node_Length(ender,1); /* MJD */
2486             break;
2487         case '<':
2488         case ',':
2489         case '=':
2490         case '!':
2491             *flagp &= ~HASWIDTH;
2492             /* FALL THROUGH */
2493         case '>':
2494             ender = reg_node(pRExC_state, SUCCEED);
2495             break;
2496         case 0:
2497             ender = reg_node(pRExC_state, END);
2498             break;
2499         }
2500         regtail(pRExC_state, lastbr, ender);
2501
2502         if (have_branch) {
2503             /* Hook the tails of the branches to the closing node. */
2504             for (br = ret; br != NULL; br = regnext(br)) {
2505                 regoptail(pRExC_state, br, ender);
2506             }
2507         }
2508     }
2509
2510     {
2511         char *p;
2512         static char parens[] = "=!<,>";
2513
2514         if (paren && (p = strchr(parens, paren))) {
2515             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2516             int flag = (p - parens) > 1;
2517
2518             if (paren == '>')
2519                 node = SUSPEND, flag = 0;
2520             reginsert(pRExC_state, node,ret);
2521             Set_Node_Cur_Length(ret);
2522             Set_Node_Offset(ret, parse_start + 1);
2523             ret->flags = flag;
2524             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2525         }
2526     }
2527
2528     /* Check for proper termination. */
2529     if (paren) {
2530         RExC_flags = oregflags;
2531         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2532             RExC_parse = oregcomp_parse;
2533             vFAIL("Unmatched (");
2534         }
2535     }
2536     else if (!paren && RExC_parse < RExC_end) {
2537         if (*RExC_parse == ')') {
2538             RExC_parse++;
2539             vFAIL("Unmatched )");
2540         }
2541         else
2542             FAIL("Junk on end of regexp");      /* "Can't happen". */
2543         /* NOTREACHED */
2544     }
2545
2546     return(ret);
2547 }
2548
2549 /*
2550  - regbranch - one alternative of an | operator
2551  *
2552  * Implements the concatenation operator.
2553  */
2554 STATIC regnode *
2555 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2556 {
2557     register regnode *ret;
2558     register regnode *chain = NULL;
2559     register regnode *latest;
2560     I32 flags = 0, c = 0;
2561
2562     if (first)
2563         ret = NULL;
2564     else {
2565         if (!SIZE_ONLY && RExC_extralen)
2566             ret = reganode(pRExC_state, BRANCHJ,0);
2567         else {
2568             ret = reg_node(pRExC_state, BRANCH);
2569             Set_Node_Length(ret, 1);
2570         }
2571     }
2572         
2573     if (!first && SIZE_ONLY)
2574         RExC_extralen += 1;                     /* BRANCHJ */
2575
2576     *flagp = WORST;                     /* Tentatively. */
2577
2578     RExC_parse--;
2579     nextchar(pRExC_state);
2580     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2581         flags &= ~TRYAGAIN;
2582         latest = regpiece(pRExC_state, &flags);
2583         if (latest == NULL) {
2584             if (flags & TRYAGAIN)
2585                 continue;
2586             return(NULL);
2587         }
2588         else if (ret == NULL)
2589             ret = latest;
2590         *flagp |= flags&HASWIDTH;
2591         if (chain == NULL)      /* First piece. */
2592             *flagp |= flags&SPSTART;
2593         else {
2594             RExC_naughty++;
2595             regtail(pRExC_state, chain, latest);
2596         }
2597         chain = latest;
2598         c++;
2599     }
2600     if (chain == NULL) {        /* Loop ran zero times. */
2601         chain = reg_node(pRExC_state, NOTHING);
2602         if (ret == NULL)
2603             ret = chain;
2604     }
2605     if (c == 1) {
2606         *flagp |= flags&SIMPLE;
2607     }
2608
2609     return(ret);
2610 }
2611
2612 /*
2613  - regpiece - something followed by possible [*+?]
2614  *
2615  * Note that the branching code sequences used for ? and the general cases
2616  * of * and + are somewhat optimized:  they use the same NOTHING node as
2617  * both the endmarker for their branch list and the body of the last branch.
2618  * It might seem that this node could be dispensed with entirely, but the
2619  * endmarker role is not redundant.
2620  */
2621 STATIC regnode *
2622 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2623 {
2624     register regnode *ret;
2625     register char op;
2626     register char *next;
2627     I32 flags;
2628     char *origparse = RExC_parse;
2629     char *maxpos;
2630     I32 min;
2631     I32 max = REG_INFTY;
2632     char *parse_start;
2633
2634     ret = regatom(pRExC_state, &flags);
2635     if (ret == NULL) {
2636         if (flags & TRYAGAIN)
2637             *flagp |= TRYAGAIN;
2638         return(NULL);
2639     }
2640
2641     op = *RExC_parse;
2642
2643     if (op == '{' && regcurly(RExC_parse)) {
2644         parse_start = RExC_parse; /* MJD */
2645         next = RExC_parse + 1;
2646         maxpos = Nullch;
2647         while (isDIGIT(*next) || *next == ',') {
2648             if (*next == ',') {
2649                 if (maxpos)
2650                     break;
2651                 else
2652                     maxpos = next;
2653             }
2654             next++;
2655         }
2656         if (*next == '}') {             /* got one */
2657             if (!maxpos)
2658                 maxpos = next;
2659             RExC_parse++;
2660             min = atoi(RExC_parse);
2661             if (*maxpos == ',')
2662                 maxpos++;
2663             else
2664                 maxpos = RExC_parse;
2665             max = atoi(maxpos);
2666             if (!max && *maxpos != '0')
2667                 max = REG_INFTY;                /* meaning "infinity" */
2668             else if (max >= REG_INFTY)
2669                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2670             RExC_parse = next;
2671             nextchar(pRExC_state);
2672
2673         do_curly:
2674             if ((flags&SIMPLE)) {
2675                 RExC_naughty += 2 + RExC_naughty / 2;
2676                 reginsert(pRExC_state, CURLY, ret);
2677                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2678                 Set_Node_Cur_Length(ret);
2679             }
2680             else {
2681                 regnode *w = reg_node(pRExC_state, WHILEM);
2682
2683                 w->flags = 0;
2684                 regtail(pRExC_state, ret, w);
2685                 if (!SIZE_ONLY && RExC_extralen) {
2686                     reginsert(pRExC_state, LONGJMP,ret);
2687                     reginsert(pRExC_state, NOTHING,ret);
2688                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2689                 }
2690                 reginsert(pRExC_state, CURLYX,ret);
2691                                 /* MJD hk */
2692                 Set_Node_Offset(ret, parse_start+1);
2693                 Set_Node_Length(ret, 
2694                                 op == '{' ? (RExC_parse - parse_start) : 1);
2695                 
2696                 if (!SIZE_ONLY && RExC_extralen)
2697                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2698                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2699                 if (SIZE_ONLY)
2700                     RExC_whilem_seen++, RExC_extralen += 3;
2701                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2702             }
2703             ret->flags = 0;
2704
2705             if (min > 0)
2706                 *flagp = WORST;
2707             if (max > 0)
2708                 *flagp |= HASWIDTH;
2709             if (max && max < min)
2710                 vFAIL("Can't do {n,m} with n > m");
2711             if (!SIZE_ONLY) {
2712                 ARG1_SET(ret, (U16)min);
2713                 ARG2_SET(ret, (U16)max);
2714             }
2715
2716             goto nest_check;
2717         }
2718     }
2719
2720     if (!ISMULT1(op)) {
2721         *flagp = flags;
2722         return(ret);
2723     }
2724
2725 #if 0                           /* Now runtime fix should be reliable. */
2726
2727     /* if this is reinstated, don't forget to put this back into perldiag:
2728
2729             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2730
2731            (F) The part of the regexp subject to either the * or + quantifier
2732            could match an empty string. The {#} shows in the regular
2733            expression about where the problem was discovered.
2734
2735     */
2736
2737     if (!(flags&HASWIDTH) && op != '?')
2738       vFAIL("Regexp *+ operand could be empty");
2739 #endif
2740
2741     parse_start = RExC_parse;
2742     nextchar(pRExC_state);
2743
2744     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2745
2746     if (op == '*' && (flags&SIMPLE)) {
2747         reginsert(pRExC_state, STAR, ret);
2748         ret->flags = 0;
2749         RExC_naughty += 4;
2750     }
2751     else if (op == '*') {
2752         min = 0;
2753         goto do_curly;
2754     }
2755     else if (op == '+' && (flags&SIMPLE)) {
2756         reginsert(pRExC_state, PLUS, ret);
2757         ret->flags = 0;
2758         RExC_naughty += 3;
2759     }
2760     else if (op == '+') {
2761         min = 1;
2762         goto do_curly;
2763     }
2764     else if (op == '?') {
2765         min = 0; max = 1;
2766         goto do_curly;
2767     }
2768   nest_check:
2769     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2770         vWARN3(RExC_parse,
2771                "%.*s matches null string many times",
2772                RExC_parse - origparse,
2773                origparse);
2774     }
2775
2776     if (*RExC_parse == '?') {
2777         nextchar(pRExC_state);
2778         reginsert(pRExC_state, MINMOD, ret);
2779         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2780     }
2781     if (ISMULT2(RExC_parse)) {
2782         RExC_parse++;
2783         vFAIL("Nested quantifiers");
2784     }
2785
2786     return(ret);
2787 }
2788
2789 /*
2790  - regatom - the lowest level
2791  *
2792  * Optimization:  gobbles an entire sequence of ordinary characters so that
2793  * it can turn them into a single node, which is smaller to store and
2794  * faster to run.  Backslashed characters are exceptions, each becoming a
2795  * separate node; the code is simpler that way and it's not worth fixing.
2796  *
2797  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2798 STATIC regnode *
2799 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2800 {
2801     register regnode *ret = 0;
2802     I32 flags;
2803     char *parse_start = RExC_parse;
2804
2805     *flagp = WORST;             /* Tentatively. */
2806
2807 tryagain:
2808     switch (*RExC_parse) {
2809     case '^':
2810         RExC_seen_zerolen++;
2811         nextchar(pRExC_state);
2812         if (RExC_flags & PMf_MULTILINE)
2813             ret = reg_node(pRExC_state, MBOL);
2814         else if (RExC_flags & PMf_SINGLELINE)
2815             ret = reg_node(pRExC_state, SBOL);
2816         else
2817             ret = reg_node(pRExC_state, BOL);
2818         Set_Node_Length(ret, 1); /* MJD */
2819         break;
2820     case '$':
2821         nextchar(pRExC_state);
2822         if (*RExC_parse)
2823             RExC_seen_zerolen++;
2824         if (RExC_flags & PMf_MULTILINE)
2825             ret = reg_node(pRExC_state, MEOL);
2826         else if (RExC_flags & PMf_SINGLELINE)
2827             ret = reg_node(pRExC_state, SEOL);
2828         else
2829             ret = reg_node(pRExC_state, EOL);
2830         Set_Node_Length(ret, 1); /* MJD */
2831         break;
2832     case '.':
2833         nextchar(pRExC_state);
2834         if (RExC_flags & PMf_SINGLELINE)
2835             ret = reg_node(pRExC_state, SANY);
2836         else
2837             ret = reg_node(pRExC_state, REG_ANY);
2838         *flagp |= HASWIDTH|SIMPLE;
2839         RExC_naughty++;
2840         Set_Node_Length(ret, 1); /* MJD */
2841         break;
2842     case '[':
2843     {
2844         char *oregcomp_parse = ++RExC_parse;
2845         ret = regclass(pRExC_state);
2846         if (*RExC_parse != ']') {
2847             RExC_parse = oregcomp_parse;
2848             vFAIL("Unmatched [");
2849         }
2850         nextchar(pRExC_state);
2851         *flagp |= HASWIDTH|SIMPLE;
2852         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2853         break;
2854     }
2855     case '(':
2856         nextchar(pRExC_state);
2857         ret = reg(pRExC_state, 1, &flags);
2858         if (ret == NULL) {
2859                 if (flags & TRYAGAIN) {
2860                     if (RExC_parse == RExC_end) {
2861                          /* Make parent create an empty node if needed. */
2862                         *flagp |= TRYAGAIN;
2863                         return(NULL);
2864                     }
2865                     goto tryagain;
2866                 }
2867                 return(NULL);
2868         }
2869         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2870         break;
2871     case '|':
2872     case ')':
2873         if (flags & TRYAGAIN) {
2874             *flagp |= TRYAGAIN;
2875             return NULL;
2876         }
2877         vFAIL("Internal urp");
2878                                 /* Supposed to be caught earlier. */
2879         break;
2880     case '{':
2881         if (!regcurly(RExC_parse)) {
2882             RExC_parse++;
2883             goto defchar;
2884         }
2885         /* FALL THROUGH */
2886     case '?':
2887     case '+':
2888     case '*':
2889         RExC_parse++;
2890         vFAIL("Quantifier follows nothing");
2891         break;
2892     case '\\':
2893         switch (*++RExC_parse) {
2894         case 'A':
2895             RExC_seen_zerolen++;
2896             ret = reg_node(pRExC_state, SBOL);
2897             *flagp |= SIMPLE;
2898             nextchar(pRExC_state);
2899             Set_Node_Length(ret, 2); /* MJD */
2900             break;
2901         case 'G':
2902             ret = reg_node(pRExC_state, GPOS);
2903             RExC_seen |= REG_SEEN_GPOS;
2904             *flagp |= SIMPLE;
2905             nextchar(pRExC_state);
2906             Set_Node_Length(ret, 2); /* MJD */
2907             break;
2908         case 'Z':
2909             ret = reg_node(pRExC_state, SEOL);
2910             *flagp |= SIMPLE;
2911             RExC_seen_zerolen++;                /* Do not optimize RE away */
2912             nextchar(pRExC_state);
2913             break;
2914         case 'z':
2915             ret = reg_node(pRExC_state, EOS);
2916             *flagp |= SIMPLE;
2917             RExC_seen_zerolen++;                /* Do not optimize RE away */
2918             nextchar(pRExC_state);
2919             Set_Node_Length(ret, 2); /* MJD */
2920             break;
2921         case 'C':
2922             ret = reg_node(pRExC_state, CANY);
2923             RExC_seen |= REG_SEEN_CANY;
2924             *flagp |= HASWIDTH|SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'X':
2929             ret = reg_node(pRExC_state, CLUMP);
2930             *flagp |= HASWIDTH;
2931             nextchar(pRExC_state);
2932             Set_Node_Length(ret, 2); /* MJD */
2933             break;
2934         case 'w':
2935             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2936             *flagp |= HASWIDTH|SIMPLE;
2937             nextchar(pRExC_state);
2938             Set_Node_Length(ret, 2); /* MJD */
2939             break;
2940         case 'W':
2941             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2942             *flagp |= HASWIDTH|SIMPLE;
2943             nextchar(pRExC_state);
2944             Set_Node_Length(ret, 2); /* MJD */
2945             break;
2946         case 'b':
2947             RExC_seen_zerolen++;
2948             RExC_seen |= REG_SEEN_LOOKBEHIND;
2949             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2950             *flagp |= SIMPLE;
2951             nextchar(pRExC_state);
2952             Set_Node_Length(ret, 2); /* MJD */
2953             break;
2954         case 'B':
2955             RExC_seen_zerolen++;
2956             RExC_seen |= REG_SEEN_LOOKBEHIND;
2957             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2958             *flagp |= SIMPLE;
2959             nextchar(pRExC_state);
2960             Set_Node_Length(ret, 2); /* MJD */
2961             break;
2962         case 's':
2963             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2964             *flagp |= HASWIDTH|SIMPLE;
2965             nextchar(pRExC_state);
2966             Set_Node_Length(ret, 2); /* MJD */
2967             break;
2968         case 'S':
2969             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2970             *flagp |= HASWIDTH|SIMPLE;
2971             nextchar(pRExC_state);
2972             Set_Node_Length(ret, 2); /* MJD */
2973             break;
2974         case 'd':
2975             ret = reg_node(pRExC_state, DIGIT);
2976             *flagp |= HASWIDTH|SIMPLE;
2977             nextchar(pRExC_state);
2978             Set_Node_Length(ret, 2); /* MJD */
2979             break;
2980         case 'D':
2981             ret = reg_node(pRExC_state, NDIGIT);
2982             *flagp |= HASWIDTH|SIMPLE;
2983             nextchar(pRExC_state);
2984             Set_Node_Length(ret, 2); /* MJD */
2985             break;
2986         case 'p':
2987         case 'P':
2988             {   
2989                 char* oldregxend = RExC_end;
2990                 char* parse_start = RExC_parse - 2;
2991
2992                 if (RExC_parse[1] == '{') {
2993                   /* a lovely hack--pretend we saw [\pX] instead */
2994                     RExC_end = strchr(RExC_parse, '}');
2995                     if (!RExC_end) {
2996                         U8 c = (U8)*RExC_parse;
2997                         RExC_parse += 2;
2998                         RExC_end = oldregxend;
2999                         vFAIL2("Missing right brace on \\%c{}", c);
3000                     }
3001                     RExC_end++;
3002                 }
3003                 else {
3004                     RExC_end = RExC_parse + 2;
3005                     if (RExC_end > oldregxend)
3006                         RExC_end = oldregxend;
3007                 }
3008                 RExC_parse--;
3009
3010                 ret = regclass(pRExC_state);
3011
3012                 RExC_end = oldregxend;
3013                 RExC_parse--;
3014
3015                 Set_Node_Offset(ret, parse_start + 2);
3016                 Set_Node_Cur_Length(ret);
3017                 nextchar(pRExC_state);
3018                 *flagp |= HASWIDTH|SIMPLE;
3019             }
3020             break;
3021         case 'n':
3022         case 'r':
3023         case 't':
3024         case 'f':
3025         case 'e':
3026         case 'a':
3027         case 'x':
3028         case 'c':
3029         case '0':
3030             goto defchar;
3031         case '1': case '2': case '3': case '4':
3032         case '5': case '6': case '7': case '8': case '9':
3033             {
3034                 I32 num = atoi(RExC_parse);
3035
3036                 if (num > 9 && num >= RExC_npar)
3037                     goto defchar;
3038                 else {
3039                     char * parse_start = RExC_parse - 1; /* MJD */
3040                     while (isDIGIT(*RExC_parse))
3041                         RExC_parse++;
3042
3043                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3044                         vFAIL("Reference to nonexistent group");
3045                     RExC_sawback = 1;
3046                     ret = reganode(pRExC_state,
3047                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3048                                    num);
3049                     *flagp |= HASWIDTH;
3050                     
3051                     /* override incorrect value set in reganode MJD */
3052                     Set_Node_Offset(ret, parse_start+1); 
3053                     Set_Node_Cur_Length(ret); /* MJD */
3054                     RExC_parse--;
3055                     nextchar(pRExC_state);
3056                 }
3057             }
3058             break;
3059         case '\0':
3060             if (RExC_parse >= RExC_end)
3061                 FAIL("Trailing \\");
3062             /* FALL THROUGH */
3063         default:
3064             /* Do not generate `unrecognized' warnings here, we fall
3065                back into the quick-grab loop below */
3066             parse_start--;
3067             goto defchar;
3068         }
3069         break;
3070
3071     case '#':
3072         if (RExC_flags & PMf_EXTENDED) {
3073             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3074             if (RExC_parse < RExC_end)
3075                 goto tryagain;
3076         }
3077         /* FALL THROUGH */
3078
3079     default: {
3080             register STRLEN len;
3081             register UV ender;
3082             register char *p;
3083             char *oldp, *s;
3084             STRLEN numlen;
3085             STRLEN foldlen;
3086             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3087
3088             parse_start = RExC_parse - 1;
3089
3090             RExC_parse++;
3091
3092         defchar:
3093             ender = 0;
3094             ret = reg_node(pRExC_state,
3095                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3096             s = STRING(ret);
3097             for (len = 0, p = RExC_parse - 1;
3098               len < 127 && p < RExC_end;
3099               len++)
3100             {
3101                 oldp = p;
3102
3103                 if (RExC_flags & PMf_EXTENDED)
3104                     p = regwhite(p, RExC_end);
3105                 switch (*p) {
3106                 case '^':
3107                 case '$':
3108                 case '.':
3109                 case '[':
3110                 case '(':
3111                 case ')':
3112                 case '|':
3113                     goto loopdone;
3114                 case '\\':
3115                     switch (*++p) {
3116                     case 'A':
3117                     case 'C':
3118                     case 'X':
3119                     case 'G':
3120                     case 'Z':
3121                     case 'z':
3122                     case 'w':
3123                     case 'W':
3124                     case 'b':
3125                     case 'B':
3126                     case 's':
3127                     case 'S':
3128                     case 'd':
3129                     case 'D':
3130                     case 'p':
3131                     case 'P':
3132                         --p;
3133                         goto loopdone;
3134                     case 'n':
3135                         ender = '\n';
3136                         p++;
3137                         break;
3138                     case 'r':
3139                         ender = '\r';
3140                         p++;
3141                         break;
3142                     case 't':
3143                         ender = '\t';
3144                         p++;
3145                         break;
3146                     case 'f':
3147                         ender = '\f';
3148                         p++;
3149                         break;
3150                     case 'e':
3151                           ender = ASCII_TO_NATIVE('\033');
3152                         p++;
3153                         break;
3154                     case 'a':
3155                           ender = ASCII_TO_NATIVE('\007');
3156                         p++;
3157                         break;
3158                     case 'x':
3159                         if (*++p == '{') {
3160                             char* e = strchr(p, '}');
3161         
3162                             if (!e) {
3163                                 RExC_parse = p + 1;
3164                                 vFAIL("Missing right brace on \\x{}");
3165                             }
3166                             else {
3167                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3168                                     | PERL_SCAN_DISALLOW_PREFIX;
3169                                 numlen = e - p - 1;
3170                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3171                                 if (ender > 0xff)
3172                                     RExC_utf8 = 1;
3173                                 p = e + 1;
3174                             }
3175                         }
3176                         else {
3177                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3178                             numlen = 2;
3179                             ender = grok_hex(p, &numlen, &flags, NULL);
3180                             p += numlen;
3181                         }
3182                         break;
3183                     case 'c':
3184                         p++;
3185                         ender = UCHARAT(p++);
3186                         ender = toCTRL(ender);
3187                         break;
3188                     case '0': case '1': case '2': case '3':case '4':
3189                     case '5': case '6': case '7': case '8':case '9':
3190                         if (*p == '0' ||
3191                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3192                             I32 flags = 0;
3193                             numlen = 3;
3194                             ender = grok_oct(p, &numlen, &flags, NULL);
3195                             p += numlen;
3196                         }
3197                         else {
3198                             --p;
3199                             goto loopdone;
3200                         }
3201                         break;
3202                     case '\0':
3203                         if (p >= RExC_end)
3204                             FAIL("Trailing \\");
3205                         /* FALL THROUGH */
3206                     default:
3207                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3208                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3209                         goto normal_default;
3210                     }
3211                     break;
3212                 default:
3213                   normal_default:
3214                     if (UTF8_IS_START(*p) && UTF) {
3215                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3216                                                &numlen, 0);
3217                         p += numlen;
3218                     }
3219                     else
3220                         ender = *p++;
3221                     break;
3222                 }
3223                 if (RExC_flags & PMf_EXTENDED)
3224                     p = regwhite(p, RExC_end);
3225                 if (UTF && FOLD) {
3226                     /* Prime the casefolded buffer. */
3227                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3228                 }
3229                 if (ISMULT2(p)) { /* Back off on ?+*. */
3230                     if (len)
3231                         p = oldp;
3232                     else if (UTF) {
3233                          STRLEN unilen;
3234
3235                          if (FOLD) {
3236                               /* Emit all the Unicode characters. */
3237                               for (foldbuf = tmpbuf;
3238                                    foldlen;
3239                                    foldlen -= numlen) {
3240                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3241                                    if (numlen > 0) {
3242                                         reguni(pRExC_state, ender, s, &unilen);
3243                                         s       += unilen;
3244                                         len     += unilen;
3245                                         /* In EBCDIC the numlen
3246                                          * and unilen can differ. */
3247                                         foldbuf += numlen;
3248                                         if (numlen >= foldlen)
3249                                              break;
3250                                    }
3251                                    else
3252                                         break; /* "Can't happen." */
3253                               }
3254                          }
3255                          else {
3256                               reguni(pRExC_state, ender, s, &unilen);
3257                               if (unilen > 0) {
3258                                    s   += unilen;
3259                                    len += unilen;
3260                               }
3261                          }
3262                     }
3263                     else {
3264                         len++;
3265                         REGC((char)ender, s++);
3266                     }
3267                     break;
3268                 }
3269                 if (UTF) {
3270                      STRLEN unilen;
3271
3272                      if (FOLD) {
3273                           /* Emit all the Unicode characters. */
3274                           for (foldbuf = tmpbuf;
3275                                foldlen;
3276                                foldlen -= numlen) {
3277                                ender = utf8_to_uvchr(foldbuf, &numlen);
3278                                if (numlen > 0) {
3279                                     reguni(pRExC_state, ender, s, &unilen);
3280                                     len     += unilen;
3281                                     s       += unilen;
3282                                     /* In EBCDIC the numlen
3283                                      * and unilen can differ. */
3284                                     foldbuf += numlen;
3285                                     if (numlen >= foldlen)
3286                                          break;
3287                                }
3288                                else
3289                                     break;
3290                           }
3291                      }
3292                      else {
3293                           reguni(pRExC_state, ender, s, &unilen);
3294                           if (unilen > 0) {
3295                                s   += unilen;
3296                                len += unilen;
3297                           }
3298                      }
3299                      len--;
3300                 }
3301                 else
3302                     REGC((char)ender, s++);
3303             }
3304         loopdone:
3305             RExC_parse = p - 1;
3306             Set_Node_Cur_Length(ret); /* MJD */
3307             nextchar(pRExC_state);
3308             {
3309                 /* len is STRLEN which is unsigned, need to copy to signed */
3310                 IV iv = len;
3311                 if (iv < 0)
3312                     vFAIL("Internal disaster");
3313             }
3314             if (len > 0)
3315                 *flagp |= HASWIDTH;
3316             if (len == 1 && UNI_IS_INVARIANT(ender))
3317                 *flagp |= SIMPLE;
3318             if (!SIZE_ONLY)
3319                 STR_LEN(ret) = len;
3320             if (SIZE_ONLY)
3321                 RExC_size += STR_SZ(len);
3322             else
3323                 RExC_emit += STR_SZ(len);
3324         }
3325         break;
3326     }
3327
3328     /* If the encoding pragma is in effect recode the text of
3329      * any EXACT-kind nodes. */
3330     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331         STRLEN oldlen = STR_LEN(ret);
3332         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3333
3334         if (RExC_utf8)
3335             SvUTF8_on(sv);
3336         if (sv_utf8_downgrade(sv, TRUE)) {
3337             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3338             STRLEN newlen = SvCUR(sv);
3339
3340             if (SvUTF8(sv))
3341                 RExC_utf8 = 1;
3342             if (!SIZE_ONLY) {
3343                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344                                       (int)oldlen, STRING(ret),
3345                                       (int)newlen, s));
3346                 Copy(s, STRING(ret), newlen, char);
3347                 STR_LEN(ret) += newlen - oldlen;
3348                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3349             } else
3350                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3351         }
3352     }
3353
3354     return(ret);
3355 }
3356
3357 STATIC char *
3358 S_regwhite(pTHX_ char *p, char *e)
3359 {
3360     while (p < e) {
3361         if (isSPACE(*p))
3362             ++p;
3363         else if (*p == '#') {
3364             do {
3365                 p++;
3366             } while (p < e && *p != '\n');
3367         }
3368         else
3369             break;
3370     }
3371     return p;
3372 }
3373
3374 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375    Character classes ([:foo:]) can also be negated ([:^foo:]).
3376    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378    but trigger failures because they are currently unimplemented. */
3379
3380 #define POSIXCC_DONE(c)   ((c) == ':')
3381 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3383
3384 STATIC I32
3385 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3386 {
3387     char *posixcc = 0;
3388     I32 namedclass = OOB_NAMEDCLASS;
3389
3390     if (value == '[' && RExC_parse + 1 < RExC_end &&
3391         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392         POSIXCC(UCHARAT(RExC_parse))) {
3393         char  c = UCHARAT(RExC_parse);
3394         char* s = RExC_parse++;
3395         
3396         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3397             RExC_parse++;
3398         if (RExC_parse == RExC_end)
3399             /* Grandfather lone [:, [=, [. */
3400             RExC_parse = s;
3401         else {
3402             char* t = RExC_parse++; /* skip over the c */
3403
3404             if (UCHARAT(RExC_parse) == ']') {
3405                 RExC_parse++; /* skip over the ending ] */
3406                 posixcc = s + 1;
3407                 if (*s == ':') {
3408                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3409                     I32 skip = 5; /* the most common skip */
3410
3411                     switch (*posixcc) {
3412                     case 'a':
3413                         if (strnEQ(posixcc, "alnum", 5))
3414                             namedclass =
3415                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3416                         else if (strnEQ(posixcc, "alpha", 5))
3417                             namedclass =
3418                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3419                         else if (strnEQ(posixcc, "ascii", 5))
3420                             namedclass =
3421                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3422                         break;
3423                     case 'b':
3424                         if (strnEQ(posixcc, "blank", 5))
3425                             namedclass =
3426                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3427                         break;
3428                     case 'c':
3429                         if (strnEQ(posixcc, "cntrl", 5))
3430                             namedclass =
3431                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3432                         break;
3433                     case 'd':
3434                         if (strnEQ(posixcc, "digit", 5))
3435                             namedclass =
3436                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3437                         break;
3438                     case 'g':
3439                         if (strnEQ(posixcc, "graph", 5))
3440                             namedclass =
3441                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3442                         break;
3443                     case 'l':
3444                         if (strnEQ(posixcc, "lower", 5))
3445                             namedclass =
3446                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3447                         break;
3448                     case 'p':
3449                         if (strnEQ(posixcc, "print", 5))
3450                             namedclass =
3451                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3452                         else if (strnEQ(posixcc, "punct", 5))
3453                             namedclass =
3454                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3455                         break;
3456                     case 's':
3457                         if (strnEQ(posixcc, "space", 5))
3458                             namedclass =
3459                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3460                         break;
3461                     case 'u':
3462                         if (strnEQ(posixcc, "upper", 5))
3463                             namedclass =
3464                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3465                         break;
3466                     case 'w': /* this is not POSIX, this is the Perl \w */
3467                         if (strnEQ(posixcc, "word", 4)) {
3468                             namedclass =
3469                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3470                             skip = 4;
3471                         }
3472                         break;
3473                     case 'x':
3474                         if (strnEQ(posixcc, "xdigit", 6)) {
3475                             namedclass =
3476                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3477                             skip = 6;
3478                         }
3479                         break;
3480                     }
3481                     if (namedclass == OOB_NAMEDCLASS ||
3482                         posixcc[skip] != ':' ||
3483                         posixcc[skip+1] != ']')
3484                     {
3485                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3486                                       t - s - 1, s + 1);
3487                     }
3488                 } else if (!SIZE_ONLY) {
3489                     /* [[=foo=]] and [[.foo.]] are still future. */
3490
3491                     /* adjust RExC_parse so the warning shows after
3492                        the class closes */
3493                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3494                         RExC_parse++;
3495                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3496                 }
3497             } else {
3498                 /* Maternal grandfather:
3499                  * "[:" ending in ":" but not in ":]" */
3500                 RExC_parse = s;
3501             }
3502         }
3503     }
3504
3505     return namedclass;
3506 }
3507
3508 STATIC void
3509 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3510 {
3511     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3512         char *s = RExC_parse;
3513         char  c = *s++;
3514
3515         while(*s && isALNUM(*s))
3516             s++;
3517         if (*s && c == *s && s[1] == ']') {
3518             if (ckWARN(WARN_REGEXP))
3519                 vWARN3(s+2,
3520                         "POSIX syntax [%c %c] belongs inside character classes",
3521                         c, c);
3522
3523             /* [[=foo=]] and [[.foo.]] are still future. */
3524             if (POSIXCC_NOTYET(c)) {
3525                 /* adjust RExC_parse so the error shows after
3526                    the class closes */
3527                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3528                     ;
3529                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3530             }
3531         }
3532     }
3533 }
3534
3535 STATIC regnode *
3536 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3537 {
3538     register UV value;
3539     register UV nextvalue;
3540     register IV prevvalue = OOB_UNICODE;
3541     register IV range = 0;
3542     register regnode *ret;
3543     STRLEN numlen;
3544     IV namedclass;
3545     char *rangebegin = 0;
3546     bool need_class = 0;
3547     SV *listsv = Nullsv;
3548     register char *e;
3549     UV n;
3550     bool optimize_invert   = TRUE;
3551     AV* unicode_alternate  = 0;
3552 #ifdef EBCDIC
3553     UV literal_endpoint = 0;
3554 #endif
3555
3556     ret = reganode(pRExC_state, ANYOF, 0);
3557
3558     if (!SIZE_ONLY)
3559         ANYOF_FLAGS(ret) = 0;
3560
3561     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3562         RExC_naughty++;
3563         RExC_parse++;
3564         if (!SIZE_ONLY)
3565             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3566     }
3567
3568     if (SIZE_ONLY)
3569         RExC_size += ANYOF_SKIP;
3570     else {
3571         RExC_emit += ANYOF_SKIP;
3572         if (FOLD)
3573             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3574         if (LOC)
3575             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3576         ANYOF_BITMAP_ZERO(ret);
3577         listsv = newSVpvn("# comment\n", 10);
3578     }
3579
3580     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3581
3582     if (!SIZE_ONLY && POSIXCC(nextvalue))
3583         checkposixcc(pRExC_state);
3584
3585     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3586     if (UCHARAT(RExC_parse) == ']')
3587         goto charclassloop;
3588
3589     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3590
3591     charclassloop:
3592
3593         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3594
3595         if (!range)
3596             rangebegin = RExC_parse;
3597         if (UTF) {
3598             value = utf8n_to_uvchr((U8*)RExC_parse,
3599                                    RExC_end - RExC_parse,
3600                                    &numlen, 0);
3601             RExC_parse += numlen;
3602         }
3603         else
3604             value = UCHARAT(RExC_parse++);
3605         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606         if (value == '[' && POSIXCC(nextvalue))
3607             namedclass = regpposixcc(pRExC_state, value);
3608         else if (value == '\\') {
3609             if (UTF) {
3610                 value = utf8n_to_uvchr((U8*)RExC_parse,
3611                                    RExC_end - RExC_parse,
3612                                    &numlen, 0);
3613                 RExC_parse += numlen;
3614             }
3615             else
3616                 value = UCHARAT(RExC_parse++);
3617             /* Some compilers cannot handle switching on 64-bit integer
3618              * values, therefore value cannot be an UV.  Yes, this will
3619              * be a problem later if we want switch on Unicode.
3620              * A similar issue a little bit later when switching on
3621              * namedclass. --jhi */
3622             switch ((I32)value) {
3623             case 'w':   namedclass = ANYOF_ALNUM;       break;
3624             case 'W':   namedclass = ANYOF_NALNUM;      break;
3625             case 's':   namedclass = ANYOF_SPACE;       break;
3626             case 'S':   namedclass = ANYOF_NSPACE;      break;
3627             case 'd':   namedclass = ANYOF_DIGIT;       break;
3628             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3629             case 'p':
3630             case 'P':
3631                 if (RExC_parse >= RExC_end)
3632                     vFAIL2("Empty \\%c{}", (U8)value);
3633                 if (*RExC_parse == '{') {
3634                     U8 c = (U8)value;
3635                     e = strchr(RExC_parse++, '}');
3636                     if (!e)
3637                         vFAIL2("Missing right brace on \\%c{}", c);
3638                     while (isSPACE(UCHARAT(RExC_parse)))
3639                         RExC_parse++;
3640                     if (e == RExC_parse)
3641                         vFAIL2("Empty \\%c{}", c);
3642                     n = e - RExC_parse;
3643                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3644                         n--;
3645                 }
3646                 else {
3647                     e = RExC_parse;
3648                     n = 1;
3649                 }
3650                 if (!SIZE_ONLY) {
3651                     if (UCHARAT(RExC_parse) == '^') {
3652                          RExC_parse++;
3653                          n--;
3654                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3655                          while (isSPACE(UCHARAT(RExC_parse))) {
3656                               RExC_parse++;
3657                               n--;
3658                          }
3659                     }
3660                     if (value == 'p')
3661                          Perl_sv_catpvf(aTHX_ listsv,
3662                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3663                     else
3664                          Perl_sv_catpvf(aTHX_ listsv,
3665                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3666                 }
3667                 RExC_parse = e + 1;
3668                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3669                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3670                 break;
3671             case 'n':   value = '\n';                   break;
3672             case 'r':   value = '\r';                   break;
3673             case 't':   value = '\t';                   break;
3674             case 'f':   value = '\f';                   break;
3675             case 'b':   value = '\b';                   break;
3676             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3677             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3678             case 'x':
3679                 if (*RExC_parse == '{') {
3680                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3681                         | PERL_SCAN_DISALLOW_PREFIX;
3682                     e = strchr(RExC_parse++, '}');
3683                     if (!e)
3684                         vFAIL("Missing right brace on \\x{}");
3685
3686                     numlen = e - RExC_parse;
3687                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3688                     RExC_parse = e + 1;
3689                 }
3690                 else {
3691                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3692                     numlen = 2;
3693                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3694                     RExC_parse += numlen;
3695                 }
3696                 break;
3697             case 'c':
3698                 value = UCHARAT(RExC_parse++);
3699                 value = toCTRL(value);
3700                 break;
3701             case '0': case '1': case '2': case '3': case '4':
3702             case '5': case '6': case '7': case '8': case '9':
3703             {
3704                 I32 flags = 0;
3705                 numlen = 3;
3706                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3707                 RExC_parse += numlen;
3708                 break;
3709             }
3710             default:
3711                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3712                     vWARN2(RExC_parse,
3713                            "Unrecognized escape \\%c in character class passed through",
3714                            (int)value);
3715                 break;
3716             }
3717         } /* end of \blah */
3718 #ifdef EBCDIC
3719         else
3720             literal_endpoint++;
3721 #endif
3722
3723         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3724
3725             if (!SIZE_ONLY && !need_class)
3726                 ANYOF_CLASS_ZERO(ret);
3727
3728             need_class = 1;
3729
3730             /* a bad range like a-\d, a-[:digit:] ? */
3731             if (range) {
3732                 if (!SIZE_ONLY) {
3733                     if (ckWARN(WARN_REGEXP))
3734                         vWARN4(RExC_parse,
3735                                "False [] range \"%*.*s\"",
3736                                RExC_parse - rangebegin,
3737                                RExC_parse - rangebegin,
3738                                rangebegin);
3739                     if (prevvalue < 256) {
3740                         ANYOF_BITMAP_SET(ret, prevvalue);
3741                         ANYOF_BITMAP_SET(ret, '-');
3742                     }
3743                     else {
3744                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3745                         Perl_sv_catpvf(aTHX_ listsv,
3746                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3747                     }
3748                 }
3749
3750                 range = 0; /* this was not a true range */
3751             }
3752
3753             if (!SIZE_ONLY) {
3754                 if (namedclass > OOB_NAMEDCLASS)
3755                     optimize_invert = FALSE;
3756                 /* Possible truncation here but in some 64-bit environments
3757                  * the compiler gets heartburn about switch on 64-bit values.
3758                  * A similar issue a little earlier when switching on value.
3759                  * --jhi */
3760                 switch ((I32)namedclass) {
3761                 case ANYOF_ALNUM:
3762                     if (LOC)
3763                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3764                     else {
3765                         for (value = 0; value < 256; value++)
3766                             if (isALNUM(value))
3767                                 ANYOF_BITMAP_SET(ret, value);
3768                     }
3769                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3770                     break;
3771                 case ANYOF_NALNUM:
3772                     if (LOC)
3773                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3774                     else {
3775                         for (value = 0; value < 256; value++)
3776                             if (!isALNUM(value))
3777                                 ANYOF_BITMAP_SET(ret, value);
3778                     }
3779                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3780                     break;
3781                 case ANYOF_ALNUMC:
3782                     if (LOC)
3783                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3784                     else {
3785                         for (value = 0; value < 256; value++)
3786                             if (isALNUMC(value))
3787                                 ANYOF_BITMAP_SET(ret, value);
3788                     }
3789                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3790                     break;
3791                 case ANYOF_NALNUMC:
3792                     if (LOC)
3793                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3794                     else {
3795                         for (value = 0; value < 256; value++)
3796                             if (!isALNUMC(value))
3797                                 ANYOF_BITMAP_SET(ret, value);
3798                     }
3799                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3800                     break;
3801                 case ANYOF_ALPHA:
3802                     if (LOC)
3803                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3804                     else {
3805                         for (value = 0; value < 256; value++)
3806                             if (isALPHA(value))
3807                                 ANYOF_BITMAP_SET(ret, value);
3808                     }
3809                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3810                     break;
3811                 case ANYOF_NALPHA:
3812                     if (LOC)
3813                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3814                     else {
3815                         for (value = 0; value < 256; value++)
3816                             if (!isALPHA(value))
3817                                 ANYOF_BITMAP_SET(ret, value);
3818                     }
3819                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3820                     break;
3821                 case ANYOF_ASCII:
3822                     if (LOC)
3823                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3824                     else {
3825 #ifndef EBCDIC
3826                         for (value = 0; value < 128; value++)
3827                             ANYOF_BITMAP_SET(ret, value);
3828 #else  /* EBCDIC */
3829                         for (value = 0; value < 256; value++) {
3830                             if (isASCII(value))
3831                                 ANYOF_BITMAP_SET(ret, value);
3832                         }
3833 #endif /* EBCDIC */
3834                     }
3835                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3836                     break;
3837                 case ANYOF_NASCII:
3838                     if (LOC)
3839                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3840                     else {
3841 #ifndef EBCDIC
3842                         for (value = 128; value < 256; value++)
3843                             ANYOF_BITMAP_SET(ret, value);
3844 #else  /* EBCDIC */
3845                         for (value = 0; value < 256; value++) {
3846                             if (!isASCII(value))
3847                                 ANYOF_BITMAP_SET(ret, value);
3848                         }
3849 #endif /* EBCDIC */
3850                     }
3851                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3852                     break;
3853                 case ANYOF_BLANK:
3854                     if (LOC)
3855                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3856                     else {
3857                         for (value = 0; value < 256; value++)
3858                             if (isBLANK(value))
3859                                 ANYOF_BITMAP_SET(ret, value);
3860                     }
3861                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3862                     break;
3863                 case ANYOF_NBLANK:
3864                     if (LOC)
3865                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3866                     else {
3867                         for (value = 0; value < 256; value++)
3868                             if (!isBLANK(value))
3869                                 ANYOF_BITMAP_SET(ret, value);
3870                     }
3871                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3872                     break;
3873                 case ANYOF_CNTRL:
3874                     if (LOC)
3875                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3876                     else {
3877                         for (value = 0; value < 256; value++)
3878                             if (isCNTRL(value))
3879                                 ANYOF_BITMAP_SET(ret, value);
3880                     }
3881                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3882                     break;
3883                 case ANYOF_NCNTRL:
3884                     if (LOC)
3885                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3886                     else {
3887                         for (value = 0; value < 256; value++)
3888                             if (!isCNTRL(value))
3889                                 ANYOF_BITMAP_SET(ret, value);
3890                     }
3891                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3892                     break;
3893                 case ANYOF_DIGIT:
3894                     if (LOC)
3895                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3896                     else {
3897                         /* consecutive digits assumed */
3898                         for (value = '0'; value <= '9'; value++)
3899                             ANYOF_BITMAP_SET(ret, value);
3900                     }
3901                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3902                     break;
3903                 case ANYOF_NDIGIT:
3904                     if (LOC)
3905                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3906                     else {
3907                         /* consecutive digits assumed */
3908                         for (value = 0; value < '0'; value++)
3909                             ANYOF_BITMAP_SET(ret, value);
3910                         for (value = '9' + 1; value < 256; value++)
3911                             ANYOF_BITMAP_SET(ret, value);
3912                     }
3913                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3914                     break;
3915                 case ANYOF_GRAPH:
3916                     if (LOC)
3917                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3918                     else {
3919                         for (value = 0; value < 256; value++)
3920                             if (isGRAPH(value))
3921                                 ANYOF_BITMAP_SET(ret, value);
3922                     }
3923                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3924                     break;
3925                 case ANYOF_NGRAPH: