Win32 pacifying from mjd.
[perl.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U16         flags16;                /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags16    (pRExC_state->flags16)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if ((U32)data->offset_float_max > (U32)I32_MAX)
509                 data->offset_float_max = I32_MAX;
510             if (data->flags & SF_BEFORE_EOL)
511                 data->flags
512                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513             else
514                 data->flags &= ~SF_FL_BEFORE_EOL;
515         }
516     }
517     SvCUR_set(data->last_found, 0);
518     data->last_end = -1;
519     data->flags &= ~SF_BEFORE_EOL;
520 }
521
522 /* Can match anything (initialization) */
523 STATIC void
524 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
525 {
526     ANYOF_CLASS_ZERO(cl);
527     ANYOF_BITMAP_SETALL(cl);
528     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
529     if (LOC)
530         cl->flags |= ANYOF_LOCALE;
531 }
532
533 /* Can match anything (initialization) */
534 STATIC int
535 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
536 {
537     int value;
538
539     for (value = 0; value <= ANYOF_MAX; value += 2)
540         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541             return 1;
542     if (!(cl->flags & ANYOF_UNICODE_ALL))
543         return 0;
544     if (!ANYOF_BITMAP_TESTALLSET(cl))
545         return 0;
546     return 1;
547 }
548
549 /* Can match anything (initialization) */
550 STATIC void
551 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 {
553     Zero(cl, 1, struct regnode_charclass_class);
554     cl->type = ANYOF;
555     cl_anything(pRExC_state, cl);
556 }
557
558 STATIC void
559 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
560 {
561     Zero(cl, 1, struct regnode_charclass_class);
562     cl->type = ANYOF;
563     cl_anything(pRExC_state, cl);
564     if (LOC)
565         cl->flags |= ANYOF_LOCALE;
566 }
567
568 /* 'And' a given class with another one.  Can create false positives */
569 /* We assume that cl is not inverted */
570 STATIC void
571 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572          struct regnode_charclass_class *and_with)
573 {
574     if (!(and_with->flags & ANYOF_CLASS)
575         && !(cl->flags & ANYOF_CLASS)
576         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577         && !(and_with->flags & ANYOF_FOLD)
578         && !(cl->flags & ANYOF_FOLD)) {
579         int i;
580
581         if (and_with->flags & ANYOF_INVERT)
582             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583                 cl->bitmap[i] &= ~and_with->bitmap[i];
584         else
585             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586                 cl->bitmap[i] &= and_with->bitmap[i];
587     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588     if (!(and_with->flags & ANYOF_EOS))
589         cl->flags &= ~ANYOF_EOS;
590
591     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592         cl->flags &= ~ANYOF_UNICODE_ALL;
593         cl->flags |= ANYOF_UNICODE;
594         ARG_SET(cl, ARG(and_with));
595     }
596     if (!(and_with->flags & ANYOF_UNICODE_ALL))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599         cl->flags &= ~ANYOF_UNICODE;
600 }
601
602 /* 'OR' a given class with another one.  Can create false positives */
603 /* We assume that cl is not inverted */
604 STATIC void
605 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
606 {
607     if (or_with->flags & ANYOF_INVERT) {
608         /* We do not use
609          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610          *   <= (B1 | !B2) | (CL1 | !CL2)
611          * which is wasteful if CL2 is small, but we ignore CL2:
612          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613          * XXXX Can we handle case-fold?  Unclear:
614          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616          */
617         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618              && !(or_with->flags & ANYOF_FOLD)
619              && !(cl->flags & ANYOF_FOLD) ) {
620             int i;
621
622             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623                 cl->bitmap[i] |= ~or_with->bitmap[i];
624         } /* XXXX: logic is complicated otherwise */
625         else {
626             cl_anything(pRExC_state, cl);
627         }
628     } else {
629         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
631              && (!(or_with->flags & ANYOF_FOLD)
632                  || (cl->flags & ANYOF_FOLD)) ) {
633             int i;
634
635             /* OR char bitmap and class bitmap separately */
636             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637                 cl->bitmap[i] |= or_with->bitmap[i];
638             if (or_with->flags & ANYOF_CLASS) {
639                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640                     cl->classflags[i] |= or_with->classflags[i];
641                 cl->flags |= ANYOF_CLASS;
642             }
643         }
644         else { /* XXXX: logic is complicated, leave it along for a moment. */
645             cl_anything(pRExC_state, cl);
646         }
647     }
648     if (or_with->flags & ANYOF_EOS)
649         cl->flags |= ANYOF_EOS;
650
651     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652         ARG(cl) != ARG(or_with)) {
653         cl->flags |= ANYOF_UNICODE_ALL;
654         cl->flags &= ~ANYOF_UNICODE;
655     }
656     if (or_with->flags & ANYOF_UNICODE_ALL) {
657         cl->flags |= ANYOF_UNICODE_ALL;
658         cl->flags &= ~ANYOF_UNICODE;
659     }
660 }
661
662 /*
663  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664  * These need to be revisited when a newer toolchain becomes available.
665  */
666 #if defined(__sparc64__) && defined(__GNUC__)
667 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668 #       undef  SPARC64_GCC_WORKAROUND
669 #       define SPARC64_GCC_WORKAROUND 1
670 #   endif
671 #endif
672
673 /* REx optimizer.  Converts nodes into quickier variants "in place".
674    Finds fixed substrings.  */
675
676 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677    to the position after last scanned or to NULL. */
678
679 STATIC I32
680 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
681                         /* scanp: Start here (read-write). */
682                         /* deltap: Write maxlen-minlen here. */
683                         /* last: Stop before this one. */
684 {
685     I32 min = 0, pars = 0, code;
686     regnode *scan = *scanp, *next;
687     I32 delta = 0;
688     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
689     int is_inf_internal = 0;            /* The studied chunk is infinite */
690     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691     scan_data_t data_fake;
692     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
693
694     while (scan && OP(scan) != END && scan < last) {
695         /* Peephole optimizer: */
696
697         if (PL_regkind[(U8)OP(scan)] == EXACT) {
698             /* Merge several consecutive EXACTish nodes into one. */
699             regnode *n = regnext(scan);
700             U32 stringok = 1;
701 #ifdef DEBUGGING
702             regnode *stop = scan;
703 #endif
704
705             next = scan + NODE_SZ_STR(scan);
706             /* Skip NOTHING, merge EXACT*. */
707             while (n &&
708                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
709                      (stringok && (OP(n) == OP(scan))))
710                    && NEXT_OFF(n)
711                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712                 if (OP(n) == TAIL || n > next)
713                     stringok = 0;
714                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
715                     NEXT_OFF(scan) += NEXT_OFF(n);
716                     next = n + NODE_STEP_REGNODE;
717 #ifdef DEBUGGING
718                     if (stringok)
719                         stop = n;
720 #endif
721                     n = regnext(n);
722                 }
723                 else if (stringok) {
724                     int oldl = STR_LEN(scan);
725                     regnode *nnext = regnext(n);
726
727                     if (oldl + STR_LEN(n) > U8_MAX)
728                         break;
729                     NEXT_OFF(scan) += NEXT_OFF(n);
730                     STR_LEN(scan) += STR_LEN(n);
731                     next = n + NODE_SZ_STR(n);
732                     /* Now we can overwrite *n : */
733                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
734 #ifdef DEBUGGING
735                     stop = next - 1;
736 #endif
737                     n = nnext;
738                 }
739             }
740
741             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
742 /*
743   Two problematic code points in Unicode casefolding of EXACT nodes:
744
745    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
747
748    which casefold to
749
750    Unicode                      UTF-8
751
752    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
753    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
754
755    This means that in case-insensitive matching (or "loose matching",
756    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757    length of the above casefolded versions) can match a target string
758    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759    This would rather mess up the minimum length computation.
760
761    What we'll do is to look for the tail four bytes, and then peek
762    at the preceding two bytes to see whether we need to decrease
763    the minimum length by four (six minus two).
764
765    Thanks to the design of UTF-8, there cannot be false matches:
766    A sequence of valid UTF-8 bytes cannot be a subsequence of
767    another valid sequence of UTF-8 bytes.
768
769 */
770                  char *s0 = STRING(scan), *s, *t;
771                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772                  char *t0 = "\xcc\x88\xcc\x81";
773                  char *t1 = t0 + 3;
774                  
775                  for (s = s0 + 2;
776                       s < s2 && (t = ninstr(s, s1, t0, t1));
777                       s = t + 4) {
778                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
780                            min -= 4;
781                  }
782             }
783
784 #ifdef DEBUGGING
785             /* Allow dumping */
786             n = scan + NODE_SZ_STR(scan);
787             while (n <= stop) {
788                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
789                     OP(n) = OPTIMIZED;
790                     NEXT_OFF(n) = 0;
791                 }
792                 n++;
793             }
794 #endif
795         }
796         /* Follow the next-chain of the current node and optimize
797            away all the NOTHINGs from it.  */
798         if (OP(scan) != CURLYX) {
799             int max = (reg_off_by_arg[OP(scan)]
800                        ? I32_MAX
801                        /* I32 may be smaller than U16 on CRAYs! */
802                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
803             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
804             int noff;
805             regnode *n = scan;
806         
807             /* Skip NOTHING and LONGJMP. */
808             while ((n = regnext(n))
809                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
810                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811                    && off + noff < max)
812                 off += noff;
813             if (reg_off_by_arg[OP(scan)])
814                 ARG(scan) = off;
815             else
816                 NEXT_OFF(scan) = off;
817         }
818         /* The principal pseudo-switch.  Cannot be a switch, since we
819            look into several different things.  */
820         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
821                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822             next = regnext(scan);
823             code = OP(scan);
824         
825             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
826                 I32 max1 = 0, min1 = I32_MAX, num = 0;
827                 struct regnode_charclass_class accum;
828                 
829                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
831                 if (flags & SCF_DO_STCLASS)
832                     cl_init_zero(pRExC_state, &accum);
833                 while (OP(scan) == code) {
834                     I32 deltanext, minnext, f = 0, fake;
835                     struct regnode_charclass_class this_class;
836
837                     num++;
838                     data_fake.flags = 0;
839                     if (data) {         
840                         data_fake.whilem_c = data->whilem_c;
841                         data_fake.last_closep = data->last_closep;
842                     }
843                     else
844                         data_fake.last_closep = &fake;
845                     next = regnext(scan);
846                     scan = NEXTOPER(scan);
847                     if (code != BRANCH)
848                         scan = NEXTOPER(scan);
849                     if (flags & SCF_DO_STCLASS) {
850                         cl_init(pRExC_state, &this_class);
851                         data_fake.start_class = &this_class;
852                         f = SCF_DO_STCLASS_AND;
853                     }           
854                     if (flags & SCF_WHILEM_VISITED_POS)
855                         f |= SCF_WHILEM_VISITED_POS;
856                     /* we suppose the run is continuous, last=next...*/
857                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
858                                           next, &data_fake, f);
859                     if (min1 > minnext)
860                         min1 = minnext;
861                     if (max1 < minnext + deltanext)
862                         max1 = minnext + deltanext;
863                     if (deltanext == I32_MAX)
864                         is_inf = is_inf_internal = 1;
865                     scan = next;
866                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867                         pars++;
868                     if (data && (data_fake.flags & SF_HAS_EVAL))
869                         data->flags |= SF_HAS_EVAL;
870                     if (data)
871                         data->whilem_c = data_fake.whilem_c;
872                     if (flags & SCF_DO_STCLASS)
873                         cl_or(pRExC_state, &accum, &this_class);
874                     if (code == SUSPEND)
875                         break;
876                 }
877                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878                     min1 = 0;
879                 if (flags & SCF_DO_SUBSTR) {
880                     data->pos_min += min1;
881                     data->pos_delta += max1 - min1;
882                     if (max1 != min1 || is_inf)
883                         data->longest = &(data->longest_float);
884                 }
885                 min += min1;
886                 delta += max1 - min1;
887                 if (flags & SCF_DO_STCLASS_OR) {
888                     cl_or(pRExC_state, data->start_class, &accum);
889                     if (min1) {
890                         cl_and(data->start_class, &and_with);
891                         flags &= ~SCF_DO_STCLASS;
892                     }
893                 }
894                 else if (flags & SCF_DO_STCLASS_AND) {
895                     if (min1) {
896                         cl_and(data->start_class, &accum);
897                         flags &= ~SCF_DO_STCLASS;
898                     }
899                     else {
900                         /* Switch to OR mode: cache the old value of
901                          * data->start_class */
902                         StructCopy(data->start_class, &and_with,
903                                    struct regnode_charclass_class);
904                         flags &= ~SCF_DO_STCLASS_AND;
905                         StructCopy(&accum, data->start_class,
906                                    struct regnode_charclass_class);
907                         flags |= SCF_DO_STCLASS_OR;
908                         data->start_class->flags |= ANYOF_EOS;
909                     }
910                 }
911             }
912             else if (code == BRANCHJ)   /* single branch is optimized. */
913                 scan = NEXTOPER(NEXTOPER(scan));
914             else                        /* single branch is optimized. */
915                 scan = NEXTOPER(scan);
916             continue;
917         }
918         else if (OP(scan) == EXACT) {
919             I32 l = STR_LEN(scan);
920             UV uc = *((U8*)STRING(scan));
921             if (UTF) {
922                 U8 *s = (U8*)STRING(scan);
923                 l = utf8_length(s, s + l);
924                 uc = utf8_to_uvchr(s, NULL);
925             }
926             min += l;
927             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
928                 /* The code below prefers earlier match for fixed
929                    offset, later match for variable offset.  */
930                 if (data->last_end == -1) { /* Update the start info. */
931                     data->last_start_min = data->pos_min;
932                     data->last_start_max = is_inf
933                         ? I32_MAX : data->pos_min + data->pos_delta;
934                 }
935                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
936                 if (UTF)
937                     SvUTF8_on(data->last_found);
938                 data->last_end = data->pos_min + l;
939                 data->pos_min += l; /* As in the first entry. */
940                 data->flags &= ~SF_BEFORE_EOL;
941             }
942             if (flags & SCF_DO_STCLASS_AND) {
943                 /* Check whether it is compatible with what we know already! */
944                 int compat = 1;
945
946                 if (uc >= 0x100 ||
947                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
948                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
949                     && (!(data->start_class->flags & ANYOF_FOLD)
950                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951                     )
952                     compat = 0;
953                 ANYOF_CLASS_ZERO(data->start_class);
954                 ANYOF_BITMAP_ZERO(data->start_class);
955                 if (compat)
956                     ANYOF_BITMAP_SET(data->start_class, uc);
957                 data->start_class->flags &= ~ANYOF_EOS;
958                 if (uc < 0x100)
959                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960             }
961             else if (flags & SCF_DO_STCLASS_OR) {
962                 /* false positive possible if the class is case-folded */
963                 if (uc < 0x100)
964                     ANYOF_BITMAP_SET(data->start_class, uc);
965                 else
966                     data->start_class->flags |= ANYOF_UNICODE_ALL;
967                 data->start_class->flags &= ~ANYOF_EOS;
968                 cl_and(data->start_class, &and_with);
969             }
970             flags &= ~SCF_DO_STCLASS;
971         }
972         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
973             I32 l = STR_LEN(scan);
974             UV uc = *((U8*)STRING(scan));
975
976             /* Search for fixed substrings supports EXACT only. */
977             if (flags & SCF_DO_SUBSTR)
978                 scan_commit(pRExC_state, data);
979             if (UTF) {
980                 U8 *s = (U8 *)STRING(scan);
981                 l = utf8_length(s, s + l);
982                 uc = utf8_to_uvchr(s, NULL);
983             }
984             min += l;
985             if (data && (flags & SCF_DO_SUBSTR))
986                 data->pos_min += l;
987             if (flags & SCF_DO_STCLASS_AND) {
988                 /* Check whether it is compatible with what we know already! */
989                 int compat = 1;
990
991                 if (uc >= 0x100 ||
992                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
993                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
994                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995                     compat = 0;
996                 ANYOF_CLASS_ZERO(data->start_class);
997                 ANYOF_BITMAP_ZERO(data->start_class);
998                 if (compat) {
999                     ANYOF_BITMAP_SET(data->start_class, uc);
1000                     data->start_class->flags &= ~ANYOF_EOS;
1001                     data->start_class->flags |= ANYOF_FOLD;
1002                     if (OP(scan) == EXACTFL)
1003                         data->start_class->flags |= ANYOF_LOCALE;
1004                 }
1005             }
1006             else if (flags & SCF_DO_STCLASS_OR) {
1007                 if (data->start_class->flags & ANYOF_FOLD) {
1008                     /* false positive possible if the class is case-folded.
1009                        Assume that the locale settings are the same... */
1010                     if (uc < 0x100)
1011                         ANYOF_BITMAP_SET(data->start_class, uc);
1012                     data->start_class->flags &= ~ANYOF_EOS;
1013                 }
1014                 cl_and(data->start_class, &and_with);
1015             }
1016             flags &= ~SCF_DO_STCLASS;
1017         }
1018         else if (strchr((char*)PL_varies,OP(scan))) {
1019             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1020             I32 f = flags, pos_before = 0;
1021             regnode *oscan = scan;
1022             struct regnode_charclass_class this_class;
1023             struct regnode_charclass_class *oclass = NULL;
1024             I32 next_is_eval = 0;
1025
1026             switch (PL_regkind[(U8)OP(scan)]) {
1027             case WHILEM:                /* End of (?:...)* . */
1028                 scan = NEXTOPER(scan);
1029                 goto finish;
1030             case PLUS:
1031                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1032                     next = NEXTOPER(scan);
1033                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034                         mincount = 1;
1035                         maxcount = REG_INFTY;
1036                         next = regnext(scan);
1037                         scan = NEXTOPER(scan);
1038                         goto do_curly;
1039                     }
1040                 }
1041                 if (flags & SCF_DO_SUBSTR)
1042                     data->pos_min++;
1043                 min++;
1044                 /* Fall through. */
1045             case STAR:
1046                 if (flags & SCF_DO_STCLASS) {
1047                     mincount = 0;
1048                     maxcount = REG_INFTY;
1049                     next = regnext(scan);
1050                     scan = NEXTOPER(scan);
1051                     goto do_curly;
1052                 }
1053                 is_inf = is_inf_internal = 1;
1054                 scan = regnext(scan);
1055                 if (flags & SCF_DO_SUBSTR) {
1056                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1057                     data->longest = &(data->longest_float);
1058                 }
1059                 goto optimize_curly_tail;
1060             case CURLY:
1061                 mincount = ARG1(scan);
1062                 maxcount = ARG2(scan);
1063                 next = regnext(scan);
1064                 if (OP(scan) == CURLYX) {
1065                     I32 lp = (data ? *(data->last_closep) : 0);
1066
1067                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068                 }
1069                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1070                 next_is_eval = (OP(scan) == EVAL);
1071               do_curly:
1072                 if (flags & SCF_DO_SUBSTR) {
1073                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1074                     pos_before = data->pos_min;
1075                 }
1076                 if (data) {
1077                     fl = data->flags;
1078                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079                     if (is_inf)
1080                         data->flags |= SF_IS_INF;
1081                 }
1082                 if (flags & SCF_DO_STCLASS) {
1083                     cl_init(pRExC_state, &this_class);
1084                     oclass = data->start_class;
1085                     data->start_class = &this_class;
1086                     f |= SCF_DO_STCLASS_AND;
1087                     f &= ~SCF_DO_STCLASS_OR;
1088                 }
1089                 /* These are the cases when once a subexpression
1090                    fails at a particular position, it cannot succeed
1091                    even after backtracking at the enclosing scope.
1092                 
1093                    XXXX what if minimal match and we are at the
1094                         initial run of {n,m}? */
1095                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096                     f &= ~SCF_WHILEM_VISITED_POS;
1097
1098                 /* This will finish on WHILEM, setting scan, or on NULL: */
1099                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100                                       mincount == 0
1101                                         ? (f & ~SCF_DO_SUBSTR) : f);
1102
1103                 if (flags & SCF_DO_STCLASS)
1104                     data->start_class = oclass;
1105                 if (mincount == 0 || minnext == 0) {
1106                     if (flags & SCF_DO_STCLASS_OR) {
1107                         cl_or(pRExC_state, data->start_class, &this_class);
1108                     }
1109                     else if (flags & SCF_DO_STCLASS_AND) {
1110                         /* Switch to OR mode: cache the old value of
1111                          * data->start_class */
1112                         StructCopy(data->start_class, &and_with,
1113                                    struct regnode_charclass_class);
1114                         flags &= ~SCF_DO_STCLASS_AND;
1115                         StructCopy(&this_class, data->start_class,
1116                                    struct regnode_charclass_class);
1117                         flags |= SCF_DO_STCLASS_OR;
1118                         data->start_class->flags |= ANYOF_EOS;
1119                     }
1120                 } else {                /* Non-zero len */
1121                     if (flags & SCF_DO_STCLASS_OR) {
1122                         cl_or(pRExC_state, data->start_class, &this_class);
1123                         cl_and(data->start_class, &and_with);
1124                     }
1125                     else if (flags & SCF_DO_STCLASS_AND)
1126                         cl_and(data->start_class, &this_class);
1127                     flags &= ~SCF_DO_STCLASS;
1128                 }
1129                 if (!scan)              /* It was not CURLYX, but CURLY. */
1130                     scan = next;
1131                 if (ckWARN(WARN_REGEXP)
1132                        /* ? quantifier ok, except for (?{ ... }) */
1133                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1134                     && (minnext == 0) && (deltanext == 0)
1135                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1136                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137                 {
1138                     vWARN(RExC_parse,
1139                           "Quantifier unexpected on zero-length expression");
1140                 }
1141
1142                 min += minnext * mincount;
1143                 is_inf_internal |= ((maxcount == REG_INFTY
1144                                      && (minnext + deltanext) > 0)
1145                                     || deltanext == I32_MAX);
1146                 is_inf |= is_inf_internal;
1147                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148
1149                 /* Try powerful optimization CURLYX => CURLYN. */
1150                 if (  OP(oscan) == CURLYX && data
1151                       && data->flags & SF_IN_PAR
1152                       && !(data->flags & SF_HAS_EVAL)
1153                       && !deltanext && minnext == 1 ) {
1154                     /* Try to optimize to CURLYN.  */
1155                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1156                     regnode *nxt1 = nxt;
1157 #ifdef DEBUGGING
1158                     regnode *nxt2;
1159 #endif
1160
1161                     /* Skip open. */
1162                     nxt = regnext(nxt);
1163                     if (!strchr((char*)PL_simple,OP(nxt))
1164                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1165                              && STR_LEN(nxt) == 1))
1166                         goto nogo;
1167 #ifdef DEBUGGING
1168                     nxt2 = nxt;
1169 #endif
1170                     nxt = regnext(nxt);
1171                     if (OP(nxt) != CLOSE)
1172                         goto nogo;
1173                     /* Now we know that nxt2 is the only contents: */
1174                     oscan->flags = ARG(nxt);
1175                     OP(oscan) = CURLYN;
1176                     OP(nxt1) = NOTHING; /* was OPEN. */
1177 #ifdef DEBUGGING
1178                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1182                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1183                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 #endif
1185                 }
1186               nogo:
1187
1188                 /* Try optimization CURLYX => CURLYM. */
1189                 if (  OP(oscan) == CURLYX && data
1190                       && !(data->flags & SF_HAS_PAR)
1191                       && !(data->flags & SF_HAS_EVAL)
1192                       && !deltanext  ) {
1193                     /* XXXX How to optimize if data == 0? */
1194                     /* Optimize to a simpler form.  */
1195                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196                     regnode *nxt2;
1197
1198                     OP(oscan) = CURLYM;
1199                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1200                             && (OP(nxt2) != WHILEM))
1201                         nxt = nxt2;
1202                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1203                     /* Need to optimize away parenths. */
1204                     if (data->flags & SF_IN_PAR) {
1205                         /* Set the parenth number.  */
1206                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207
1208                         if (OP(nxt) != CLOSE)
1209                             FAIL("Panic opt close");
1210                         oscan->flags = ARG(nxt);
1211                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1212                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1213 #ifdef DEBUGGING
1214                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1216                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 #endif
1219 #if 0
1220                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221                             regnode *nnxt = regnext(nxt1);
1222                         
1223                             if (nnxt == nxt) {
1224                                 if (reg_off_by_arg[OP(nxt1)])
1225                                     ARG_SET(nxt1, nxt2 - nxt1);
1226                                 else if (nxt2 - nxt1 < U16_MAX)
1227                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1228                                 else
1229                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1230                             }
1231                             nxt1 = nnxt;
1232                         }
1233 #endif
1234                         /* Optimize again: */
1235                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1236                                     NULL, 0);
1237                     }
1238                     else
1239                         oscan->flags = 0;
1240                 }
1241                 else if ((OP(oscan) == CURLYX)
1242                          && (flags & SCF_WHILEM_VISITED_POS)
1243                          /* See the comment on a similar expression above.
1244                             However, this time it not a subexpression
1245                             we care about, but the expression itself. */
1246                          && (maxcount == REG_INFTY)
1247                          && data && ++data->whilem_c < 16) {
1248                     /* This stays as CURLYX, we can put the count/of pair. */
1249                     /* Find WHILEM (as in regexec.c) */
1250                     regnode *nxt = oscan + NEXT_OFF(oscan);
1251
1252                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253                         nxt += ARG(nxt);
1254                     PREVOPER(nxt)->flags = data->whilem_c
1255                         | (RExC_whilem_seen << 4); /* On WHILEM */
1256                 }
1257                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1258                     pars++;
1259                 if (flags & SCF_DO_SUBSTR) {
1260                     SV *last_str = Nullsv;
1261                     int counted = mincount != 0;
1262
1263                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1264 #if defined(SPARC64_GCC_WORKAROUND)
1265                         I32 b = 0;
1266                         STRLEN l = 0;
1267                         char *s = NULL;
1268                         I32 old = 0;
1269
1270                         if (pos_before >= data->last_start_min)
1271                             b = pos_before;
1272                         else
1273                             b = data->last_start_min;
1274
1275                         l = 0;
1276                         s = SvPV(data->last_found, l);
1277                         old = b - data->last_start_min;
1278
1279 #else
1280                         I32 b = pos_before >= data->last_start_min
1281                             ? pos_before : data->last_start_min;
1282                         STRLEN l;
1283                         char *s = SvPV(data->last_found, l);
1284                         I32 old = b - data->last_start_min;
1285 #endif
1286
1287                         if (UTF)
1288                             old = utf8_hop((U8*)s, old) - (U8*)s;
1289                         
1290                         l -= old;
1291                         /* Get the added string: */
1292                         last_str = newSVpvn(s  + old, l);
1293                         if (deltanext == 0 && pos_before == b) {
1294                             /* What was added is a constant string */
1295                             if (mincount > 1) {
1296                                 SvGROW(last_str, (mincount * l) + 1);
1297                                 repeatcpy(SvPVX(last_str) + l,
1298                                           SvPVX(last_str), l, mincount - 1);
1299                                 SvCUR(last_str) *= mincount;
1300                                 /* Add additional parts. */
1301                                 SvCUR_set(data->last_found,
1302                                           SvCUR(data->last_found) - l);
1303                                 sv_catsv(data->last_found, last_str);
1304                                 data->last_end += l * (mincount - 1);
1305                             }
1306                         } else {
1307                             /* start offset must point into the last copy */
1308                             data->last_start_min += minnext * (mincount - 1);
1309                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1310                                 * (minnext + data->pos_delta);
1311                         }
1312                     }
1313                     /* It is counted once already... */
1314                     data->pos_min += minnext * (mincount - counted);
1315                     data->pos_delta += - counted * deltanext +
1316                         (minnext + deltanext) * maxcount - minnext * mincount;
1317                     if (mincount != maxcount) {
1318                          /* Cannot extend fixed substrings found inside
1319                             the group.  */
1320                         scan_commit(pRExC_state,data);
1321                         if (mincount && last_str) {
1322                             sv_setsv(data->last_found, last_str);
1323                             data->last_end = data->pos_min;
1324                             data->last_start_min =
1325                                 data->pos_min - CHR_SVLEN(last_str);
1326                             data->last_start_max = is_inf
1327                                 ? I32_MAX
1328                                 : data->pos_min + data->pos_delta
1329                                 - CHR_SVLEN(last_str);
1330                         }
1331                         data->longest = &(data->longest_float);
1332                     }
1333                     SvREFCNT_dec(last_str);
1334                 }
1335                 if (data && (fl & SF_HAS_EVAL))
1336                     data->flags |= SF_HAS_EVAL;
1337               optimize_curly_tail:
1338                 if (OP(oscan) != CURLYX) {
1339                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1340                            && NEXT_OFF(next))
1341                         NEXT_OFF(oscan) += NEXT_OFF(next);
1342                 }
1343                 continue;
1344             default:                    /* REF and CLUMP only? */
1345                 if (flags & SCF_DO_SUBSTR) {
1346                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1347                     data->longest = &(data->longest_float);
1348                 }
1349                 is_inf = is_inf_internal = 1;
1350                 if (flags & SCF_DO_STCLASS_OR)
1351                     cl_anything(pRExC_state, data->start_class);
1352                 flags &= ~SCF_DO_STCLASS;
1353                 break;
1354             }
1355         }
1356         else if (strchr((char*)PL_simple,OP(scan))) {
1357             int value = 0;
1358
1359             if (flags & SCF_DO_SUBSTR) {
1360                 scan_commit(pRExC_state,data);
1361                 data->pos_min++;
1362             }
1363             min++;
1364             if (flags & SCF_DO_STCLASS) {
1365                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1366
1367                 /* Some of the logic below assumes that switching
1368                    locale on will only add false positives. */
1369                 switch (PL_regkind[(U8)OP(scan)]) {
1370                 case SANY:
1371                 default:
1372                   do_default:
1373                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1375                         cl_anything(pRExC_state, data->start_class);
1376                     break;
1377                 case REG_ANY:
1378                     if (OP(scan) == SANY)
1379                         goto do_default;
1380                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382                                  || (data->start_class->flags & ANYOF_CLASS));
1383                         cl_anything(pRExC_state, data->start_class);
1384                     }
1385                     if (flags & SCF_DO_STCLASS_AND || !value)
1386                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387                     break;
1388                 case ANYOF:
1389                     if (flags & SCF_DO_STCLASS_AND)
1390                         cl_and(data->start_class,
1391                                (struct regnode_charclass_class*)scan);
1392                     else
1393                         cl_or(pRExC_state, data->start_class,
1394                               (struct regnode_charclass_class*)scan);
1395                     break;
1396                 case ALNUM:
1397                     if (flags & SCF_DO_STCLASS_AND) {
1398                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400                             for (value = 0; value < 256; value++)
1401                                 if (!isALNUM(value))
1402                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1403                         }
1404                     }
1405                     else {
1406                         if (data->start_class->flags & ANYOF_LOCALE)
1407                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1408                         else {
1409                             for (value = 0; value < 256; value++)
1410                                 if (isALNUM(value))
1411                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1412                         }
1413                     }
1414                     break;
1415                 case ALNUML:
1416                     if (flags & SCF_DO_STCLASS_AND) {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419                     }
1420                     else {
1421                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422                         data->start_class->flags |= ANYOF_LOCALE;
1423                     }
1424                     break;
1425                 case NALNUM:
1426                     if (flags & SCF_DO_STCLASS_AND) {
1427                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429                             for (value = 0; value < 256; value++)
1430                                 if (isALNUM(value))
1431                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1432                         }
1433                     }
1434                     else {
1435                         if (data->start_class->flags & ANYOF_LOCALE)
1436                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437                         else {
1438                             for (value = 0; value < 256; value++)
1439                                 if (!isALNUM(value))
1440                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1441                         }
1442                     }
1443                     break;
1444                 case NALNUML:
1445                     if (flags & SCF_DO_STCLASS_AND) {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448                     }
1449                     else {
1450                         data->start_class->flags |= ANYOF_LOCALE;
1451                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452                     }
1453                     break;
1454                 case SPACE:
1455                     if (flags & SCF_DO_STCLASS_AND) {
1456                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458                             for (value = 0; value < 256; value++)
1459                                 if (!isSPACE(value))
1460                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1461                         }
1462                     }
1463                     else {
1464                         if (data->start_class->flags & ANYOF_LOCALE)
1465                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466                         else {
1467                             for (value = 0; value < 256; value++)
1468                                 if (isSPACE(value))
1469                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1470                         }
1471                     }
1472                     break;
1473                 case SPACEL:
1474                     if (flags & SCF_DO_STCLASS_AND) {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477                     }
1478                     else {
1479                         data->start_class->flags |= ANYOF_LOCALE;
1480                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481                     }
1482                     break;
1483                 case NSPACE:
1484                     if (flags & SCF_DO_STCLASS_AND) {
1485                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487                             for (value = 0; value < 256; value++)
1488                                 if (isSPACE(value))
1489                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1490                         }
1491                     }
1492                     else {
1493                         if (data->start_class->flags & ANYOF_LOCALE)
1494                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1495                         else {
1496                             for (value = 0; value < 256; value++)
1497                                 if (!isSPACE(value))
1498                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1499                         }
1500                     }
1501                     break;
1502                 case NSPACEL:
1503                     if (flags & SCF_DO_STCLASS_AND) {
1504                         if (data->start_class->flags & ANYOF_LOCALE) {
1505                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506                             for (value = 0; value < 256; value++)
1507                                 if (!isSPACE(value))
1508                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1509                         }
1510                     }
1511                     else {
1512                         data->start_class->flags |= ANYOF_LOCALE;
1513                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514                     }
1515                     break;
1516                 case DIGIT:
1517                     if (flags & SCF_DO_STCLASS_AND) {
1518                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519                         for (value = 0; value < 256; value++)
1520                             if (!isDIGIT(value))
1521                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522                     }
1523                     else {
1524                         if (data->start_class->flags & ANYOF_LOCALE)
1525                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1526                         else {
1527                             for (value = 0; value < 256; value++)
1528                                 if (isDIGIT(value))
1529                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1530                         }
1531                     }
1532                     break;
1533                 case NDIGIT:
1534                     if (flags & SCF_DO_STCLASS_AND) {
1535                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536                         for (value = 0; value < 256; value++)
1537                             if (isDIGIT(value))
1538                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539                     }
1540                     else {
1541                         if (data->start_class->flags & ANYOF_LOCALE)
1542                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1543                         else {
1544                             for (value = 0; value < 256; value++)
1545                                 if (!isDIGIT(value))
1546                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1547                         }
1548                     }
1549                     break;
1550                 }
1551                 if (flags & SCF_DO_STCLASS_OR)
1552                     cl_and(data->start_class, &and_with);
1553                 flags &= ~SCF_DO_STCLASS;
1554             }
1555         }
1556         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1557             data->flags |= (OP(scan) == MEOL
1558                             ? SF_BEFORE_MEOL
1559                             : SF_BEFORE_SEOL);
1560         }
1561         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1562                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1563                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1564                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1565             /* Lookahead/lookbehind */
1566             I32 deltanext, minnext, fake = 0;
1567             regnode *nscan;
1568             struct regnode_charclass_class intrnl;
1569             int f = 0;
1570
1571             data_fake.flags = 0;
1572             if (data) {         
1573                 data_fake.whilem_c = data->whilem_c;
1574                 data_fake.last_closep = data->last_closep;
1575             }
1576             else
1577                 data_fake.last_closep = &fake;
1578             if ( flags & SCF_DO_STCLASS && !scan->flags
1579                  && OP(scan) == IFMATCH ) { /* Lookahead */
1580                 cl_init(pRExC_state, &intrnl);
1581                 data_fake.start_class = &intrnl;
1582                 f |= SCF_DO_STCLASS_AND;
1583             }
1584             if (flags & SCF_WHILEM_VISITED_POS)
1585                 f |= SCF_WHILEM_VISITED_POS;
1586             next = regnext(scan);
1587             nscan = NEXTOPER(NEXTOPER(scan));
1588             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589             if (scan->flags) {
1590                 if (deltanext) {
1591                     vFAIL("Variable length lookbehind not implemented");
1592                 }
1593                 else if (minnext > U8_MAX) {
1594                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1595                 }
1596                 scan->flags = minnext;
1597             }
1598             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1599                 pars++;
1600             if (data && (data_fake.flags & SF_HAS_EVAL))
1601                 data->flags |= SF_HAS_EVAL;
1602             if (data)
1603                 data->whilem_c = data_fake.whilem_c;
1604             if (f & SCF_DO_STCLASS_AND) {
1605                 int was = (data->start_class->flags & ANYOF_EOS);
1606
1607                 cl_and(data->start_class, &intrnl);
1608                 if (was)
1609                     data->start_class->flags |= ANYOF_EOS;
1610             }
1611         }
1612         else if (OP(scan) == OPEN) {
1613             pars++;
1614         }
1615         else if (OP(scan) == CLOSE) {
1616             if (ARG(scan) == is_par) {
1617                 next = regnext(scan);
1618
1619                 if ( next && (OP(next) != WHILEM) && next < last)
1620                     is_par = 0;         /* Disable optimization */
1621             }
1622             if (data)
1623                 *(data->last_closep) = ARG(scan);
1624         }
1625         else if (OP(scan) == EVAL) {
1626                 if (data)
1627                     data->flags |= SF_HAS_EVAL;
1628         }
1629         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1630                 if (flags & SCF_DO_SUBSTR) {
1631                     scan_commit(pRExC_state,data);
1632                     data->longest = &(data->longest_float);
1633                 }
1634                 is_inf = is_inf_internal = 1;
1635                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1636                     cl_anything(pRExC_state, data->start_class);
1637                 flags &= ~SCF_DO_STCLASS;
1638         }
1639         /* Else: zero-length, ignore. */
1640         scan = regnext(scan);
1641     }
1642
1643   finish:
1644     *scanp = scan;
1645     *deltap = is_inf_internal ? I32_MAX : delta;
1646     if (flags & SCF_DO_SUBSTR && is_inf)
1647         data->pos_delta = I32_MAX - data->pos_min;
1648     if (is_par > U8_MAX)
1649         is_par = 0;
1650     if (is_par && pars==1 && data) {
1651         data->flags |= SF_IN_PAR;
1652         data->flags &= ~SF_HAS_PAR;
1653     }
1654     else if (pars && data) {
1655         data->flags |= SF_HAS_PAR;
1656         data->flags &= ~SF_IN_PAR;
1657     }
1658     if (flags & SCF_DO_STCLASS_OR)
1659         cl_and(data->start_class, &and_with);
1660     return min;
1661 }
1662
1663 STATIC I32
1664 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1665 {
1666     if (RExC_rx->data) {
1667         Renewc(RExC_rx->data,
1668                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1669                char, struct reg_data);
1670         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671         RExC_rx->data->count += n;
1672     }
1673     else {
1674         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1675              char, struct reg_data);
1676         New(1208, RExC_rx->data->what, n, U8);
1677         RExC_rx->data->count = n;
1678     }
1679     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680     return RExC_rx->data->count - n;
1681 }
1682
1683 void
1684 Perl_reginitcolors(pTHX)
1685 {
1686     int i = 0;
1687     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688         
1689     if (s) {
1690         PL_colors[0] = s = savepv(s);
1691         while (++i < 6) {
1692             s = strchr(s, '\t');
1693             if (s) {
1694                 *s = '\0';
1695                 PL_colors[i] = ++s;
1696             }
1697             else
1698                 PL_colors[i] = s = "";
1699         }
1700     } else {
1701         while (i < 6)
1702             PL_colors[i++] = "";
1703     }
1704     PL_colorset = 1;
1705 }
1706
1707
1708 /*
1709  - pregcomp - compile a regular expression into internal code
1710  *
1711  * We can't allocate space until we know how big the compiled form will be,
1712  * but we can't compile it (and thus know how big it is) until we've got a
1713  * place to put the code.  So we cheat:  we compile it twice, once with code
1714  * generation turned off and size counting turned on, and once "for real".
1715  * This also means that we don't allocate space until we are sure that the
1716  * thing really will compile successfully, and we never have to move the
1717  * code and thus invalidate pointers into it.  (Note that it has to be in
1718  * one piece because free() must be able to free it all.) [NB: not true in perl]
1719  *
1720  * Beware that the optimization-preparation code in here knows about some
1721  * of the structure of the compiled regexp.  [I'll say.]
1722  */
1723 regexp *
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1725 {
1726     register regexp *r;
1727     regnode *scan;
1728     regnode *first;
1729     I32 flags;
1730     I32 minlen = 0;
1731     I32 sawplus = 0;
1732     I32 sawopen = 0;
1733     scan_data_t data;
1734     RExC_state_t RExC_state;
1735     RExC_state_t *pRExC_state = &RExC_state;
1736
1737     if (exp == NULL)
1738         FAIL("NULL regexp argument");
1739
1740     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1741
1742     RExC_precomp = exp;
1743     DEBUG_r({
1744          if (!PL_colorset) reginitcolors();
1745          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746                        PL_colors[4],PL_colors[5],PL_colors[0],
1747                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748     });
1749     RExC_flags16 = pm->op_pmflags;
1750     RExC_sawback = 0;
1751
1752     RExC_seen = 0;
1753     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754     RExC_seen_evals = 0;
1755     RExC_extralen = 0;
1756
1757     /* First pass: determine size, legality. */
1758     RExC_parse = exp;
1759     RExC_start = exp;
1760     RExC_end = xend;
1761     RExC_naughty = 0;
1762     RExC_npar = 1;
1763     RExC_size = 0L;
1764     RExC_emit = &PL_regdummy;
1765     RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767        * Clever compilers notice this and complain. --jhi */
1768     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1769 #endif
1770     if (reg(pRExC_state, 0, &flags) == NULL) {
1771         RExC_precomp = Nullch;
1772         return(NULL);
1773     }
1774     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1775
1776     /* Small enough for pointer-storage convention?
1777        If extralen==0, this means that we will not need long jumps. */
1778     if (RExC_size >= 0x10000L && RExC_extralen)
1779         RExC_size += RExC_extralen;
1780     else
1781         RExC_extralen = 0;
1782     if (RExC_whilem_seen > 15)
1783         RExC_whilem_seen = 15;
1784
1785     /* Allocate space and initialize. */
1786     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787          char, regexp);
1788     if (r == NULL)
1789         FAIL("Regexp out of space");
1790
1791 #ifdef DEBUGGING
1792     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 #endif
1795     r->refcnt = 1;
1796     r->prelen = xend - exp;
1797     r->precomp = savepvn(RExC_precomp, r->prelen);
1798     r->subbeg = NULL;
1799     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1801
1802     r->substrs = 0;                     /* Useful during FAIL. */
1803     r->startp = 0;                      /* Useful during FAIL. */
1804     r->endp = 0;                        /* Useful during FAIL. */
1805
1806     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1807     if (r->offsets) {
1808       r->offsets[0] = RExC_size; 
1809     }
1810     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1811                           "%s %"UVuf" bytes for offset annotations.\n", 
1812                           r->offsets ? "Got" : "Couldn't get", 
1813                           (UV)((2*RExC_size+1) * sizeof(U32))));
1814
1815     RExC_rx = r;
1816
1817     /* Second pass: emit code. */
1818     RExC_flags16 = pm->op_pmflags;      /* don't let top level (?i) bleed */
1819     RExC_parse = exp;
1820     RExC_end = xend;
1821     RExC_naughty = 0;
1822     RExC_npar = 1;
1823     RExC_emit_start = r->program;
1824     RExC_emit = r->program;
1825     /* Store the count of eval-groups for security checks: */
1826     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1827     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1828     r->data = 0;
1829     if (reg(pRExC_state, 0, &flags) == NULL)
1830         return(NULL);
1831
1832     /* Dig out information for optimizations. */
1833     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1834     pm->op_pmflags = RExC_flags16;
1835     if (UTF)
1836         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1837     r->regstclass = NULL;
1838     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1839         r->reganch |= ROPT_NAUGHTY;
1840     scan = r->program + 1;              /* First BRANCH. */
1841
1842     /* XXXX To minimize changes to RE engine we always allocate
1843        3-units-long substrs field. */
1844     Newz(1004, r->substrs, 1, struct reg_substr_data);
1845
1846     StructCopy(&zero_scan_data, &data, scan_data_t);
1847     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1848     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1849         I32 fake;
1850         STRLEN longest_float_length, longest_fixed_length;
1851         struct regnode_charclass_class ch_class;
1852         int stclass_flag;
1853         I32 last_close = 0;
1854
1855         first = scan;
1856         /* Skip introductions and multiplicators >= 1. */
1857         while ((OP(first) == OPEN && (sawopen = 1)) ||
1858                /* An OR of *one* alternative - should not happen now. */
1859             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860             (OP(first) == PLUS) ||
1861             (OP(first) == MINMOD) ||
1862                /* An {n,m} with n>0 */
1863             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1864                 if (OP(first) == PLUS)
1865                     sawplus = 1;
1866                 else
1867                     first += regarglen[(U8)OP(first)];
1868                 first = NEXTOPER(first);
1869         }
1870
1871         /* Starting-point info. */
1872       again:
1873         if (PL_regkind[(U8)OP(first)] == EXACT) {
1874             if (OP(first) == EXACT)
1875                 ;       /* Empty, get anchored substr later. */
1876             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1877                 r->regstclass = first;
1878         }
1879         else if (strchr((char*)PL_simple,OP(first)))
1880             r->regstclass = first;
1881         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882                  PL_regkind[(U8)OP(first)] == NBOUND)
1883             r->regstclass = first;
1884         else if (PL_regkind[(U8)OP(first)] == BOL) {
1885             r->reganch |= (OP(first) == MBOL
1886                            ? ROPT_ANCH_MBOL
1887                            : (OP(first) == SBOL
1888                               ? ROPT_ANCH_SBOL
1889                               : ROPT_ANCH_BOL));
1890             first = NEXTOPER(first);
1891             goto again;
1892         }
1893         else if (OP(first) == GPOS) {
1894             r->reganch |= ROPT_ANCH_GPOS;
1895             first = NEXTOPER(first);
1896             goto again;
1897         }
1898         else if (!sawopen && (OP(first) == STAR &&
1899             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1900             !(r->reganch & ROPT_ANCH) )
1901         {
1902             /* turn .* into ^.* with an implied $*=1 */
1903             int type = OP(NEXTOPER(first));
1904
1905             if (type == REG_ANY)
1906                 type = ROPT_ANCH_MBOL;
1907             else
1908                 type = ROPT_ANCH_SBOL;
1909
1910             r->reganch |= type | ROPT_IMPLICIT;
1911             first = NEXTOPER(first);
1912             goto again;
1913         }
1914         if (sawplus && (!sawopen || !RExC_sawback)
1915             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1916             /* x+ must match at the 1st pos of run of x's */
1917             r->reganch |= ROPT_SKIP;
1918
1919         /* Scan is after the zeroth branch, first is atomic matcher. */
1920         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1921                               (IV)(first - scan + 1)));
1922         /*
1923         * If there's something expensive in the r.e., find the
1924         * longest literal string that must appear and make it the
1925         * regmust.  Resolve ties in favor of later strings, since
1926         * the regstart check works with the beginning of the r.e.
1927         * and avoiding duplication strengthens checking.  Not a
1928         * strong reason, but sufficient in the absence of others.
1929         * [Now we resolve ties in favor of the earlier string if
1930         * it happens that c_offset_min has been invalidated, since the
1931         * earlier string may buy us something the later one won't.]
1932         */
1933         minlen = 0;
1934
1935         data.longest_fixed = newSVpvn("",0);
1936         data.longest_float = newSVpvn("",0);
1937         data.last_found = newSVpvn("",0);
1938         data.longest = &(data.longest_fixed);
1939         first = scan;
1940         if (!r->regstclass) {
1941             cl_init(pRExC_state, &ch_class);
1942             data.start_class = &ch_class;
1943             stclass_flag = SCF_DO_STCLASS_AND;
1944         } else                          /* XXXX Check for BOUND? */
1945             stclass_flag = 0;
1946         data.last_closep = &last_close;
1947
1948         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1949                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1950         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1951              && data.last_start_min == 0 && data.last_end > 0
1952              && !RExC_seen_zerolen
1953              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1954             r->reganch |= ROPT_CHECK_ALL;
1955         scan_commit(pRExC_state, &data);
1956         SvREFCNT_dec(data.last_found);
1957
1958         longest_float_length = CHR_SVLEN(data.longest_float);
1959         if (longest_float_length
1960             || (data.flags & SF_FL_BEFORE_EOL
1961                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1962                     || (RExC_flags16 & PMf_MULTILINE)))) {
1963             int t;
1964
1965             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1966                 && data.offset_fixed == data.offset_float_min
1967                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968                     goto remove_float;          /* As in (a)+. */
1969
1970             if (SvUTF8(data.longest_float)) {
1971                 r->float_utf8 = data.longest_float;
1972                 r->float_substr = Nullsv;
1973             } else {
1974                 r->float_substr = data.longest_float;
1975                 r->float_utf8 = Nullsv;
1976             }
1977             r->float_min_offset = data.offset_float_min;
1978             r->float_max_offset = data.offset_float_max;
1979             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1981                            || (RExC_flags16 & PMf_MULTILINE)));
1982             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1983         }
1984         else {
1985           remove_float:
1986             r->float_substr = r->float_utf8 = Nullsv;
1987             SvREFCNT_dec(data.longest_float);
1988             longest_float_length = 0;
1989         }
1990
1991         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1992         if (longest_fixed_length
1993             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1995                     || (RExC_flags16 & PMf_MULTILINE)))) {
1996             int t;
1997
1998             if (SvUTF8(data.longest_fixed)) {
1999                 r->anchored_utf8 = data.longest_fixed;
2000                 r->anchored_substr = Nullsv;
2001             } else {
2002                 r->anchored_substr = data.longest_fixed;
2003                 r->anchored_utf8 = Nullsv;
2004             }
2005             r->anchored_offset = data.offset_fixed;
2006             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2008                      || (RExC_flags16 & PMf_MULTILINE)));
2009             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010         }
2011         else {
2012             r->anchored_substr = r->anchored_utf8 = Nullsv;
2013             SvREFCNT_dec(data.longest_fixed);
2014             longest_fixed_length = 0;
2015         }
2016         if (r->regstclass
2017             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2018             r->regstclass = NULL;
2019         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020             && stclass_flag
2021             && !(data.start_class->flags & ANYOF_EOS)
2022             && !cl_is_anything(data.start_class)) {
2023             I32 n = add_data(pRExC_state, 1, "f");
2024
2025             New(1006, RExC_rx->data->data[n], 1,
2026                 struct regnode_charclass_class);
2027             StructCopy(data.start_class,
2028                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2029                        struct regnode_charclass_class);
2030             r->regstclass = (regnode*)RExC_rx->data->data[n];
2031             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2032             PL_regdata = r->data; /* for regprop() */
2033             DEBUG_r({ SV *sv = sv_newmortal();
2034                       regprop(sv, (regnode*)data.start_class);
2035                       PerlIO_printf(Perl_debug_log,
2036                                     "synthetic stclass `%s'.\n",
2037                                     SvPVX(sv));});
2038         }
2039
2040         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2041         if (longest_fixed_length > longest_float_length) {
2042             r->check_substr = r->anchored_substr;
2043             r->check_utf8 = r->anchored_utf8;
2044             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2045             if (r->reganch & ROPT_ANCH_SINGLE)
2046                 r->reganch |= ROPT_NOSCAN;
2047         }
2048         else {
2049             r->check_substr = r->float_substr;
2050             r->check_utf8 = r->float_utf8;
2051             r->check_offset_min = data.offset_float_min;
2052             r->check_offset_max = data.offset_float_max;
2053         }
2054         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2055            This should be changed ASAP!  */
2056         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2057             r->reganch |= RE_USE_INTUIT;
2058             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2059                 r->reganch |= RE_INTUIT_TAIL;
2060         }
2061     }
2062     else {
2063         /* Several toplevels. Best we can is to set minlen. */
2064         I32 fake;
2065         struct regnode_charclass_class ch_class;
2066         I32 last_close = 0;
2067         
2068         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2069         scan = r->program + 1;
2070         cl_init(pRExC_state, &ch_class);
2071         data.start_class = &ch_class;
2072         data.last_closep = &last_close;
2073         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2074         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2075                 = r->float_substr = r->float_utf8 = Nullsv;
2076         if (!(data.start_class->flags & ANYOF_EOS)
2077             && !cl_is_anything(data.start_class)) {
2078             I32 n = add_data(pRExC_state, 1, "f");
2079
2080             New(1006, RExC_rx->data->data[n], 1,
2081                 struct regnode_charclass_class);
2082             StructCopy(data.start_class,
2083                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2084                        struct regnode_charclass_class);
2085             r->regstclass = (regnode*)RExC_rx->data->data[n];
2086             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2087             DEBUG_r({ SV* sv = sv_newmortal();
2088                       regprop(sv, (regnode*)data.start_class);
2089                       PerlIO_printf(Perl_debug_log,
2090                                     "synthetic stclass `%s'.\n",
2091                                     SvPVX(sv));});
2092         }
2093     }
2094
2095     r->minlen = minlen;
2096     if (RExC_seen & REG_SEEN_GPOS)
2097         r->reganch |= ROPT_GPOS_SEEN;
2098     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2099         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2100     if (RExC_seen & REG_SEEN_EVAL)
2101         r->reganch |= ROPT_EVAL_SEEN;
2102     if (RExC_seen & REG_SEEN_CANY)
2103         r->reganch |= ROPT_CANY_SEEN;
2104     Newz(1002, r->startp, RExC_npar, I32);
2105     Newz(1002, r->endp, RExC_npar, I32);
2106     PL_regdata = r->data; /* for regprop() */
2107     DEBUG_r(regdump(r));
2108     return(r);
2109 }
2110
2111 /*
2112  - reg - regular expression, i.e. main body or parenthesized thing
2113  *
2114  * Caller must absorb opening parenthesis.
2115  *
2116  * Combining parenthesis handling with the base level of regular expression
2117  * is a trifle forced, but the need to tie the tails of the branches to what
2118  * follows makes it hard to avoid.
2119  */
2120 STATIC regnode *
2121 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2122     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2123 {
2124     register regnode *ret;              /* Will be the head of the group. */
2125     register regnode *br;
2126     register regnode *lastbr;
2127     register regnode *ender = 0;
2128     register I32 parno = 0;
2129     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2130
2131     /* for (?g), (?gc), and (?o) warnings; warning
2132        about (?c) will warn about (?g) -- japhy    */
2133
2134     I32 wastedflags = 0x00,
2135         wasted_o    = 0x01,
2136         wasted_g    = 0x02,
2137         wasted_gc   = 0x02 | 0x04,
2138         wasted_c    = 0x04;
2139
2140     char * parse_start = RExC_parse; /* MJD */
2141     char *oregcomp_parse = RExC_parse;
2142     char c;
2143
2144     *flagp = 0;                         /* Tentatively. */
2145
2146
2147     /* Make an OPEN node, if parenthesized. */
2148     if (paren) {
2149         if (*RExC_parse == '?') { /* (?...) */
2150             U16 posflags = 0, negflags = 0;
2151             U16 *flagsp = &posflags;
2152             int logical = 0;
2153             char *seqstart = RExC_parse;
2154
2155             RExC_parse++;
2156             paren = *RExC_parse++;
2157             ret = NULL;                 /* For look-ahead/behind. */
2158             switch (paren) {
2159             case '<':           /* (?<...) */
2160                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2161                 if (*RExC_parse == '!')
2162                     paren = ',';
2163                 if (*RExC_parse != '=' && *RExC_parse != '!')
2164                     goto unknown;
2165                 RExC_parse++;
2166             case '=':           /* (?=...) */
2167             case '!':           /* (?!...) */
2168                 RExC_seen_zerolen++;
2169             case ':':           /* (?:...) */
2170             case '>':           /* (?>...) */
2171                 break;
2172             case '$':           /* (?$...) */
2173             case '@':           /* (?@...) */
2174                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2175                 break;
2176             case '#':           /* (?#...) */
2177                 while (*RExC_parse && *RExC_parse != ')')
2178                     RExC_parse++;
2179                 if (*RExC_parse != ')')
2180                     FAIL("Sequence (?#... not terminated");
2181                 nextchar(pRExC_state);
2182                 *flagp = TRYAGAIN;
2183                 return NULL;
2184             case 'p':           /* (?p...) */
2185                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2186                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2187                 /* FALL THROUGH*/
2188             case '?':           /* (??...) */
2189                 logical = 1;
2190                 if (*RExC_parse != '{')
2191                     goto unknown;
2192                 paren = *RExC_parse++;
2193                 /* FALL THROUGH */
2194             case '{':           /* (?{...}) */
2195             {
2196                 I32 count = 1, n = 0;
2197                 char c;
2198                 char *s = RExC_parse;
2199                 SV *sv;
2200                 OP_4tree *sop, *rop;
2201
2202                 RExC_seen_zerolen++;
2203                 RExC_seen |= REG_SEEN_EVAL;
2204                 while (count && (c = *RExC_parse)) {
2205                     if (c == '\\' && RExC_parse[1])
2206                         RExC_parse++;
2207                     else if (c == '{')
2208                         count++;
2209                     else if (c == '}')
2210                         count--;
2211                     RExC_parse++;
2212                 }
2213                 if (*RExC_parse != ')')
2214                 {
2215                     RExC_parse = s;             
2216                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2217                 }
2218                 if (!SIZE_ONLY) {
2219                     AV *av;
2220                 
2221                     if (RExC_parse - 1 - s)
2222                         sv = newSVpvn(s, RExC_parse - 1 - s);
2223                     else
2224                         sv = newSVpvn("", 0);
2225
2226                     ENTER;
2227                     Perl_save_re_context(aTHX);
2228                     rop = sv_compile_2op(sv, &sop, "re", &av);
2229                     sop->op_private |= OPpREFCOUNTED;
2230                     /* re_dup will OpREFCNT_inc */
2231                     OpREFCNT_set(sop, 1);
2232                     LEAVE;
2233
2234                     n = add_data(pRExC_state, 3, "nop");
2235                     RExC_rx->data->data[n] = (void*)rop;
2236                     RExC_rx->data->data[n+1] = (void*)sop;
2237                     RExC_rx->data->data[n+2] = (void*)av;
2238                     SvREFCNT_dec(sv);
2239                 }
2240                 else {                                          /* First pass */
2241                     if (PL_reginterp_cnt < ++RExC_seen_evals
2242                         && PL_curcop != &PL_compiling)
2243                         /* No compiled RE interpolated, has runtime
2244                            components ===> unsafe.  */
2245                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2246                     if (PL_tainting && PL_tainted)
2247                         FAIL("Eval-group in insecure regular expression");
2248                 }
2249                 
2250                 nextchar(pRExC_state);
2251                 if (logical) {
2252                     ret = reg_node(pRExC_state, LOGICAL);
2253                     if (!SIZE_ONLY)
2254                         ret->flags = 2;
2255                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2256                     /* deal with the length of this later - MJD */
2257                     return ret;
2258                 }
2259                 return reganode(pRExC_state, EVAL, n);
2260             }
2261             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2262             {
2263                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2264                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2265                         || RExC_parse[1] == '<'
2266                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2267                         I32 flag;
2268                         
2269                         ret = reg_node(pRExC_state, LOGICAL);
2270                         if (!SIZE_ONLY)
2271                             ret->flags = 1;
2272                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2273                         goto insert_if;
2274                     }
2275                 }
2276                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2277                     /* (?(1)...) */
2278                     parno = atoi(RExC_parse++);
2279
2280                     while (isDIGIT(*RExC_parse))
2281                         RExC_parse++;
2282                     ret = reganode(pRExC_state, GROUPP, parno);
2283                     
2284                     if ((c = *nextchar(pRExC_state)) != ')')
2285                         vFAIL("Switch condition not recognized");
2286                   insert_if:
2287                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2288                     br = regbranch(pRExC_state, &flags, 1);
2289                     if (br == NULL)
2290                         br = reganode(pRExC_state, LONGJMP, 0);
2291                     else
2292                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2293                     c = *nextchar(pRExC_state);
2294                     if (flags&HASWIDTH)
2295                         *flagp |= HASWIDTH;
2296                     if (c == '|') {
2297                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2298                         regbranch(pRExC_state, &flags, 1);
2299                         regtail(pRExC_state, ret, lastbr);
2300                         if (flags&HASWIDTH)
2301                             *flagp |= HASWIDTH;
2302                         c = *nextchar(pRExC_state);
2303                     }
2304                     else
2305                         lastbr = NULL;
2306                     if (c != ')')
2307                         vFAIL("Switch (?(condition)... contains too many branches");
2308                     ender = reg_node(pRExC_state, TAIL);
2309                     regtail(pRExC_state, br, ender);
2310                     if (lastbr) {
2311                         regtail(pRExC_state, lastbr, ender);
2312                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2313                     }
2314                     else
2315                         regtail(pRExC_state, ret, ender);
2316                     return ret;
2317                 }
2318                 else {
2319                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2320                 }
2321             }
2322             case 0:
2323                 RExC_parse--; /* for vFAIL to print correctly */
2324                 vFAIL("Sequence (? incomplete");
2325                 break;
2326             default:
2327                 --RExC_parse;
2328               parse_flags:      /* (?i) */
2329                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2330                     /* (?g), (?gc) and (?o) are useless here
2331                        and must be globally applied -- japhy */
2332
2333                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2334                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2335                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2336                             if (! (wastedflags & wflagbit) ) {
2337                                 wastedflags |= wflagbit;
2338                                 vWARN5(
2339                                     RExC_parse + 1,
2340                                     "Useless (%s%c) - %suse /%c modifier",
2341                                     flagsp == &negflags ? "?-" : "?",
2342                                     *RExC_parse,
2343                                     flagsp == &negflags ? "don't " : "",
2344                                     *RExC_parse
2345                                 );
2346                             }
2347                         }
2348                     }
2349                     else if (*RExC_parse == 'c') {
2350                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2351                             if (! (wastedflags & wasted_c) ) {
2352                                 wastedflags |= wasted_gc;
2353                                 vWARN3(
2354                                     RExC_parse + 1,
2355                                     "Useless (%sc) - %suse /gc modifier",
2356                                     flagsp == &negflags ? "?-" : "?",
2357                                     flagsp == &negflags ? "don't " : ""
2358                                 );
2359                             }
2360                         }
2361                     }
2362                     else { pmflag(flagsp, *RExC_parse); }
2363
2364                     ++RExC_parse;
2365                 }
2366                 if (*RExC_parse == '-') {
2367                     flagsp = &negflags;
2368                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2369                     ++RExC_parse;
2370                     goto parse_flags;
2371                 }
2372                 RExC_flags16 |= posflags;
2373                 RExC_flags16 &= ~negflags;
2374                 if (*RExC_parse == ':') {
2375                     RExC_parse++;
2376                     paren = ':';
2377                     break;
2378                 }               
2379               unknown:
2380                 if (*RExC_parse != ')') {
2381                     RExC_parse++;
2382                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2383                 }
2384                 nextchar(pRExC_state);
2385                 *flagp = TRYAGAIN;
2386                 return NULL;
2387             }
2388         }
2389         else {                  /* (...) */
2390             parno = RExC_npar;
2391             RExC_npar++;
2392             ret = reganode(pRExC_state, OPEN, parno);
2393             Set_Node_Length(ret, 1); /* MJD */
2394             Set_Node_Offset(ret, RExC_parse); /* MJD */
2395             open = 1;
2396         }
2397     }
2398     else                        /* ! paren */
2399         ret = NULL;
2400
2401     /* Pick up the branches, linking them together. */
2402     parse_start = RExC_parse;   /* MJD */
2403     br = regbranch(pRExC_state, &flags, 1);
2404     /*     branch_len = (paren != 0); */
2405     
2406     if (br == NULL)
2407         return(NULL);
2408     if (*RExC_parse == '|') {
2409         if (!SIZE_ONLY && RExC_extralen) {
2410             reginsert(pRExC_state, BRANCHJ, br);
2411         }
2412         else {                  /* MJD */
2413             reginsert(pRExC_state, BRANCH, br);
2414             Set_Node_Length(br, paren != 0);
2415             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2416         }
2417         have_branch = 1;
2418         if (SIZE_ONLY)
2419             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2420     }
2421     else if (paren == ':') {
2422         *flagp |= flags&SIMPLE;
2423     }
2424     if (open) {                         /* Starts with OPEN. */
2425         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2426     }
2427     else if (paren != '?')              /* Not Conditional */
2428         ret = br;
2429     *flagp |= flags & (SPSTART | HASWIDTH);
2430     lastbr = br;
2431     while (*RExC_parse == '|') {
2432         if (!SIZE_ONLY && RExC_extralen) {
2433             ender = reganode(pRExC_state, LONGJMP,0);
2434             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2435         }
2436         if (SIZE_ONLY)
2437             RExC_extralen += 2;         /* Account for LONGJMP. */
2438         nextchar(pRExC_state);
2439         br = regbranch(pRExC_state, &flags, 0);
2440         
2441         if (br == NULL)
2442             return(NULL);
2443         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2444         lastbr = br;
2445         if (flags&HASWIDTH)
2446             *flagp |= HASWIDTH;
2447         *flagp |= flags&SPSTART;
2448     }
2449
2450     if (have_branch || paren != ':') {
2451         /* Make a closing node, and hook it on the end. */
2452         switch (paren) {
2453         case ':':
2454             ender = reg_node(pRExC_state, TAIL);
2455             break;
2456         case 1:
2457             ender = reganode(pRExC_state, CLOSE, parno);
2458             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2459             Set_Node_Length(ender,1); /* MJD */
2460             break;
2461         case '<':
2462         case ',':
2463         case '=':
2464         case '!':
2465             *flagp &= ~HASWIDTH;
2466             /* FALL THROUGH */
2467         case '>':
2468             ender = reg_node(pRExC_state, SUCCEED);
2469             break;
2470         case 0:
2471             ender = reg_node(pRExC_state, END);
2472             break;
2473         }
2474         regtail(pRExC_state, lastbr, ender);
2475
2476         if (have_branch) {
2477             /* Hook the tails of the branches to the closing node. */
2478             for (br = ret; br != NULL; br = regnext(br)) {
2479                 regoptail(pRExC_state, br, ender);
2480             }
2481         }
2482     }
2483
2484     {
2485         char *p;
2486         static char parens[] = "=!<,>";
2487
2488         if (paren && (p = strchr(parens, paren))) {
2489             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2490             int flag = (p - parens) > 1;
2491
2492             if (paren == '>')
2493                 node = SUSPEND, flag = 0;
2494             reginsert(pRExC_state, node,ret);
2495             ret->flags = flag;
2496             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2497         }
2498     }
2499
2500     /* Check for proper termination. */
2501     if (paren) {
2502         RExC_flags16 = oregflags;
2503         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2504             RExC_parse = oregcomp_parse;
2505             vFAIL("Unmatched (");
2506         }
2507     }
2508     else if (!paren && RExC_parse < RExC_end) {
2509         if (*RExC_parse == ')') {
2510             RExC_parse++;
2511             vFAIL("Unmatched )");
2512         }
2513         else
2514             FAIL("Junk on end of regexp");      /* "Can't happen". */
2515         /* NOTREACHED */
2516     }
2517
2518     return(ret);
2519 }
2520
2521 /*
2522  - regbranch - one alternative of an | operator
2523  *
2524  * Implements the concatenation operator.
2525  */
2526 STATIC regnode *
2527 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2528 {
2529     register regnode *ret;
2530     register regnode *chain = NULL;
2531     register regnode *latest;
2532     I32 flags = 0, c = 0;
2533
2534     if (first)
2535         ret = NULL;
2536     else {
2537         if (!SIZE_ONLY && RExC_extralen)
2538             ret = reganode(pRExC_state, BRANCHJ,0);
2539         else {
2540             ret = reg_node(pRExC_state, BRANCH);
2541             Set_Node_Length(ret, 1);
2542         }
2543     }
2544         
2545     if (!first && SIZE_ONLY)
2546         RExC_extralen += 1;                     /* BRANCHJ */
2547
2548     *flagp = WORST;                     /* Tentatively. */
2549
2550     RExC_parse--;
2551     nextchar(pRExC_state);
2552     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2553         flags &= ~TRYAGAIN;
2554         latest = regpiece(pRExC_state, &flags);
2555         if (latest == NULL) {
2556             if (flags & TRYAGAIN)
2557                 continue;
2558             return(NULL);
2559         }
2560         else if (ret == NULL)
2561             ret = latest;
2562         *flagp |= flags&HASWIDTH;
2563         if (chain == NULL)      /* First piece. */
2564             *flagp |= flags&SPSTART;
2565         else {
2566             RExC_naughty++;
2567             regtail(pRExC_state, chain, latest);
2568         }
2569         chain = latest;
2570         c++;
2571     }
2572     if (chain == NULL) {        /* Loop ran zero times. */
2573         chain = reg_node(pRExC_state, NOTHING);
2574         if (ret == NULL)
2575             ret = chain;
2576     }
2577     if (c == 1) {
2578         *flagp |= flags&SIMPLE;
2579     }
2580
2581     return(ret);
2582 }
2583
2584 /*
2585  - regpiece - something followed by possible [*+?]
2586  *
2587  * Note that the branching code sequences used for ? and the general cases
2588  * of * and + are somewhat optimized:  they use the same NOTHING node as
2589  * both the endmarker for their branch list and the body of the last branch.
2590  * It might seem that this node could be dispensed with entirely, but the
2591  * endmarker role is not redundant.
2592  */
2593 STATIC regnode *
2594 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2595 {
2596     register regnode *ret;
2597     register char op;
2598     register char *next;
2599     I32 flags;
2600     char *origparse = RExC_parse;
2601     char *maxpos;
2602     I32 min;
2603     I32 max = REG_INFTY;
2604     char *parse_start;
2605
2606     ret = regatom(pRExC_state, &flags);
2607     if (ret == NULL) {
2608         if (flags & TRYAGAIN)
2609             *flagp |= TRYAGAIN;
2610         return(NULL);
2611     }
2612
2613     op = *RExC_parse;
2614
2615     if (op == '{' && regcurly(RExC_parse)) {
2616         parse_start = RExC_parse; /* MJD */
2617         next = RExC_parse + 1;
2618         maxpos = Nullch;
2619         while (isDIGIT(*next) || *next == ',') {
2620             if (*next == ',') {
2621                 if (maxpos)
2622                     break;
2623                 else
2624                     maxpos = next;
2625             }
2626             next++;
2627         }
2628         if (*next == '}') {             /* got one */
2629             if (!maxpos)
2630                 maxpos = next;
2631             RExC_parse++;
2632             min = atoi(RExC_parse);
2633             if (*maxpos == ',')
2634                 maxpos++;
2635             else
2636                 maxpos = RExC_parse;
2637             max = atoi(maxpos);
2638             if (!max && *maxpos != '0')
2639                 max = REG_INFTY;                /* meaning "infinity" */
2640             else if (max >= REG_INFTY)
2641                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2642             RExC_parse = next;
2643             nextchar(pRExC_state);
2644
2645         do_curly:
2646             if ((flags&SIMPLE)) {
2647                 RExC_naughty += 2 + RExC_naughty / 2;
2648                 reginsert(pRExC_state, CURLY, ret);
2649                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2650                 Set_Node_Cur_Length(ret);
2651             }
2652             else {
2653                 regnode *w = reg_node(pRExC_state, WHILEM);
2654
2655                 w->flags = 0;
2656                 regtail(pRExC_state, ret, w);
2657                 if (!SIZE_ONLY && RExC_extralen) {
2658                     reginsert(pRExC_state, LONGJMP,ret);
2659                     reginsert(pRExC_state, NOTHING,ret);
2660                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2661                 }
2662                 reginsert(pRExC_state, CURLYX,ret);
2663                                 /* MJD hk */
2664                 Set_Node_Offset(ret, parse_start+1);
2665                 Set_Node_Length(ret, 
2666                                 op == '{' ? (RExC_parse - parse_start) : 1);
2667                 
2668                 if (!SIZE_ONLY && RExC_extralen)
2669                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2670                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2671                 if (SIZE_ONLY)
2672                     RExC_whilem_seen++, RExC_extralen += 3;
2673                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2674             }
2675             ret->flags = 0;
2676
2677             if (min > 0)
2678                 *flagp = WORST;
2679             if (max > 0)
2680                 *flagp |= HASWIDTH;
2681             if (max && max < min)
2682                 vFAIL("Can't do {n,m} with n > m");
2683             if (!SIZE_ONLY) {
2684                 ARG1_SET(ret, min);
2685                 ARG2_SET(ret, max);
2686             }
2687
2688             goto nest_check;
2689         }
2690     }
2691
2692     if (!ISMULT1(op)) {
2693         *flagp = flags;
2694         return(ret);
2695     }
2696
2697 #if 0                           /* Now runtime fix should be reliable. */
2698
2699     /* if this is reinstated, don't forget to put this back into perldiag:
2700
2701             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2702
2703            (F) The part of the regexp subject to either the * or + quantifier
2704            could match an empty string. The {#} shows in the regular
2705            expression about where the problem was discovered.
2706
2707     */
2708
2709     if (!(flags&HASWIDTH) && op != '?')
2710       vFAIL("Regexp *+ operand could be empty");
2711 #endif
2712
2713     parse_start = RExC_parse;
2714     nextchar(pRExC_state);
2715
2716     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2717
2718     if (op == '*' && (flags&SIMPLE)) {
2719         reginsert(pRExC_state, STAR, ret);
2720         ret->flags = 0;
2721         RExC_naughty += 4;
2722     }
2723     else if (op == '*') {
2724         min = 0;
2725         goto do_curly;
2726     }
2727     else if (op == '+' && (flags&SIMPLE)) {
2728         reginsert(pRExC_state, PLUS, ret);
2729         ret->flags = 0;
2730         RExC_naughty += 3;
2731     }
2732     else if (op == '+') {
2733         min = 1;
2734         goto do_curly;
2735     }
2736     else if (op == '?') {
2737         min = 0; max = 1;
2738         goto do_curly;
2739     }
2740   nest_check:
2741     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2742         vWARN3(RExC_parse,
2743                "%.*s matches null string many times",
2744                RExC_parse - origparse,
2745                origparse);
2746     }
2747
2748     if (*RExC_parse == '?') {
2749         nextchar(pRExC_state);
2750         reginsert(pRExC_state, MINMOD, ret);
2751         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2752     }
2753     if (ISMULT2(RExC_parse)) {
2754         RExC_parse++;
2755         vFAIL("Nested quantifiers");
2756     }
2757
2758     return(ret);
2759 }
2760
2761 /*
2762  - regatom - the lowest level
2763  *
2764  * Optimization:  gobbles an entire sequence of ordinary characters so that
2765  * it can turn them into a single node, which is smaller to store and
2766  * faster to run.  Backslashed characters are exceptions, each becoming a
2767  * separate node; the code is simpler that way and it's not worth fixing.
2768  *
2769  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2770 STATIC regnode *
2771 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2772 {
2773     register regnode *ret = 0;
2774     I32 flags;
2775     char *parse_start = 0;
2776
2777     *flagp = WORST;             /* Tentatively. */
2778
2779 tryagain:
2780     switch (*RExC_parse) {
2781     case '^':
2782         RExC_seen_zerolen++;
2783         nextchar(pRExC_state);
2784         if (RExC_flags16 & PMf_MULTILINE)
2785             ret = reg_node(pRExC_state, MBOL);
2786         else if (RExC_flags16 & PMf_SINGLELINE)
2787             ret = reg_node(pRExC_state, SBOL);
2788         else
2789             ret = reg_node(pRExC_state, BOL);
2790         Set_Node_Length(ret, 1); /* MJD */
2791         break;
2792     case '$':
2793         nextchar(pRExC_state);
2794         if (*RExC_parse)
2795             RExC_seen_zerolen++;
2796         if (RExC_flags16 & PMf_MULTILINE)
2797             ret = reg_node(pRExC_state, MEOL);
2798         else if (RExC_flags16 & PMf_SINGLELINE)
2799             ret = reg_node(pRExC_state, SEOL);
2800         else
2801             ret = reg_node(pRExC_state, EOL);
2802         Set_Node_Length(ret, 1); /* MJD */
2803         break;
2804     case '.':
2805         nextchar(pRExC_state);
2806         if (RExC_flags16 & PMf_SINGLELINE)
2807             ret = reg_node(pRExC_state, SANY);
2808         else
2809             ret = reg_node(pRExC_state, REG_ANY);
2810         *flagp |= HASWIDTH|SIMPLE;
2811         RExC_naughty++;
2812         Set_Node_Length(ret, 1); /* MJD */
2813         break;
2814     case '[':
2815     {
2816         char *oregcomp_parse = ++RExC_parse;
2817         ret = regclass(pRExC_state);
2818         if (*RExC_parse != ']') {
2819             RExC_parse = oregcomp_parse;
2820             vFAIL("Unmatched [");
2821         }
2822         nextchar(pRExC_state);
2823         *flagp |= HASWIDTH|SIMPLE;
2824         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2825         break;
2826     }
2827     case '(':
2828         nextchar(pRExC_state);
2829         ret = reg(pRExC_state, 1, &flags);
2830         if (ret == NULL) {
2831                 if (flags & TRYAGAIN) {
2832                     if (RExC_parse == RExC_end) {
2833                          /* Make parent create an empty node if needed. */
2834                         *flagp |= TRYAGAIN;
2835                         return(NULL);
2836                     }
2837                     goto tryagain;
2838                 }
2839                 return(NULL);
2840         }
2841         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2842         break;
2843     case '|':
2844     case ')':
2845         if (flags & TRYAGAIN) {
2846             *flagp |= TRYAGAIN;
2847             return NULL;
2848         }
2849         vFAIL("Internal urp");
2850                                 /* Supposed to be caught earlier. */
2851         break;
2852     case '{':
2853         if (!regcurly(RExC_parse)) {
2854             RExC_parse++;
2855             goto defchar;
2856         }
2857         /* FALL THROUGH */
2858     case '?':
2859     case '+':
2860     case '*':
2861         RExC_parse++;
2862         vFAIL("Quantifier follows nothing");
2863         break;
2864     case '\\':
2865         switch (*++RExC_parse) {
2866         case 'A':
2867             RExC_seen_zerolen++;
2868             ret = reg_node(pRExC_state, SBOL);
2869             *flagp |= SIMPLE;
2870             nextchar(pRExC_state);
2871             Set_Node_Length(ret, 2); /* MJD */
2872             break;
2873         case 'G':
2874             ret = reg_node(pRExC_state, GPOS);
2875             RExC_seen |= REG_SEEN_GPOS;
2876             *flagp |= SIMPLE;
2877             nextchar(pRExC_state);
2878             Set_Node_Length(ret, 2); /* MJD */
2879             break;
2880         case 'Z':
2881             ret = reg_node(pRExC_state, SEOL);
2882             *flagp |= SIMPLE;
2883             RExC_seen_zerolen++;                /* Do not optimize RE away */
2884             nextchar(pRExC_state);
2885             break;
2886         case 'z':
2887             ret = reg_node(pRExC_state, EOS);
2888             *flagp |= SIMPLE;
2889             RExC_seen_zerolen++;                /* Do not optimize RE away */
2890             nextchar(pRExC_state);
2891             Set_Node_Length(ret, 2); /* MJD */
2892             break;
2893         case 'C':
2894             ret = reg_node(pRExC_state, CANY);
2895             RExC_seen |= REG_SEEN_CANY;
2896             *flagp |= HASWIDTH|SIMPLE;
2897             nextchar(pRExC_state);
2898             Set_Node_Length(ret, 2); /* MJD */
2899             break;
2900         case 'X':
2901             ret = reg_node(pRExC_state, CLUMP);
2902             *flagp |= HASWIDTH;
2903             nextchar(pRExC_state);
2904             Set_Node_Length(ret, 2); /* MJD */
2905             break;
2906         case 'w':
2907             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2908             *flagp |= HASWIDTH|SIMPLE;
2909             nextchar(pRExC_state);
2910             Set_Node_Length(ret, 2); /* MJD */
2911             break;
2912         case 'W':
2913             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2914             *flagp |= HASWIDTH|SIMPLE;
2915             nextchar(pRExC_state);
2916             Set_Node_Length(ret, 2); /* MJD */
2917             break;
2918         case 'b':
2919             RExC_seen_zerolen++;
2920             RExC_seen |= REG_SEEN_LOOKBEHIND;
2921             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2922             *flagp |= SIMPLE;
2923             nextchar(pRExC_state);
2924             Set_Node_Length(ret, 2); /* MJD */
2925             break;
2926         case 'B':
2927             RExC_seen_zerolen++;
2928             RExC_seen |= REG_SEEN_LOOKBEHIND;
2929             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2930             *flagp |= SIMPLE;
2931             nextchar(pRExC_state);
2932             Set_Node_Length(ret, 2); /* MJD */
2933             break;
2934         case 's':
2935             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2936             *flagp |= HASWIDTH|SIMPLE;
2937             nextchar(pRExC_state);
2938             Set_Node_Length(ret, 2); /* MJD */
2939             break;
2940         case 'S':
2941             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2942             *flagp |= HASWIDTH|SIMPLE;
2943             nextchar(pRExC_state);
2944             Set_Node_Length(ret, 2); /* MJD */
2945             break;
2946         case 'd':
2947             ret = reg_node(pRExC_state, DIGIT);
2948             *flagp |= HASWIDTH|SIMPLE;
2949             nextchar(pRExC_state);
2950             Set_Node_Length(ret, 2); /* MJD */
2951             break;
2952         case 'D':
2953             ret = reg_node(pRExC_state, NDIGIT);
2954             *flagp |= HASWIDTH|SIMPLE;
2955             nextchar(pRExC_state);
2956             Set_Node_Length(ret, 2); /* MJD */
2957             break;
2958         case 'p':
2959         case 'P':
2960             {   
2961                 char* oldregxend = RExC_end;
2962                 char* parse_start = RExC_parse;
2963
2964                 if (RExC_parse[1] == '{') {
2965                   /* a lovely hack--pretend we saw [\pX] instead */
2966                     RExC_end = strchr(RExC_parse, '}');
2967                     if (!RExC_end) {
2968                         U8 c = (U8)*RExC_parse;
2969                         RExC_parse += 2;
2970                         RExC_end = oldregxend;
2971                         vFAIL2("Missing right brace on \\%c{}", c);
2972                     }
2973                     RExC_end++;
2974                 }
2975                 else {
2976                     RExC_end = RExC_parse + 2;
2977                     if (RExC_end > oldregxend)
2978                         RExC_end = oldregxend;
2979                 }
2980                 RExC_parse--;
2981
2982                 ret = regclass(pRExC_state);
2983
2984                 RExC_end = oldregxend;
2985                 RExC_parse--;
2986                 Set_Node_Cur_Length(ret); /* MJD */
2987                 nextchar(pRExC_state);
2988                 *flagp |= HASWIDTH|SIMPLE;
2989             }
2990             break;
2991         case 'n':
2992         case 'r':
2993         case 't':
2994         case 'f':
2995         case 'e':
2996         case 'a':
2997         case 'x':
2998         case 'c':
2999         case '0':
3000             goto defchar;
3001         case '1': case '2': case '3': case '4':
3002         case '5': case '6': case '7': case '8': case '9':
3003             {
3004                 I32 num = atoi(RExC_parse);
3005
3006                 if (num > 9 && num >= RExC_npar)
3007                     goto defchar;
3008                 else {
3009                     char * parse_start = RExC_parse - 1; /* MJD */
3010                     while (isDIGIT(*RExC_parse))
3011                         RExC_parse++;
3012
3013                     if (!SIZE_ONLY && num > RExC_rx->nparens)
3014                         vFAIL("Reference to nonexistent group");
3015                     RExC_sawback = 1;
3016                     ret = reganode(pRExC_state, FOLD
3017                                    ? (LOC ? REFFL : REFF)
3018                                    : REF, num);
3019                     *flagp |= HASWIDTH;
3020                     
3021                     /* override incorrect value set in reganode MJD */
3022                     Set_Node_Offset(ret, parse_start+1); 
3023                     Set_Node_Cur_Length(ret); /* MJD */
3024                     RExC_parse--;
3025                     nextchar(pRExC_state);
3026                 }
3027             }
3028             break;
3029         case '\0':
3030             if (RExC_parse >= RExC_end)
3031                 FAIL("Trailing \\");
3032             /* FALL THROUGH */
3033         default:
3034             /* Do not generate `unrecognized' warnings here, we fall
3035                back into the quick-grab loop below */
3036             goto defchar;
3037         }
3038         break;
3039
3040     case '#':
3041         if (RExC_flags16 & PMf_EXTENDED) {
3042             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3043             if (RExC_parse < RExC_end)
3044                 goto tryagain;
3045         }
3046         /* FALL THROUGH */
3047
3048     default: {
3049             register STRLEN len;
3050             register UV ender;
3051             register char *p;
3052             char *oldp, *s;
3053             STRLEN numlen;
3054             STRLEN foldlen;
3055             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3056
3057             parse_start = RExC_parse - 1;
3058
3059             RExC_parse++;
3060
3061         defchar:
3062             ender = 0;
3063             ret = reg_node(pRExC_state, FOLD
3064                           ? (LOC ? EXACTFL : EXACTF)
3065                           : EXACT);
3066             s = STRING(ret);
3067             for (len = 0, p = RExC_parse - 1;
3068               len < 127 && p < RExC_end;
3069               len++)
3070             {
3071                 oldp = p;
3072
3073                 if (RExC_flags16 & PMf_EXTENDED)
3074                     p = regwhite(p, RExC_end);
3075                 switch (*p) {
3076                 case '^':
3077                 case '$':
3078                 case '.':
3079                 case '[':
3080                 case '(':
3081                 case ')':
3082                 case '|':
3083                     goto loopdone;
3084                 case '\\':
3085                     switch (*++p) {
3086                     case 'A':
3087                     case 'C':
3088                     case 'X':
3089                     case 'G':
3090                     case 'Z':
3091                     case 'z':
3092                     case 'w':
3093                     case 'W':
3094                     case 'b':
3095                     case 'B':
3096                     case 's':
3097                     case 'S':
3098                     case 'd':
3099                     case 'D':
3100                     case 'p':
3101                     case 'P':
3102                         --p;
3103                         goto loopdone;
3104                     case 'n':
3105                         ender = '\n';
3106                         p++;
3107                         break;
3108                     case 'r':
3109                         ender = '\r';
3110                         p++;
3111                         break;
3112                     case 't':
3113                         ender = '\t';
3114                         p++;
3115                         break;
3116                     case 'f':
3117                         ender = '\f';
3118                         p++;
3119                         break;
3120                     case 'e':
3121                           ender = ASCII_TO_NATIVE('\033');
3122                         p++;
3123                         break;
3124                     case 'a':
3125                           ender = ASCII_TO_NATIVE('\007');
3126                         p++;
3127                         break;
3128                     case 'x':
3129                         if (*++p == '{') {
3130                             char* e = strchr(p, '}');
3131         
3132                             if (!e) {
3133                                 RExC_parse = p + 1;
3134                                 vFAIL("Missing right brace on \\x{}");
3135                             }
3136                             else {
3137                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3138                                     | PERL_SCAN_DISALLOW_PREFIX;
3139                                 numlen = e - p - 1;
3140                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3141                                 if (ender > 0xff)
3142                                     RExC_utf8 = 1;
3143                                 /* numlen is generous */
3144                                 if (numlen + len >= 127) {
3145                                     p--;
3146                                     goto loopdone;
3147                                 }
3148                                 p = e + 1;
3149                             }
3150                         }
3151                         else {
3152                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3153                             numlen = 2;
3154                             ender = grok_hex(p, &numlen, &flags, NULL);
3155                             p += numlen;
3156                         }
3157                         break;
3158                     case 'c':
3159                         p++;
3160                         ender = UCHARAT(p++);
3161                         ender = toCTRL(ender);
3162                         break;
3163                     case '0': case '1': case '2': case '3':case '4':
3164                     case '5': case '6': case '7': case '8':case '9':
3165                         if (*p == '0' ||
3166                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3167                             I32 flags = 0;
3168                             numlen = 3;
3169                             ender = grok_oct(p, &numlen, &flags, NULL);
3170                             p += numlen;
3171                         }
3172                         else {
3173                             --p;
3174                             goto loopdone;
3175                         }
3176                         break;
3177                     case '\0':
3178                         if (p >= RExC_end)
3179                             FAIL("Trailing \\");
3180                         /* FALL THROUGH */
3181                     default:
3182                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3183                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3184                         goto normal_default;
3185                     }
3186                     break;
3187                 default:
3188                   normal_default:
3189                     if (UTF8_IS_START(*p) && UTF) {
3190                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3191                                                &numlen, 0);
3192                         p += numlen;
3193                     }
3194                     else
3195                         ender = *p++;
3196                     break;
3197                 }
3198                 if (RExC_flags16 & PMf_EXTENDED)
3199                     p = regwhite(p, RExC_end);
3200                 if (UTF && FOLD) {
3201                     /* Prime the casefolded buffer. */
3202                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3203                 }
3204                 if (ISMULT2(p)) { /* Back off on ?+*. */
3205                     if (len)
3206                         p = oldp;
3207                     else if (UTF) {
3208                          STRLEN unilen;
3209
3210                          if (FOLD) {
3211                               /* Emit all the Unicode characters. */
3212                               for (foldbuf = tmpbuf;
3213                                    foldlen;
3214                                    foldlen -= numlen) {
3215                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3216                                    if (numlen > 0) {
3217                                         reguni(pRExC_state, ender, s, &unilen);
3218                                         s       += unilen;
3219                                         len     += unilen;
3220                                         /* In EBCDIC the numlen
3221                                          * and unilen can differ. */
3222                                         foldbuf += numlen;
3223                                         if (numlen >= foldlen)
3224                                              break;
3225                                    }
3226                                    else
3227                                         break; /* "Can't happen." */
3228                               }
3229                          }
3230                          else {
3231                               reguni(pRExC_state, ender, s, &unilen);
3232                               if (unilen > 0) {
3233                                    s   += unilen;
3234                                    len += unilen;
3235                               }
3236                          }
3237                     }
3238                     else {
3239                         len++;
3240                         REGC(ender, s++);
3241                     }
3242                     break;
3243                 }
3244                 if (UTF) {
3245                      STRLEN unilen;
3246
3247                      if (FOLD) {
3248                           /* Emit all the Unicode characters. */
3249                           for (foldbuf = tmpbuf;
3250                                foldlen;
3251                                foldlen -= numlen) {
3252                                ender = utf8_to_uvchr(foldbuf, &numlen);
3253                                if (numlen > 0) {
3254                                     reguni(pRExC_state, ender, s, &unilen);
3255                                     len     += unilen;
3256                                     s       += unilen;
3257                                     /* In EBCDIC the numlen
3258                                      * and unilen can differ. */
3259                                     foldbuf += numlen;
3260                                     if (numlen >= foldlen)
3261                                          break;
3262                                }
3263                                else
3264                                     break;
3265                           }
3266                      }
3267                      else {
3268                           reguni(pRExC_state, ender, s, &unilen);
3269                           if (unilen > 0) {
3270                                s   += unilen;
3271                                len += unilen;
3272                           }
3273                      }
3274                      len--;
3275                 }
3276                 else
3277                     REGC(ender, s++);
3278             }
3279         loopdone:
3280             RExC_parse = p - 1;
3281             Set_Node_Cur_Length(ret); /* MJD */
3282             nextchar(pRExC_state);
3283             {
3284                 /* len is STRLEN which is unsigned, need to copy to signed */
3285                 IV iv = len;
3286                 if (iv < 0)
3287                     vFAIL("Internal disaster");
3288             }
3289             if (len > 0)
3290                 *flagp |= HASWIDTH;
3291             if (len == 1)
3292                 *flagp |= SIMPLE;
3293             if (!SIZE_ONLY)
3294                 STR_LEN(ret) = len;
3295             if (SIZE_ONLY)
3296                 RExC_size += STR_SZ(len);
3297             else
3298                 RExC_emit += STR_SZ(len);
3299         }
3300         break;
3301     }
3302
3303     /* If the encoding pragma is in effect recode the text of
3304      * any EXACT-kind nodes. */
3305     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3306          STRLEN oldlen = STR_LEN(ret);
3307          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3308
3309          if (RExC_utf8)
3310               SvUTF8_on(sv);
3311          if (sv_utf8_downgrade(sv, TRUE)) {
3312               char *s       = sv_recode_to_utf8(sv, PL_encoding);
3313               STRLEN newlen = SvCUR(sv);
3314          
3315               if (!SIZE_ONLY) {
3316                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3317                                          (int)oldlen, STRING(ret),
3318                                          (int)newlen, s));
3319                    Copy(s, STRING(ret), newlen, char);
3320                    STR_LEN(ret) += newlen - oldlen;
3321                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3322               } else
3323                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3324          }
3325     }
3326
3327     return(ret);
3328 }
3329
3330 STATIC char *
3331 S_regwhite(pTHX_ char *p, char *e)
3332 {
3333     while (p < e) {
3334         if (isSPACE(*p))
3335             ++p;
3336         else if (*p == '#') {
3337             do {
3338                 p++;
3339             } while (p < e && *p != '\n');
3340         }
3341         else
3342             break;
3343     }
3344     return p;
3345 }
3346
3347 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3348    Character classes ([:foo:]) can also be negated ([:^foo:]).
3349    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3350    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3351    but trigger failures because they are currently unimplemented. */
3352
3353 #define POSIXCC_DONE(c)   ((c) == ':')
3354 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3355 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3356
3357 STATIC I32
3358 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3359 {
3360     char *posixcc = 0;
3361     I32 namedclass = OOB_NAMEDCLASS;
3362
3363     if (value == '[' && RExC_parse + 1 < RExC_end &&
3364         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3365         POSIXCC(UCHARAT(RExC_parse))) {
3366         char  c = UCHARAT(RExC_parse);
3367         char* s = RExC_parse++;
3368         
3369         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3370             RExC_parse++;
3371         if (RExC_parse == RExC_end)
3372             /* Grandfather lone [:, [=, [. */
3373             RExC_parse = s;
3374         else {
3375             char* t = RExC_parse++; /* skip over the c */
3376
3377             if (UCHARAT(RExC_parse) == ']') {
3378                 RExC_parse++; /* skip over the ending ] */
3379                 posixcc = s + 1;
3380                 if (*s == ':') {
3381                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3382                     I32 skip = 5; /* the most common skip */
3383
3384                     switch (*posixcc) {
3385                     case 'a':
3386                         if (strnEQ(posixcc, "alnum", 5))
3387                             namedclass =
3388                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3389                         else if (strnEQ(posixcc, "alpha", 5))
3390                             namedclass =
3391                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3392                         else if (strnEQ(posixcc, "ascii", 5))
3393                             namedclass =
3394                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3395                         break;
3396                     case 'b':
3397                         if (strnEQ(posixcc, "blank", 5))
3398                             namedclass =
3399                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3400                         break;
3401                     case 'c':
3402                         if (strnEQ(posixcc, "cntrl", 5))
3403                             namedclass =
3404                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3405                         break;
3406                     case 'd':
3407                         if (strnEQ(posixcc, "digit", 5))
3408                             namedclass =
3409                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3410                         break;
3411                     case 'g':
3412                         if (strnEQ(posixcc, "graph", 5))
3413                             namedclass =
3414                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3415                         break;
3416                     case 'l':
3417                         if (strnEQ(posixcc, "lower", 5))
3418                             namedclass =
3419                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3420                         break;
3421                     case 'p':
3422                         if (strnEQ(posixcc, "print", 5))
3423                             namedclass =
3424                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3425                         else if (strnEQ(posixcc, "punct", 5))
3426                             namedclass =
3427                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3428                         break;
3429                     case 's':
3430                         if (strnEQ(posixcc, "space", 5))
3431                             namedclass =
3432                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3433                         break;
3434                     case 'u':
3435                         if (strnEQ(posixcc, "upper", 5))
3436                             namedclass =
3437                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3438                         break;
3439                     case 'w': /* this is not POSIX, this is the Perl \w */
3440                         if (strnEQ(posixcc, "word", 4)) {
3441                             namedclass =
3442                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3443                             skip = 4;
3444                         }
3445                         break;
3446                     case 'x':
3447                         if (strnEQ(posixcc, "xdigit", 6)) {
3448                             namedclass =
3449                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3450                             skip = 6;
3451                         }
3452                         break;
3453                     }
3454                     if (namedclass == OOB_NAMEDCLASS ||
3455                         posixcc[skip] != ':' ||
3456                         posixcc[skip+1] != ']')
3457                     {
3458                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3459                                       t - s - 1, s + 1);
3460                     }
3461                 } else if (!SIZE_ONLY) {
3462                     /* [[=foo=]] and [[.foo.]] are still future. */
3463
3464                     /* adjust RExC_parse so the warning shows after
3465                        the class closes */
3466                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3467                         RExC_parse++;
3468                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3469                 }
3470             } else {
3471                 /* Maternal grandfather:
3472                  * "[:" ending in ":" but not in ":]" */
3473                 RExC_parse = s;
3474             }
3475         }
3476     }
3477
3478     return namedclass;
3479 }
3480
3481 STATIC void
3482 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3483 {
3484     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3485         char *s = RExC_parse;
3486         char  c = *s++;
3487
3488         while(*s && isALNUM(*s))
3489             s++;
3490         if (*s && c == *s && s[1] == ']') {
3491             if (ckWARN(WARN_REGEXP))
3492                 vWARN3(s+2,
3493                         "POSIX syntax [%c %c] belongs inside character classes",
3494                         c, c);
3495
3496             /* [[=foo=]] and [[.foo.]] are still future. */
3497             if (POSIXCC_NOTYET(c)) {
3498                 /* adjust RExC_parse so the error shows after
3499                    the class closes */
3500                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3501                     ;
3502                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3503             }
3504         }
3505     }
3506 }
3507
3508 STATIC regnode *
3509 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3510 {
3511     register UV value;
3512     register UV nextvalue;
3513     register IV prevvalue = OOB_UNICODE;
3514     register IV range = 0;
3515     register regnode *ret;
3516     STRLEN numlen;
3517     IV namedclass;
3518     char *rangebegin = 0;
3519     bool need_class = 0;
3520     SV *listsv = Nullsv;
3521     register char *e;
3522     UV n;
3523     bool optimize_invert   = TRUE;
3524     AV* unicode_alternate  = 0;
3525
3526     ret = reganode(pRExC_state, ANYOF, 0);
3527
3528     if (!SIZE_ONLY)
3529         ANYOF_FLAGS(ret) = 0;
3530
3531     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3532         RExC_naughty++;
3533         RExC_parse++;
3534         if (!SIZE_ONLY)
3535             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3536     }
3537
3538     if (SIZE_ONLY)
3539         RExC_size += ANYOF_SKIP;
3540     else {
3541         RExC_emit += ANYOF_SKIP;
3542         if (FOLD)
3543             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3544         if (LOC)
3545             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3546         ANYOF_BITMAP_ZERO(ret);
3547         listsv = newSVpvn("# comment\n", 10);
3548     }
3549
3550     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3551
3552     if (!SIZE_ONLY && POSIXCC(nextvalue))
3553         checkposixcc(pRExC_state);
3554
3555     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3556     if (UCHARAT(RExC_parse) == ']')
3557         goto charclassloop;
3558
3559     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3560
3561     charclassloop:
3562
3563         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3564
3565         if (!range)
3566             rangebegin = RExC_parse;
3567         if (UTF) {
3568             value = utf8n_to_uvchr((U8*)RExC_parse,
3569                                    RExC_end - RExC_parse,
3570                                    &numlen, 0);
3571             RExC_parse += numlen;
3572         }
3573         else
3574             value = UCHARAT(RExC_parse++);
3575         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3576         if (value == '[' && POSIXCC(nextvalue))
3577             namedclass = regpposixcc(pRExC_state, value);
3578         else if (value == '\\') {
3579             if (UTF) {
3580                 value = utf8n_to_uvchr((U8*)RExC_parse,
3581                                    RExC_end - RExC_parse,
3582                                    &numlen, 0);
3583                 RExC_parse += numlen;
3584             }
3585             else
3586                 value = UCHARAT(RExC_parse++);
3587             /* Some compilers cannot handle switching on 64-bit integer
3588              * values, therefore value cannot be an UV.  Yes, this will
3589              * be a problem later if we want switch on Unicode.
3590              * A similar issue a little bit later when switching on
3591              * namedclass. --jhi */
3592             switch ((I32)value) {
3593             case 'w':   namedclass = ANYOF_ALNUM;       break;
3594             case 'W':   namedclass = ANYOF_NALNUM;      break;
3595             case 's':   namedclass = ANYOF_SPACE;       break;
3596             case 'S':   namedclass = ANYOF_NSPACE;      break;
3597             case 'd':   namedclass = ANYOF_DIGIT;       break;
3598             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3599             case 'p':
3600             case 'P':
3601                 if (RExC_parse >= RExC_end)
3602                     vFAIL2("Empty \\%c{}", (U8)value);
3603                 if (*RExC_parse == '{') {
3604                     U8 c = (U8)value;
3605                     e = strchr(RExC_parse++, '}');
3606                     if (!e)
3607                         vFAIL2("Missing right brace on \\%c{}", c);
3608                     while (isSPACE(UCHARAT(RExC_parse)))
3609                         RExC_parse++;
3610                     if (e == RExC_parse)
3611                         vFAIL2("Empty \\%c{}", c);
3612                     n = e - RExC_parse;
3613                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3614                         n--;
3615                 }
3616                 else {
3617                     e = RExC_parse;
3618                     n = 1;
3619                 }
3620                 if (!SIZE_ONLY) {
3621                     if (UCHARAT(RExC_parse) == '^') {
3622                          RExC_parse++;
3623                          n--;
3624                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3625                          while (isSPACE(UCHARAT(RExC_parse))) {
3626                               RExC_parse++;
3627                               n--;
3628                          }
3629                     }
3630                     if (value == 'p')
3631                          Perl_sv_catpvf(aTHX_ listsv,
3632                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3633                     else
3634                          Perl_sv_catpvf(aTHX_ listsv,
3635                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3636                 }
3637                 RExC_parse = e + 1;
3638                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3639                 continue;
3640             case 'n':   value = '\n';                   break;
3641             case 'r':   value = '\r';                   break;
3642             case 't':   value = '\t';                   break;
3643             case 'f':   value = '\f';                   break;
3644             case 'b':   value = '\b';                   break;
3645             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3646             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3647             case 'x':
3648                 if (*RExC_parse == '{') {
3649                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3650                         | PERL_SCAN_DISALLOW_PREFIX;
3651                     e = strchr(RExC_parse++, '}');
3652                     if (!e)
3653                         vFAIL("Missing right brace on \\x{}");
3654
3655                     numlen = e - RExC_parse;
3656                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3657                     RExC_parse = e + 1;
3658                 }
3659                 else {
3660                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3661                     numlen = 2;
3662                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3663                     RExC_parse += numlen;
3664                 }
3665                 break;
3666             case 'c':
3667                 value = UCHARAT(RExC_parse++);
3668                 value = toCTRL(value);
3669                 break;
3670             case '0': case '1': case '2': case '3': case '4':
3671             case '5': case '6': case '7': case '8': case '9':
3672             {
3673                 I32 flags = 0;
3674                 numlen = 3;
3675                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3676                 RExC_parse += numlen;
3677                 break;
3678             }
3679             default:
3680                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3681                     vWARN2(RExC_parse,
3682                            "Unrecognized escape \\%c in character class passed through",
3683                            (int)value);
3684                 break;
3685             }
3686         } /* end of \blah */
3687
3688         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3689
3690             if (!SIZE_ONLY && !need_class)
3691                 ANYOF_CLASS_ZERO(ret);
3692
3693             need_class = 1;
3694
3695             /* a bad range like a-\d, a-[:digit:] ? */
3696             if (range) {
3697                 if (!SIZE_ONLY) {
3698                     if (ckWARN(WARN_REGEXP))
3699                         vWARN4(RExC_parse,
3700                                "False [] range \"%*.*s\"",
3701                                RExC_parse - rangebegin,
3702                                RExC_parse - rangebegin,
3703                                rangebegin);
3704                     if (prevvalue < 256) {
3705                         ANYOF_BITMAP_SET(ret, prevvalue);
3706                         ANYOF_BITMAP_SET(ret, '-');
3707                     }
3708                     else {
3709                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3710                         Perl_sv_catpvf(aTHX_ listsv,
3711                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3712                     }
3713                 }
3714
3715                 range = 0; /* this was not a true range */
3716             }
3717
3718             if (!SIZE_ONLY) {
3719                 if (namedclass > OOB_NAMEDCLASS)
3720                     optimize_invert = FALSE;
3721                 /* Possible truncation here but in some 64-bit environments
3722                  * the compiler gets heartburn about switch on 64-bit values.
3723                  * A similar issue a little earlier when switching on value.
3724                  * --jhi */
3725                 switch ((I32)namedclass) {
3726                 case ANYOF_ALNUM:
3727                     if (LOC)
3728                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3729                     else {
3730                         for (value = 0; value < 256; value++)
3731                             if (isALNUM(value))
3732                                 ANYOF_BITMAP_SET(ret, value);
3733                     }
3734                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3735                     break;
3736                 case ANYOF_NALNUM:
3737                     if (LOC)
3738                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3739                     else {
3740                         for (value = 0; value < 256; value++)
3741                             if (!isALNUM(value))
3742                                 ANYOF_BITMAP_SET(ret, value);
3743                     }
3744                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3745                     break;
3746                 case ANYOF_ALNUMC:
3747                     if (LOC)
3748                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3749                     else {
3750                         for (value = 0; value < 256; value++)
3751                             if (isALNUMC(value))
3752                                 ANYOF_BITMAP_SET(ret, value);
3753                     }
3754                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3755                     break;
3756                 case ANYOF_NALNUMC:
3757                     if (LOC)
3758                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3759                     else {
3760                         for (value = 0; value < 256; value++)
3761                             if (!isALNUMC(value))
3762                                 ANYOF_BITMAP_SET(ret, value);
3763                     }
3764                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3765                     break;
3766                 case ANYOF_ALPHA:
3767                     if (LOC)
3768                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3769                     else {
3770                         for (value = 0; value < 256; value++)
3771                             if (isALPHA(value))
3772                                 ANYOF_BITMAP_SET(ret, value);
3773                     }
3774                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3775                     break;
3776                 case ANYOF_NALPHA:
3777                     if (LOC)
3778                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3779                     else {
3780                         for (value = 0; value < 256; value++)
3781                             if (!isALPHA(value))
3782                                 ANYOF_BITMAP_SET(ret, value);
3783                     }
3784                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3785                     break;
3786                 case ANYOF_ASCII:
3787                     if (LOC)
3788                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3789                     else {
3790 #ifndef EBCDIC
3791                         for (value = 0; value < 128; value++)
3792                             ANYOF_BITMAP_SET(ret, value);
3793 #else  /* EBCDIC */
3794                         for (value = 0; value < 256; value++) {
3795                             if (isASCII(value))
3796                                 ANYOF_BITMAP_SET(ret, value);
3797                         }
3798 #endif /* EBCDIC */
3799                     }
3800                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3801                     break;
3802                 case ANYOF_NASCII:
3803                     if (LOC)
3804                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3805                     else {
3806 #ifndef EBCDIC
3807                         for (value = 128; value < 256; value++)
3808                             ANYOF_BITMAP_SET(ret, value);
3809 #else  /* EBCDIC */
3810                         for (value = 0; value < 256; value++) {
3811                             if (!isASCII(value))
3812                                 ANYOF_BITMAP_SET(ret, value);
3813                         }
3814 #endif /* EBCDIC */
3815                     }
3816                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3817                     break;
3818                 case ANYOF_BLANK:
3819                     if (LOC)
3820                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3821                     else {
3822                         for (value = 0; value < 256; value++)
3823                             if (isBLANK(value))
3824                                 ANYOF_BITMAP_SET(ret, value);
3825                     }
3826                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3827                     break;
3828                 case ANYOF_NBLANK:
3829                     if (LOC)
3830                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3831                     else {
3832                         for (value = 0; value < 256; value++)
3833                             if (!isBLANK(value))
3834                                 ANYOF_BITMAP_SET(ret, value);
3835                     }
3836                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3837                     break;
3838                 case ANYOF_CNTRL:
3839                     if (LOC)
3840                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3841                     else {
3842                         for (value = 0; value < 256; value++)
3843                             if (isCNTRL(value))
3844                                 ANYOF_BITMAP_SET(ret, value);
3845                     }
3846                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3847                     break;
3848                 case ANYOF_NCNTRL:
3849                     if (LOC)
3850                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3851                     else {
3852                         for (value = 0; value < 256; value++)
3853                             if (!isCNTRL(value))
3854                                 ANYOF_BITMAP_SET(ret, value);
3855                     }
3856                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3857                     break;
3858                 case ANYOF_DIGIT:
3859                     if (LOC)
3860                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3861                     else {
3862                         /* consecutive digits assumed */
3863                         for (value = '0'; value <= '9'; value++)
3864                             ANYOF_BITMAP_SET(ret, value);
3865                     }
3866                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3867                     break;
3868                 case ANYOF_NDIGIT:
3869                     if (LOC)
3870                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3871                     else {
3872                         /* consecutive digits assumed */
3873                         for (value = 0; value < '0'; value++)
3874                             ANYOF_BITMAP_SET(ret, value);
3875                         for (value = '9' + 1; value < 256; value++)
3876                             ANYOF_BITMAP_SET(ret, value);
3877                     }
3878                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3879                     break;
3880                 case ANYOF_GRAPH:
3881                     if (LOC)
3882                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3883                     else {
3884                         for (value = 0; value < 256; value++)
3885                             if (isGRAPH(value))
3886                                 ANYOF_BITMAP_SET(ret, value);
3887                     }
3888                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3889                     break;
3890                 case ANYOF_NGRAPH:
3891                     if (LOC)
3892                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3893                     else {
3894                         for (value = 0; value < 256; value++)
3895                             if (!isGRAPH(value))
3896                                 ANYOF_BITMAP_SET(ret, value);
3897                     }
3898