This is 5.8.0.
[perl.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if ((U32)data->offset_float_max > (U32)I32_MAX)
509                 data->offset_float_max = I32_MAX;
510             if (data->flags & SF_BEFORE_EOL)
511                 data->flags
512                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513             else
514                 data->flags &= ~SF_FL_BEFORE_EOL;
515         }
516     }
517     SvCUR_set(data->last_found, 0);
518     data->last_end = -1;
519     data->flags &= ~SF_BEFORE_EOL;
520 }
521
522 /* Can match anything (initialization) */
523 STATIC void
524 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
525 {
526     ANYOF_CLASS_ZERO(cl);
527     ANYOF_BITMAP_SETALL(cl);
528     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
529     if (LOC)
530         cl->flags |= ANYOF_LOCALE;
531 }
532
533 /* Can match anything (initialization) */
534 STATIC int
535 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
536 {
537     int value;
538
539     for (value = 0; value <= ANYOF_MAX; value += 2)
540         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541             return 1;
542     if (!(cl->flags & ANYOF_UNICODE_ALL))
543         return 0;
544     if (!ANYOF_BITMAP_TESTALLSET(cl))
545         return 0;
546     return 1;
547 }
548
549 /* Can match anything (initialization) */
550 STATIC void
551 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 {
553     Zero(cl, 1, struct regnode_charclass_class);
554     cl->type = ANYOF;
555     cl_anything(pRExC_state, cl);
556 }
557
558 STATIC void
559 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
560 {
561     Zero(cl, 1, struct regnode_charclass_class);
562     cl->type = ANYOF;
563     cl_anything(pRExC_state, cl);
564     if (LOC)
565         cl->flags |= ANYOF_LOCALE;
566 }
567
568 /* 'And' a given class with another one.  Can create false positives */
569 /* We assume that cl is not inverted */
570 STATIC void
571 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572          struct regnode_charclass_class *and_with)
573 {
574     if (!(and_with->flags & ANYOF_CLASS)
575         && !(cl->flags & ANYOF_CLASS)
576         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577         && !(and_with->flags & ANYOF_FOLD)
578         && !(cl->flags & ANYOF_FOLD)) {
579         int i;
580
581         if (and_with->flags & ANYOF_INVERT)
582             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583                 cl->bitmap[i] &= ~and_with->bitmap[i];
584         else
585             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586                 cl->bitmap[i] &= and_with->bitmap[i];
587     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588     if (!(and_with->flags & ANYOF_EOS))
589         cl->flags &= ~ANYOF_EOS;
590
591     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592         cl->flags &= ~ANYOF_UNICODE_ALL;
593         cl->flags |= ANYOF_UNICODE;
594         ARG_SET(cl, ARG(and_with));
595     }
596     if (!(and_with->flags & ANYOF_UNICODE_ALL))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599         cl->flags &= ~ANYOF_UNICODE;
600 }
601
602 /* 'OR' a given class with another one.  Can create false positives */
603 /* We assume that cl is not inverted */
604 STATIC void
605 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
606 {
607     if (or_with->flags & ANYOF_INVERT) {
608         /* We do not use
609          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610          *   <= (B1 | !B2) | (CL1 | !CL2)
611          * which is wasteful if CL2 is small, but we ignore CL2:
612          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613          * XXXX Can we handle case-fold?  Unclear:
614          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616          */
617         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618              && !(or_with->flags & ANYOF_FOLD)
619              && !(cl->flags & ANYOF_FOLD) ) {
620             int i;
621
622             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623                 cl->bitmap[i] |= ~or_with->bitmap[i];
624         } /* XXXX: logic is complicated otherwise */
625         else {
626             cl_anything(pRExC_state, cl);
627         }
628     } else {
629         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
631              && (!(or_with->flags & ANYOF_FOLD)
632                  || (cl->flags & ANYOF_FOLD)) ) {
633             int i;
634
635             /* OR char bitmap and class bitmap separately */
636             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637                 cl->bitmap[i] |= or_with->bitmap[i];
638             if (or_with->flags & ANYOF_CLASS) {
639                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640                     cl->classflags[i] |= or_with->classflags[i];
641                 cl->flags |= ANYOF_CLASS;
642             }
643         }
644         else { /* XXXX: logic is complicated, leave it along for a moment. */
645             cl_anything(pRExC_state, cl);
646         }
647     }
648     if (or_with->flags & ANYOF_EOS)
649         cl->flags |= ANYOF_EOS;
650
651     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652         ARG(cl) != ARG(or_with)) {
653         cl->flags |= ANYOF_UNICODE_ALL;
654         cl->flags &= ~ANYOF_UNICODE;
655     }
656     if (or_with->flags & ANYOF_UNICODE_ALL) {
657         cl->flags |= ANYOF_UNICODE_ALL;
658         cl->flags &= ~ANYOF_UNICODE;
659     }
660 }
661
662 /*
663  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664  * These need to be revisited when a newer toolchain becomes available.
665  */
666 #if defined(__sparc64__) && defined(__GNUC__)
667 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668 #       undef  SPARC64_GCC_WORKAROUND
669 #       define SPARC64_GCC_WORKAROUND 1
670 #   endif
671 #endif
672
673 /* REx optimizer.  Converts nodes into quickier variants "in place".
674    Finds fixed substrings.  */
675
676 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677    to the position after last scanned or to NULL. */
678
679 STATIC I32
680 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
681                         /* scanp: Start here (read-write). */
682                         /* deltap: Write maxlen-minlen here. */
683                         /* last: Stop before this one. */
684 {
685     I32 min = 0, pars = 0, code;
686     regnode *scan = *scanp, *next;
687     I32 delta = 0;
688     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
689     int is_inf_internal = 0;            /* The studied chunk is infinite */
690     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691     scan_data_t data_fake;
692     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
693
694     while (scan && OP(scan) != END && scan < last) {
695         /* Peephole optimizer: */
696
697         if (PL_regkind[(U8)OP(scan)] == EXACT) {
698             /* Merge several consecutive EXACTish nodes into one. */
699             regnode *n = regnext(scan);
700             U32 stringok = 1;
701 #ifdef DEBUGGING
702             regnode *stop = scan;
703 #endif
704
705             next = scan + NODE_SZ_STR(scan);
706             /* Skip NOTHING, merge EXACT*. */
707             while (n &&
708                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
709                      (stringok && (OP(n) == OP(scan))))
710                    && NEXT_OFF(n)
711                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712                 if (OP(n) == TAIL || n > next)
713                     stringok = 0;
714                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
715                     NEXT_OFF(scan) += NEXT_OFF(n);
716                     next = n + NODE_STEP_REGNODE;
717 #ifdef DEBUGGING
718                     if (stringok)
719                         stop = n;
720 #endif
721                     n = regnext(n);
722                 }
723                 else if (stringok) {
724                     int oldl = STR_LEN(scan);
725                     regnode *nnext = regnext(n);
726
727                     if (oldl + STR_LEN(n) > U8_MAX)
728                         break;
729                     NEXT_OFF(scan) += NEXT_OFF(n);
730                     STR_LEN(scan) += STR_LEN(n);
731                     next = n + NODE_SZ_STR(n);
732                     /* Now we can overwrite *n : */
733                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
734 #ifdef DEBUGGING
735                     stop = next - 1;
736 #endif
737                     n = nnext;
738                 }
739             }
740
741             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
742 /*
743   Two problematic code points in Unicode casefolding of EXACT nodes:
744
745    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
747
748    which casefold to
749
750    Unicode                      UTF-8
751
752    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
753    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
754
755    This means that in case-insensitive matching (or "loose matching",
756    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757    length of the above casefolded versions) can match a target string
758    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759    This would rather mess up the minimum length computation.
760
761    What we'll do is to look for the tail four bytes, and then peek
762    at the preceding two bytes to see whether we need to decrease
763    the minimum length by four (six minus two).
764
765    Thanks to the design of UTF-8, there cannot be false matches:
766    A sequence of valid UTF-8 bytes cannot be a subsequence of
767    another valid sequence of UTF-8 bytes.
768
769 */
770                  char *s0 = STRING(scan), *s, *t;
771                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772                  char *t0 = "\xcc\x88\xcc\x81";
773                  char *t1 = t0 + 3;
774                  
775                  for (s = s0 + 2;
776                       s < s2 && (t = ninstr(s, s1, t0, t1));
777                       s = t + 4) {
778                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
780                            min -= 4;
781                  }
782             }
783
784 #ifdef DEBUGGING
785             /* Allow dumping */
786             n = scan + NODE_SZ_STR(scan);
787             while (n <= stop) {
788                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
789                     OP(n) = OPTIMIZED;
790                     NEXT_OFF(n) = 0;
791                 }
792                 n++;
793             }
794 #endif
795         }
796         /* Follow the next-chain of the current node and optimize
797            away all the NOTHINGs from it.  */
798         if (OP(scan) != CURLYX) {
799             int max = (reg_off_by_arg[OP(scan)]
800                        ? I32_MAX
801                        /* I32 may be smaller than U16 on CRAYs! */
802                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
803             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
804             int noff;
805             regnode *n = scan;
806         
807             /* Skip NOTHING and LONGJMP. */
808             while ((n = regnext(n))
809                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
810                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811                    && off + noff < max)
812                 off += noff;
813             if (reg_off_by_arg[OP(scan)])
814                 ARG(scan) = off;
815             else
816                 NEXT_OFF(scan) = off;
817         }
818         /* The principal pseudo-switch.  Cannot be a switch, since we
819            look into several different things.  */
820         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
821                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822             next = regnext(scan);
823             code = OP(scan);
824         
825             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
826                 I32 max1 = 0, min1 = I32_MAX, num = 0;
827                 struct regnode_charclass_class accum;
828                 
829                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
831                 if (flags & SCF_DO_STCLASS)
832                     cl_init_zero(pRExC_state, &accum);
833                 while (OP(scan) == code) {
834                     I32 deltanext, minnext, f = 0, fake;
835                     struct regnode_charclass_class this_class;
836
837                     num++;
838                     data_fake.flags = 0;
839                     if (data) {         
840                         data_fake.whilem_c = data->whilem_c;
841                         data_fake.last_closep = data->last_closep;
842                     }
843                     else
844                         data_fake.last_closep = &fake;
845                     next = regnext(scan);
846                     scan = NEXTOPER(scan);
847                     if (code != BRANCH)
848                         scan = NEXTOPER(scan);
849                     if (flags & SCF_DO_STCLASS) {
850                         cl_init(pRExC_state, &this_class);
851                         data_fake.start_class = &this_class;
852                         f = SCF_DO_STCLASS_AND;
853                     }           
854                     if (flags & SCF_WHILEM_VISITED_POS)
855                         f |= SCF_WHILEM_VISITED_POS;
856                     /* we suppose the run is continuous, last=next...*/
857                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
858                                           next, &data_fake, f);
859                     if (min1 > minnext)
860                         min1 = minnext;
861                     if (max1 < minnext + deltanext)
862                         max1 = minnext + deltanext;
863                     if (deltanext == I32_MAX)
864                         is_inf = is_inf_internal = 1;
865                     scan = next;
866                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867                         pars++;
868                     if (data && (data_fake.flags & SF_HAS_EVAL))
869                         data->flags |= SF_HAS_EVAL;
870                     if (data)
871                         data->whilem_c = data_fake.whilem_c;
872                     if (flags & SCF_DO_STCLASS)
873                         cl_or(pRExC_state, &accum, &this_class);
874                     if (code == SUSPEND)
875                         break;
876                 }
877                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878                     min1 = 0;
879                 if (flags & SCF_DO_SUBSTR) {
880                     data->pos_min += min1;
881                     data->pos_delta += max1 - min1;
882                     if (max1 != min1 || is_inf)
883                         data->longest = &(data->longest_float);
884                 }
885                 min += min1;
886                 delta += max1 - min1;
887                 if (flags & SCF_DO_STCLASS_OR) {
888                     cl_or(pRExC_state, data->start_class, &accum);
889                     if (min1) {
890                         cl_and(data->start_class, &and_with);
891                         flags &= ~SCF_DO_STCLASS;
892                     }
893                 }
894                 else if (flags & SCF_DO_STCLASS_AND) {
895                     if (min1) {
896                         cl_and(data->start_class, &accum);
897                         flags &= ~SCF_DO_STCLASS;
898                     }
899                     else {
900                         /* Switch to OR mode: cache the old value of
901                          * data->start_class */
902                         StructCopy(data->start_class, &and_with,
903                                    struct regnode_charclass_class);
904                         flags &= ~SCF_DO_STCLASS_AND;
905                         StructCopy(&accum, data->start_class,
906                                    struct regnode_charclass_class);
907                         flags |= SCF_DO_STCLASS_OR;
908                         data->start_class->flags |= ANYOF_EOS;
909                     }
910                 }
911             }
912             else if (code == BRANCHJ)   /* single branch is optimized. */
913                 scan = NEXTOPER(NEXTOPER(scan));
914             else                        /* single branch is optimized. */
915                 scan = NEXTOPER(scan);
916             continue;
917         }
918         else if (OP(scan) == EXACT) {
919             I32 l = STR_LEN(scan);
920             UV uc = *((U8*)STRING(scan));
921             if (UTF) {
922                 U8 *s = (U8*)STRING(scan);
923                 l = utf8_length(s, s + l);
924                 uc = utf8_to_uvchr(s, NULL);
925             }
926             min += l;
927             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
928                 /* The code below prefers earlier match for fixed
929                    offset, later match for variable offset.  */
930                 if (data->last_end == -1) { /* Update the start info. */
931                     data->last_start_min = data->pos_min;
932                     data->last_start_max = is_inf
933                         ? I32_MAX : data->pos_min + data->pos_delta;
934                 }
935                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
936                 if (UTF)
937                     SvUTF8_on(data->last_found);
938                 data->last_end = data->pos_min + l;
939                 data->pos_min += l; /* As in the first entry. */
940                 data->flags &= ~SF_BEFORE_EOL;
941             }
942             if (flags & SCF_DO_STCLASS_AND) {
943                 /* Check whether it is compatible with what we know already! */
944                 int compat = 1;
945
946                 if (uc >= 0x100 ||
947                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
948                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
949                     && (!(data->start_class->flags & ANYOF_FOLD)
950                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951                     )
952                     compat = 0;
953                 ANYOF_CLASS_ZERO(data->start_class);
954                 ANYOF_BITMAP_ZERO(data->start_class);
955                 if (compat)
956                     ANYOF_BITMAP_SET(data->start_class, uc);
957                 data->start_class->flags &= ~ANYOF_EOS;
958                 if (uc < 0x100)
959                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960             }
961             else if (flags & SCF_DO_STCLASS_OR) {
962                 /* false positive possible if the class is case-folded */
963                 if (uc < 0x100)
964                     ANYOF_BITMAP_SET(data->start_class, uc);
965                 else
966                     data->start_class->flags |= ANYOF_UNICODE_ALL;
967                 data->start_class->flags &= ~ANYOF_EOS;
968                 cl_and(data->start_class, &and_with);
969             }
970             flags &= ~SCF_DO_STCLASS;
971         }
972         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
973             I32 l = STR_LEN(scan);
974             UV uc = *((U8*)STRING(scan));
975
976             /* Search for fixed substrings supports EXACT only. */
977             if (flags & SCF_DO_SUBSTR)
978                 scan_commit(pRExC_state, data);
979             if (UTF) {
980                 U8 *s = (U8 *)STRING(scan);
981                 l = utf8_length(s, s + l);
982                 uc = utf8_to_uvchr(s, NULL);
983             }
984             min += l;
985             if (data && (flags & SCF_DO_SUBSTR))
986                 data->pos_min += l;
987             if (flags & SCF_DO_STCLASS_AND) {
988                 /* Check whether it is compatible with what we know already! */
989                 int compat = 1;
990
991                 if (uc >= 0x100 ||
992                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
993                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
994                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995                     compat = 0;
996                 ANYOF_CLASS_ZERO(data->start_class);
997                 ANYOF_BITMAP_ZERO(data->start_class);
998                 if (compat) {
999                     ANYOF_BITMAP_SET(data->start_class, uc);
1000                     data->start_class->flags &= ~ANYOF_EOS;
1001                     data->start_class->flags |= ANYOF_FOLD;
1002                     if (OP(scan) == EXACTFL)
1003                         data->start_class->flags |= ANYOF_LOCALE;
1004                 }
1005             }
1006             else if (flags & SCF_DO_STCLASS_OR) {
1007                 if (data->start_class->flags & ANYOF_FOLD) {
1008                     /* false positive possible if the class is case-folded.
1009                        Assume that the locale settings are the same... */
1010                     if (uc < 0x100)
1011                         ANYOF_BITMAP_SET(data->start_class, uc);
1012                     data->start_class->flags &= ~ANYOF_EOS;
1013                 }
1014                 cl_and(data->start_class, &and_with);
1015             }
1016             flags &= ~SCF_DO_STCLASS;
1017         }
1018         else if (strchr((char*)PL_varies,OP(scan))) {
1019             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1020             I32 f = flags, pos_before = 0;
1021             regnode *oscan = scan;
1022             struct regnode_charclass_class this_class;
1023             struct regnode_charclass_class *oclass = NULL;
1024             I32 next_is_eval = 0;
1025
1026             switch (PL_regkind[(U8)OP(scan)]) {
1027             case WHILEM:                /* End of (?:...)* . */
1028                 scan = NEXTOPER(scan);
1029                 goto finish;
1030             case PLUS:
1031                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1032                     next = NEXTOPER(scan);
1033                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034                         mincount = 1;
1035                         maxcount = REG_INFTY;
1036                         next = regnext(scan);
1037                         scan = NEXTOPER(scan);
1038                         goto do_curly;
1039                     }
1040                 }
1041                 if (flags & SCF_DO_SUBSTR)
1042                     data->pos_min++;
1043                 min++;
1044                 /* Fall through. */
1045             case STAR:
1046                 if (flags & SCF_DO_STCLASS) {
1047                     mincount = 0;
1048                     maxcount = REG_INFTY;
1049                     next = regnext(scan);
1050                     scan = NEXTOPER(scan);
1051                     goto do_curly;
1052                 }
1053                 is_inf = is_inf_internal = 1;
1054                 scan = regnext(scan);
1055                 if (flags & SCF_DO_SUBSTR) {
1056                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1057                     data->longest = &(data->longest_float);
1058                 }
1059                 goto optimize_curly_tail;
1060             case CURLY:
1061                 mincount = ARG1(scan);
1062                 maxcount = ARG2(scan);
1063                 next = regnext(scan);
1064                 if (OP(scan) == CURLYX) {
1065                     I32 lp = (data ? *(data->last_closep) : 0);
1066
1067                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068                 }
1069                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1070                 next_is_eval = (OP(scan) == EVAL);
1071               do_curly:
1072                 if (flags & SCF_DO_SUBSTR) {
1073                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1074                     pos_before = data->pos_min;
1075                 }
1076                 if (data) {
1077                     fl = data->flags;
1078                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079                     if (is_inf)
1080                         data->flags |= SF_IS_INF;
1081                 }
1082                 if (flags & SCF_DO_STCLASS) {
1083                     cl_init(pRExC_state, &this_class);
1084                     oclass = data->start_class;
1085                     data->start_class = &this_class;
1086                     f |= SCF_DO_STCLASS_AND;
1087                     f &= ~SCF_DO_STCLASS_OR;
1088                 }
1089                 /* These are the cases when once a subexpression
1090                    fails at a particular position, it cannot succeed
1091                    even after backtracking at the enclosing scope.
1092                 
1093                    XXXX what if minimal match and we are at the
1094                         initial run of {n,m}? */
1095                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096                     f &= ~SCF_WHILEM_VISITED_POS;
1097
1098                 /* This will finish on WHILEM, setting scan, or on NULL: */
1099                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100                                       mincount == 0
1101                                         ? (f & ~SCF_DO_SUBSTR) : f);
1102
1103                 if (flags & SCF_DO_STCLASS)
1104                     data->start_class = oclass;
1105                 if (mincount == 0 || minnext == 0) {
1106                     if (flags & SCF_DO_STCLASS_OR) {
1107                         cl_or(pRExC_state, data->start_class, &this_class);
1108                     }
1109                     else if (flags & SCF_DO_STCLASS_AND) {
1110                         /* Switch to OR mode: cache the old value of
1111                          * data->start_class */
1112                         StructCopy(data->start_class, &and_with,
1113                                    struct regnode_charclass_class);
1114                         flags &= ~SCF_DO_STCLASS_AND;
1115                         StructCopy(&this_class, data->start_class,
1116                                    struct regnode_charclass_class);
1117                         flags |= SCF_DO_STCLASS_OR;
1118                         data->start_class->flags |= ANYOF_EOS;
1119                     }
1120                 } else {                /* Non-zero len */
1121                     if (flags & SCF_DO_STCLASS_OR) {
1122                         cl_or(pRExC_state, data->start_class, &this_class);
1123                         cl_and(data->start_class, &and_with);
1124                     }
1125                     else if (flags & SCF_DO_STCLASS_AND)
1126                         cl_and(data->start_class, &this_class);
1127                     flags &= ~SCF_DO_STCLASS;
1128                 }
1129                 if (!scan)              /* It was not CURLYX, but CURLY. */
1130                     scan = next;
1131                 if (ckWARN(WARN_REGEXP)
1132                        /* ? quantifier ok, except for (?{ ... }) */
1133                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1134                     && (minnext == 0) && (deltanext == 0)
1135                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1136                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137                 {
1138                     vWARN(RExC_parse,
1139                           "Quantifier unexpected on zero-length expression");
1140                 }
1141
1142                 min += minnext * mincount;
1143                 is_inf_internal |= ((maxcount == REG_INFTY
1144                                      && (minnext + deltanext) > 0)
1145                                     || deltanext == I32_MAX);
1146                 is_inf |= is_inf_internal;
1147                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148
1149                 /* Try powerful optimization CURLYX => CURLYN. */
1150                 if (  OP(oscan) == CURLYX && data
1151                       && data->flags & SF_IN_PAR
1152                       && !(data->flags & SF_HAS_EVAL)
1153                       && !deltanext && minnext == 1 ) {
1154                     /* Try to optimize to CURLYN.  */
1155                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1156                     regnode *nxt1 = nxt;
1157 #ifdef DEBUGGING
1158                     regnode *nxt2;
1159 #endif
1160
1161                     /* Skip open. */
1162                     nxt = regnext(nxt);
1163                     if (!strchr((char*)PL_simple,OP(nxt))
1164                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1165                              && STR_LEN(nxt) == 1))
1166                         goto nogo;
1167 #ifdef DEBUGGING
1168                     nxt2 = nxt;
1169 #endif
1170                     nxt = regnext(nxt);
1171                     if (OP(nxt) != CLOSE)
1172                         goto nogo;
1173                     /* Now we know that nxt2 is the only contents: */
1174                     oscan->flags = (U8)ARG(nxt);
1175                     OP(oscan) = CURLYN;
1176                     OP(nxt1) = NOTHING; /* was OPEN. */
1177 #ifdef DEBUGGING
1178                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1182                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1183                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 #endif
1185                 }
1186               nogo:
1187
1188                 /* Try optimization CURLYX => CURLYM. */
1189                 if (  OP(oscan) == CURLYX && data
1190                       && !(data->flags & SF_HAS_PAR)
1191                       && !(data->flags & SF_HAS_EVAL)
1192                       && !deltanext  ) {
1193                     /* XXXX How to optimize if data == 0? */
1194                     /* Optimize to a simpler form.  */
1195                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196                     regnode *nxt2;
1197
1198                     OP(oscan) = CURLYM;
1199                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1200                             && (OP(nxt2) != WHILEM))
1201                         nxt = nxt2;
1202                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1203                     /* Need to optimize away parenths. */
1204                     if (data->flags & SF_IN_PAR) {
1205                         /* Set the parenth number.  */
1206                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207
1208                         if (OP(nxt) != CLOSE)
1209                             FAIL("Panic opt close");
1210                         oscan->flags = (U8)ARG(nxt);
1211                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1212                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1213 #ifdef DEBUGGING
1214                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1216                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 #endif
1219 #if 0
1220                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221                             regnode *nnxt = regnext(nxt1);
1222                         
1223                             if (nnxt == nxt) {
1224                                 if (reg_off_by_arg[OP(nxt1)])
1225                                     ARG_SET(nxt1, nxt2 - nxt1);
1226                                 else if (nxt2 - nxt1 < U16_MAX)
1227                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1228                                 else
1229                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1230                             }
1231                             nxt1 = nnxt;
1232                         }
1233 #endif
1234                         /* Optimize again: */
1235                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1236                                     NULL, 0);
1237                     }
1238                     else
1239                         oscan->flags = 0;
1240                 }
1241                 else if ((OP(oscan) == CURLYX)
1242                          && (flags & SCF_WHILEM_VISITED_POS)
1243                          /* See the comment on a similar expression above.
1244                             However, this time it not a subexpression
1245                             we care about, but the expression itself. */
1246                          && (maxcount == REG_INFTY)
1247                          && data && ++data->whilem_c < 16) {
1248                     /* This stays as CURLYX, we can put the count/of pair. */
1249                     /* Find WHILEM (as in regexec.c) */
1250                     regnode *nxt = oscan + NEXT_OFF(oscan);
1251
1252                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253                         nxt += ARG(nxt);
1254                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1255                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1256                 }
1257                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1258                     pars++;
1259                 if (flags & SCF_DO_SUBSTR) {
1260                     SV *last_str = Nullsv;
1261                     int counted = mincount != 0;
1262
1263                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1264 #if defined(SPARC64_GCC_WORKAROUND)
1265                         I32 b = 0;
1266                         STRLEN l = 0;
1267                         char *s = NULL;
1268                         I32 old = 0;
1269
1270                         if (pos_before >= data->last_start_min)
1271                             b = pos_before;
1272                         else
1273                             b = data->last_start_min;
1274
1275                         l = 0;
1276                         s = SvPV(data->last_found, l);
1277                         old = b - data->last_start_min;
1278
1279 #else
1280                         I32 b = pos_before >= data->last_start_min
1281                             ? pos_before : data->last_start_min;
1282                         STRLEN l;
1283                         char *s = SvPV(data->last_found, l);
1284                         I32 old = b - data->last_start_min;
1285 #endif
1286
1287                         if (UTF)
1288                             old = utf8_hop((U8*)s, old) - (U8*)s;
1289                         
1290                         l -= old;
1291                         /* Get the added string: */
1292                         last_str = newSVpvn(s  + old, l);
1293                         if (deltanext == 0 && pos_before == b) {
1294                             /* What was added is a constant string */
1295                             if (mincount > 1) {
1296                                 SvGROW(last_str, (mincount * l) + 1);
1297                                 repeatcpy(SvPVX(last_str) + l,
1298                                           SvPVX(last_str), l, mincount - 1);
1299                                 SvCUR(last_str) *= mincount;
1300                                 /* Add additional parts. */
1301                                 SvCUR_set(data->last_found,
1302                                           SvCUR(data->last_found) - l);
1303                                 sv_catsv(data->last_found, last_str);
1304                                 data->last_end += l * (mincount - 1);
1305                             }
1306                         } else {
1307                             /* start offset must point into the last copy */
1308                             data->last_start_min += minnext * (mincount - 1);
1309                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1310                                 * (minnext + data->pos_delta);
1311                         }
1312                     }
1313                     /* It is counted once already... */
1314                     data->pos_min += minnext * (mincount - counted);
1315                     data->pos_delta += - counted * deltanext +
1316                         (minnext + deltanext) * maxcount - minnext * mincount;
1317                     if (mincount != maxcount) {
1318                          /* Cannot extend fixed substrings found inside
1319                             the group.  */
1320                         scan_commit(pRExC_state,data);
1321                         if (mincount && last_str) {
1322                             sv_setsv(data->last_found, last_str);
1323                             data->last_end = data->pos_min;
1324                             data->last_start_min =
1325                                 data->pos_min - CHR_SVLEN(last_str);
1326                             data->last_start_max = is_inf
1327                                 ? I32_MAX
1328                                 : data->pos_min + data->pos_delta
1329                                 - CHR_SVLEN(last_str);
1330                         }
1331                         data->longest = &(data->longest_float);
1332                     }
1333                     SvREFCNT_dec(last_str);
1334                 }
1335                 if (data && (fl & SF_HAS_EVAL))
1336                     data->flags |= SF_HAS_EVAL;
1337               optimize_curly_tail:
1338                 if (OP(oscan) != CURLYX) {
1339                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1340                            && NEXT_OFF(next))
1341                         NEXT_OFF(oscan) += NEXT_OFF(next);
1342                 }
1343                 continue;
1344             default:                    /* REF and CLUMP only? */
1345                 if (flags & SCF_DO_SUBSTR) {
1346                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1347                     data->longest = &(data->longest_float);
1348                 }
1349                 is_inf = is_inf_internal = 1;
1350                 if (flags & SCF_DO_STCLASS_OR)
1351                     cl_anything(pRExC_state, data->start_class);
1352                 flags &= ~SCF_DO_STCLASS;
1353                 break;
1354             }
1355         }
1356         else if (strchr((char*)PL_simple,OP(scan))) {
1357             int value = 0;
1358
1359             if (flags & SCF_DO_SUBSTR) {
1360                 scan_commit(pRExC_state,data);
1361                 data->pos_min++;
1362             }
1363             min++;
1364             if (flags & SCF_DO_STCLASS) {
1365                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1366
1367                 /* Some of the logic below assumes that switching
1368                    locale on will only add false positives. */
1369                 switch (PL_regkind[(U8)OP(scan)]) {
1370                 case SANY:
1371                 default:
1372                   do_default:
1373                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1375                         cl_anything(pRExC_state, data->start_class);
1376                     break;
1377                 case REG_ANY:
1378                     if (OP(scan) == SANY)
1379                         goto do_default;
1380                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382                                  || (data->start_class->flags & ANYOF_CLASS));
1383                         cl_anything(pRExC_state, data->start_class);
1384                     }
1385                     if (flags & SCF_DO_STCLASS_AND || !value)
1386                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387                     break;
1388                 case ANYOF:
1389                     if (flags & SCF_DO_STCLASS_AND)
1390                         cl_and(data->start_class,
1391                                (struct regnode_charclass_class*)scan);
1392                     else
1393                         cl_or(pRExC_state, data->start_class,
1394                               (struct regnode_charclass_class*)scan);
1395                     break;
1396                 case ALNUM:
1397                     if (flags & SCF_DO_STCLASS_AND) {
1398                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400                             for (value = 0; value < 256; value++)
1401                                 if (!isALNUM(value))
1402                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1403                         }
1404                     }
1405                     else {
1406                         if (data->start_class->flags & ANYOF_LOCALE)
1407                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1408                         else {
1409                             for (value = 0; value < 256; value++)
1410                                 if (isALNUM(value))
1411                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1412                         }
1413                     }
1414                     break;
1415                 case ALNUML:
1416                     if (flags & SCF_DO_STCLASS_AND) {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419                     }
1420                     else {
1421                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422                         data->start_class->flags |= ANYOF_LOCALE;
1423                     }
1424                     break;
1425                 case NALNUM:
1426                     if (flags & SCF_DO_STCLASS_AND) {
1427                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429                             for (value = 0; value < 256; value++)
1430                                 if (isALNUM(value))
1431                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1432                         }
1433                     }
1434                     else {
1435                         if (data->start_class->flags & ANYOF_LOCALE)
1436                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437                         else {
1438                             for (value = 0; value < 256; value++)
1439                                 if (!isALNUM(value))
1440                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1441                         }
1442                     }
1443                     break;
1444                 case NALNUML:
1445                     if (flags & SCF_DO_STCLASS_AND) {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448                     }
1449                     else {
1450                         data->start_class->flags |= ANYOF_LOCALE;
1451                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452                     }
1453                     break;
1454                 case SPACE:
1455                     if (flags & SCF_DO_STCLASS_AND) {
1456                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458                             for (value = 0; value < 256; value++)
1459                                 if (!isSPACE(value))
1460                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1461                         }
1462                     }
1463                     else {
1464                         if (data->start_class->flags & ANYOF_LOCALE)
1465                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466                         else {
1467                             for (value = 0; value < 256; value++)
1468                                 if (isSPACE(value))
1469                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1470                         }
1471                     }
1472                     break;
1473                 case SPACEL:
1474                     if (flags & SCF_DO_STCLASS_AND) {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477                     }
1478                     else {
1479                         data->start_class->flags |= ANYOF_LOCALE;
1480                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481                     }
1482                     break;
1483                 case NSPACE:
1484                     if (flags & SCF_DO_STCLASS_AND) {
1485                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487                             for (value = 0; value < 256; value++)
1488                                 if (isSPACE(value))
1489                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1490                         }
1491                     }
1492                     else {
1493                         if (data->start_class->flags & ANYOF_LOCALE)
1494                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1495                         else {
1496                             for (value = 0; value < 256; value++)
1497                                 if (!isSPACE(value))
1498                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1499                         }
1500                     }
1501                     break;
1502                 case NSPACEL:
1503                     if (flags & SCF_DO_STCLASS_AND) {
1504                         if (data->start_class->flags & ANYOF_LOCALE) {
1505                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506                             for (value = 0; value < 256; value++)
1507                                 if (!isSPACE(value))
1508                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1509                         }
1510                     }
1511                     else {
1512                         data->start_class->flags |= ANYOF_LOCALE;
1513                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514                     }
1515                     break;
1516                 case DIGIT:
1517                     if (flags & SCF_DO_STCLASS_AND) {
1518                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519                         for (value = 0; value < 256; value++)
1520                             if (!isDIGIT(value))
1521                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522                     }
1523                     else {
1524                         if (data->start_class->flags & ANYOF_LOCALE)
1525                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1526                         else {
1527                             for (value = 0; value < 256; value++)
1528                                 if (isDIGIT(value))
1529                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1530                         }
1531                     }
1532                     break;
1533                 case NDIGIT:
1534                     if (flags & SCF_DO_STCLASS_AND) {
1535                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536                         for (value = 0; value < 256; value++)
1537                             if (isDIGIT(value))
1538                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539                     }
1540                     else {
1541                         if (data->start_class->flags & ANYOF_LOCALE)
1542                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1543                         else {
1544                             for (value = 0; value < 256; value++)
1545                                 if (!isDIGIT(value))
1546                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1547                         }
1548                     }
1549                     break;
1550                 }
1551                 if (flags & SCF_DO_STCLASS_OR)
1552                     cl_and(data->start_class, &and_with);
1553                 flags &= ~SCF_DO_STCLASS;
1554             }
1555         }
1556         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1557             data->flags |= (OP(scan) == MEOL
1558                             ? SF_BEFORE_MEOL
1559                             : SF_BEFORE_SEOL);
1560         }
1561         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1562                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1563                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1564                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1565             /* Lookahead/lookbehind */
1566             I32 deltanext, minnext, fake = 0;
1567             regnode *nscan;
1568             struct regnode_charclass_class intrnl;
1569             int f = 0;
1570
1571             data_fake.flags = 0;
1572             if (data) {         
1573                 data_fake.whilem_c = data->whilem_c;
1574                 data_fake.last_closep = data->last_closep;
1575             }
1576             else
1577                 data_fake.last_closep = &fake;
1578             if ( flags & SCF_DO_STCLASS && !scan->flags
1579                  && OP(scan) == IFMATCH ) { /* Lookahead */
1580                 cl_init(pRExC_state, &intrnl);
1581                 data_fake.start_class = &intrnl;
1582                 f |= SCF_DO_STCLASS_AND;
1583             }
1584             if (flags & SCF_WHILEM_VISITED_POS)
1585                 f |= SCF_WHILEM_VISITED_POS;
1586             next = regnext(scan);
1587             nscan = NEXTOPER(NEXTOPER(scan));
1588             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589             if (scan->flags) {
1590                 if (deltanext) {
1591                     vFAIL("Variable length lookbehind not implemented");
1592                 }
1593                 else if (minnext > U8_MAX) {
1594                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1595                 }
1596                 scan->flags = (U8)minnext;
1597             }
1598             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1599                 pars++;
1600             if (data && (data_fake.flags & SF_HAS_EVAL))
1601                 data->flags |= SF_HAS_EVAL;
1602             if (data)
1603                 data->whilem_c = data_fake.whilem_c;
1604             if (f & SCF_DO_STCLASS_AND) {
1605                 int was = (data->start_class->flags & ANYOF_EOS);
1606
1607                 cl_and(data->start_class, &intrnl);
1608                 if (was)
1609                     data->start_class->flags |= ANYOF_EOS;
1610             }
1611         }
1612         else if (OP(scan) == OPEN) {
1613             pars++;
1614         }
1615         else if (OP(scan) == CLOSE) {
1616             if ((I32)ARG(scan) == is_par) {
1617                 next = regnext(scan);
1618
1619                 if ( next && (OP(next) != WHILEM) && next < last)
1620                     is_par = 0;         /* Disable optimization */
1621             }
1622             if (data)
1623                 *(data->last_closep) = ARG(scan);
1624         }
1625         else if (OP(scan) == EVAL) {
1626                 if (data)
1627                     data->flags |= SF_HAS_EVAL;
1628         }
1629         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1630                 if (flags & SCF_DO_SUBSTR) {
1631                     scan_commit(pRExC_state,data);
1632                     data->longest = &(data->longest_float);
1633                 }
1634                 is_inf = is_inf_internal = 1;
1635                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1636                     cl_anything(pRExC_state, data->start_class);
1637                 flags &= ~SCF_DO_STCLASS;
1638         }
1639         /* Else: zero-length, ignore. */
1640         scan = regnext(scan);
1641     }
1642
1643   finish:
1644     *scanp = scan;
1645     *deltap = is_inf_internal ? I32_MAX : delta;
1646     if (flags & SCF_DO_SUBSTR && is_inf)
1647         data->pos_delta = I32_MAX - data->pos_min;
1648     if (is_par > U8_MAX)
1649         is_par = 0;
1650     if (is_par && pars==1 && data) {
1651         data->flags |= SF_IN_PAR;
1652         data->flags &= ~SF_HAS_PAR;
1653     }
1654     else if (pars && data) {
1655         data->flags |= SF_HAS_PAR;
1656         data->flags &= ~SF_IN_PAR;
1657     }
1658     if (flags & SCF_DO_STCLASS_OR)
1659         cl_and(data->start_class, &and_with);
1660     return min;
1661 }
1662
1663 STATIC I32
1664 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1665 {
1666     if (RExC_rx->data) {
1667         Renewc(RExC_rx->data,
1668                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1669                char, struct reg_data);
1670         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671         RExC_rx->data->count += n;
1672     }
1673     else {
1674         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1675              char, struct reg_data);
1676         New(1208, RExC_rx->data->what, n, U8);
1677         RExC_rx->data->count = n;
1678     }
1679     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680     return RExC_rx->data->count - n;
1681 }
1682
1683 void
1684 Perl_reginitcolors(pTHX)
1685 {
1686     int i = 0;
1687     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688         
1689     if (s) {
1690         PL_colors[0] = s = savepv(s);
1691         while (++i < 6) {
1692             s = strchr(s, '\t');
1693             if (s) {
1694                 *s = '\0';
1695                 PL_colors[i] = ++s;
1696             }
1697             else
1698                 PL_colors[i] = s = "";
1699         }
1700     } else {
1701         while (i < 6)
1702             PL_colors[i++] = "";
1703     }
1704     PL_colorset = 1;
1705 }
1706
1707
1708 /*
1709  - pregcomp - compile a regular expression into internal code
1710  *
1711  * We can't allocate space until we know how big the compiled form will be,
1712  * but we can't compile it (and thus know how big it is) until we've got a
1713  * place to put the code.  So we cheat:  we compile it twice, once with code
1714  * generation turned off and size counting turned on, and once "for real".
1715  * This also means that we don't allocate space until we are sure that the
1716  * thing really will compile successfully, and we never have to move the
1717  * code and thus invalidate pointers into it.  (Note that it has to be in
1718  * one piece because free() must be able to free it all.) [NB: not true in perl]
1719  *
1720  * Beware that the optimization-preparation code in here knows about some
1721  * of the structure of the compiled regexp.  [I'll say.]
1722  */
1723 regexp *
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1725 {
1726     register regexp *r;
1727     regnode *scan;
1728     regnode *first;
1729     I32 flags;
1730     I32 minlen = 0;
1731     I32 sawplus = 0;
1732     I32 sawopen = 0;
1733     scan_data_t data;
1734     RExC_state_t RExC_state;
1735     RExC_state_t *pRExC_state = &RExC_state;
1736
1737     if (exp == NULL)
1738         FAIL("NULL regexp argument");
1739
1740     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1741
1742     RExC_precomp = exp;
1743     DEBUG_r({
1744          if (!PL_colorset) reginitcolors();
1745          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746                        PL_colors[4],PL_colors[5],PL_colors[0],
1747                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748     });
1749     RExC_flags = pm->op_pmflags;
1750     RExC_sawback = 0;
1751
1752     RExC_seen = 0;
1753     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754     RExC_seen_evals = 0;
1755     RExC_extralen = 0;
1756
1757     /* First pass: determine size, legality. */
1758     RExC_parse = exp;
1759     RExC_start = exp;
1760     RExC_end = xend;
1761     RExC_naughty = 0;
1762     RExC_npar = 1;
1763     RExC_size = 0L;
1764     RExC_emit = &PL_regdummy;
1765     RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767        * Clever compilers notice this and complain. --jhi */
1768     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1769 #endif
1770     if (reg(pRExC_state, 0, &flags) == NULL) {
1771         RExC_precomp = Nullch;
1772         return(NULL);
1773     }
1774     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1775
1776     /* Small enough for pointer-storage convention?
1777        If extralen==0, this means that we will not need long jumps. */
1778     if (RExC_size >= 0x10000L && RExC_extralen)
1779         RExC_size += RExC_extralen;
1780     else
1781         RExC_extralen = 0;
1782     if (RExC_whilem_seen > 15)
1783         RExC_whilem_seen = 15;
1784
1785     /* Allocate space and initialize. */
1786     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787          char, regexp);
1788     if (r == NULL)
1789         FAIL("Regexp out of space");
1790
1791 #ifdef DEBUGGING
1792     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 #endif
1795     r->refcnt = 1;
1796     r->prelen = xend - exp;
1797     r->precomp = savepvn(RExC_precomp, r->prelen);
1798     r->subbeg = NULL;
1799     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1801
1802     r->substrs = 0;                     /* Useful during FAIL. */
1803     r->startp = 0;                      /* Useful during FAIL. */
1804     r->endp = 0;                        /* Useful during FAIL. */
1805
1806     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1807     if (r->offsets) {
1808       r->offsets[0] = RExC_size; 
1809     }
1810     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1811                           "%s %"UVuf" bytes for offset annotations.\n", 
1812                           r->offsets ? "Got" : "Couldn't get", 
1813                           (UV)((2*RExC_size+1) * sizeof(U32))));
1814
1815     RExC_rx = r;
1816
1817     /* Second pass: emit code. */
1818     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1819     RExC_parse = exp;
1820     RExC_end = xend;
1821     RExC_naughty = 0;
1822     RExC_npar = 1;
1823     RExC_emit_start = r->program;
1824     RExC_emit = r->program;
1825     /* Store the count of eval-groups for security checks: */
1826     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1827     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1828     r->data = 0;
1829     if (reg(pRExC_state, 0, &flags) == NULL)
1830         return(NULL);
1831
1832     /* Dig out information for optimizations. */
1833     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1834     pm->op_pmflags = RExC_flags;
1835     if (UTF)
1836         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1837     r->regstclass = NULL;
1838     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1839         r->reganch |= ROPT_NAUGHTY;
1840     scan = r->program + 1;              /* First BRANCH. */
1841
1842     /* XXXX To minimize changes to RE engine we always allocate
1843        3-units-long substrs field. */
1844     Newz(1004, r->substrs, 1, struct reg_substr_data);
1845
1846     StructCopy(&zero_scan_data, &data, scan_data_t);
1847     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1848     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1849         I32 fake;
1850         STRLEN longest_float_length, longest_fixed_length;
1851         struct regnode_charclass_class ch_class;
1852         int stclass_flag;
1853         I32 last_close = 0;
1854
1855         first = scan;
1856         /* Skip introductions and multiplicators >= 1. */
1857         while ((OP(first) == OPEN && (sawopen = 1)) ||
1858                /* An OR of *one* alternative - should not happen now. */
1859             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860             (OP(first) == PLUS) ||
1861             (OP(first) == MINMOD) ||
1862                /* An {n,m} with n>0 */
1863             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1864                 if (OP(first) == PLUS)
1865                     sawplus = 1;
1866                 else
1867                     first += regarglen[(U8)OP(first)];
1868                 first = NEXTOPER(first);
1869         }
1870
1871         /* Starting-point info. */
1872       again:
1873         if (PL_regkind[(U8)OP(first)] == EXACT) {
1874             if (OP(first) == EXACT)
1875                 ;       /* Empty, get anchored substr later. */
1876             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1877                 r->regstclass = first;
1878         }
1879         else if (strchr((char*)PL_simple,OP(first)))
1880             r->regstclass = first;
1881         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882                  PL_regkind[(U8)OP(first)] == NBOUND)
1883             r->regstclass = first;
1884         else if (PL_regkind[(U8)OP(first)] == BOL) {
1885             r->reganch |= (OP(first) == MBOL
1886                            ? ROPT_ANCH_MBOL
1887                            : (OP(first) == SBOL
1888                               ? ROPT_ANCH_SBOL
1889                               : ROPT_ANCH_BOL));
1890             first = NEXTOPER(first);
1891             goto again;
1892         }
1893         else if (OP(first) == GPOS) {
1894             r->reganch |= ROPT_ANCH_GPOS;
1895             first = NEXTOPER(first);
1896             goto again;
1897         }
1898         else if (!sawopen && (OP(first) == STAR &&
1899             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1900             !(r->reganch & ROPT_ANCH) )
1901         {
1902             /* turn .* into ^.* with an implied $*=1 */
1903             int type = OP(NEXTOPER(first));
1904
1905             if (type == REG_ANY)
1906                 type = ROPT_ANCH_MBOL;
1907             else
1908                 type = ROPT_ANCH_SBOL;
1909
1910             r->reganch |= type | ROPT_IMPLICIT;
1911             first = NEXTOPER(first);
1912             goto again;
1913         }
1914         if (sawplus && (!sawopen || !RExC_sawback)
1915             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1916             /* x+ must match at the 1st pos of run of x's */
1917             r->reganch |= ROPT_SKIP;
1918
1919         /* Scan is after the zeroth branch, first is atomic matcher. */
1920         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1921                               (IV)(first - scan + 1)));
1922         /*
1923         * If there's something expensive in the r.e., find the
1924         * longest literal string that must appear and make it the
1925         * regmust.  Resolve ties in favor of later strings, since
1926         * the regstart check works with the beginning of the r.e.
1927         * and avoiding duplication strengthens checking.  Not a
1928         * strong reason, but sufficient in the absence of others.
1929         * [Now we resolve ties in favor of the earlier string if
1930         * it happens that c_offset_min has been invalidated, since the
1931         * earlier string may buy us something the later one won't.]
1932         */
1933         minlen = 0;
1934
1935         data.longest_fixed = newSVpvn("",0);
1936         data.longest_float = newSVpvn("",0);
1937         data.last_found = newSVpvn("",0);
1938         data.longest = &(data.longest_fixed);
1939         first = scan;
1940         if (!r->regstclass) {
1941             cl_init(pRExC_state, &ch_class);
1942             data.start_class = &ch_class;
1943             stclass_flag = SCF_DO_STCLASS_AND;
1944         } else                          /* XXXX Check for BOUND? */
1945             stclass_flag = 0;
1946         data.last_closep = &last_close;
1947
1948         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1949                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1950         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1951              && data.last_start_min == 0 && data.last_end > 0
1952              && !RExC_seen_zerolen
1953              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1954             r->reganch |= ROPT_CHECK_ALL;
1955         scan_commit(pRExC_state, &data);
1956         SvREFCNT_dec(data.last_found);
1957
1958         longest_float_length = CHR_SVLEN(data.longest_float);
1959         if (longest_float_length
1960             || (data.flags & SF_FL_BEFORE_EOL
1961                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1962                     || (RExC_flags & PMf_MULTILINE)))) {
1963             int t;
1964
1965             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1966                 && data.offset_fixed == data.offset_float_min
1967                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968                     goto remove_float;          /* As in (a)+. */
1969
1970             if (SvUTF8(data.longest_float)) {
1971                 r->float_utf8 = data.longest_float;
1972                 r->float_substr = Nullsv;
1973             } else {
1974                 r->float_substr = data.longest_float;
1975                 r->float_utf8 = Nullsv;
1976             }
1977             r->float_min_offset = data.offset_float_min;
1978             r->float_max_offset = data.offset_float_max;
1979             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1981                            || (RExC_flags & PMf_MULTILINE)));
1982             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1983         }
1984         else {
1985           remove_float:
1986             r->float_substr = r->float_utf8 = Nullsv;
1987             SvREFCNT_dec(data.longest_float);
1988             longest_float_length = 0;
1989         }
1990
1991         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1992         if (longest_fixed_length
1993             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1995                     || (RExC_flags & PMf_MULTILINE)))) {
1996             int t;
1997
1998             if (SvUTF8(data.longest_fixed)) {
1999                 r->anchored_utf8 = data.longest_fixed;
2000                 r->anchored_substr = Nullsv;
2001             } else {
2002                 r->anchored_substr = data.longest_fixed;
2003                 r->anchored_utf8 = Nullsv;
2004             }
2005             r->anchored_offset = data.offset_fixed;
2006             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2008                      || (RExC_flags & PMf_MULTILINE)));
2009             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010         }
2011         else {
2012             r->anchored_substr = r->anchored_utf8 = Nullsv;
2013             SvREFCNT_dec(data.longest_fixed);
2014             longest_fixed_length = 0;
2015         }
2016         if (r->regstclass
2017             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2018             r->regstclass = NULL;
2019         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020             && stclass_flag
2021             && !(data.start_class->flags & ANYOF_EOS)
2022             && !cl_is_anything(data.start_class))
2023         {
2024             I32 n = add_data(pRExC_state, 1, "f");
2025
2026             New(1006, RExC_rx->data->data[n], 1,
2027                 struct regnode_charclass_class);
2028             StructCopy(data.start_class,
2029                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2030                        struct regnode_charclass_class);
2031             r->regstclass = (regnode*)RExC_rx->data->data[n];
2032             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2033             PL_regdata = r->data; /* for regprop() */
2034             DEBUG_r({ SV *sv = sv_newmortal();
2035                       regprop(sv, (regnode*)data.start_class);
2036                       PerlIO_printf(Perl_debug_log,
2037                                     "synthetic stclass `%s'.\n",
2038                                     SvPVX(sv));});
2039         }
2040
2041         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2042         if (longest_fixed_length > longest_float_length) {
2043             r->check_substr = r->anchored_substr;
2044             r->check_utf8 = r->anchored_utf8;
2045             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2046             if (r->reganch & ROPT_ANCH_SINGLE)
2047                 r->reganch |= ROPT_NOSCAN;
2048         }
2049         else {
2050             r->check_substr = r->float_substr;
2051             r->check_utf8 = r->float_utf8;
2052             r->check_offset_min = data.offset_float_min;
2053             r->check_offset_max = data.offset_float_max;
2054         }
2055         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2056            This should be changed ASAP!  */
2057         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2058             r->reganch |= RE_USE_INTUIT;
2059             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2060                 r->reganch |= RE_INTUIT_TAIL;
2061         }
2062     }
2063     else {
2064         /* Several toplevels. Best we can is to set minlen. */
2065         I32 fake;
2066         struct regnode_charclass_class ch_class;
2067         I32 last_close = 0;
2068         
2069         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2070         scan = r->program + 1;
2071         cl_init(pRExC_state, &ch_class);
2072         data.start_class = &ch_class;
2073         data.last_closep = &last_close;
2074         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2075         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2076                 = r->float_substr = r->float_utf8 = Nullsv;
2077         if (!(data.start_class->flags & ANYOF_EOS)
2078             && !cl_is_anything(data.start_class))
2079         {
2080             I32 n = add_data(pRExC_state, 1, "f");
2081
2082             New(1006, RExC_rx->data->data[n], 1,
2083                 struct regnode_charclass_class);
2084             StructCopy(data.start_class,
2085                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2086                        struct regnode_charclass_class);
2087             r->regstclass = (regnode*)RExC_rx->data->data[n];
2088             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2089             DEBUG_r({ SV* sv = sv_newmortal();
2090                       regprop(sv, (regnode*)data.start_class);
2091                       PerlIO_printf(Perl_debug_log,
2092                                     "synthetic stclass `%s'.\n",
2093                                     SvPVX(sv));});
2094         }
2095     }
2096
2097     r->minlen = minlen;
2098     if (RExC_seen & REG_SEEN_GPOS)
2099         r->reganch |= ROPT_GPOS_SEEN;
2100     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2101         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2102     if (RExC_seen & REG_SEEN_EVAL)
2103         r->reganch |= ROPT_EVAL_SEEN;
2104     if (RExC_seen & REG_SEEN_CANY)
2105         r->reganch |= ROPT_CANY_SEEN;
2106     Newz(1002, r->startp, RExC_npar, I32);
2107     Newz(1002, r->endp, RExC_npar, I32);
2108     PL_regdata = r->data; /* for regprop() */
2109     DEBUG_r(regdump(r));
2110     return(r);
2111 }
2112
2113 /*
2114  - reg - regular expression, i.e. main body or parenthesized thing
2115  *
2116  * Caller must absorb opening parenthesis.
2117  *
2118  * Combining parenthesis handling with the base level of regular expression
2119  * is a trifle forced, but the need to tie the tails of the branches to what
2120  * follows makes it hard to avoid.
2121  */
2122 STATIC regnode *
2123 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2124     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2125 {
2126     register regnode *ret;              /* Will be the head of the group. */
2127     register regnode *br;
2128     register regnode *lastbr;
2129     register regnode *ender = 0;
2130     register I32 parno = 0;
2131     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2132
2133     /* for (?g), (?gc), and (?o) warnings; warning
2134        about (?c) will warn about (?g) -- japhy    */
2135
2136     I32 wastedflags = 0x00,
2137         wasted_o    = 0x01,
2138         wasted_g    = 0x02,
2139         wasted_gc   = 0x02 | 0x04,
2140         wasted_c    = 0x04;
2141
2142     char * parse_start = RExC_parse; /* MJD */
2143     char *oregcomp_parse = RExC_parse;
2144     char c;
2145
2146     *flagp = 0;                         /* Tentatively. */
2147
2148
2149     /* Make an OPEN node, if parenthesized. */
2150     if (paren) {
2151         if (*RExC_parse == '?') { /* (?...) */
2152             U32 posflags = 0, negflags = 0;
2153             U32 *flagsp = &posflags;
2154             int logical = 0;
2155             char *seqstart = RExC_parse;
2156
2157             RExC_parse++;
2158             paren = *RExC_parse++;
2159             ret = NULL;                 /* For look-ahead/behind. */
2160             switch (paren) {
2161             case '<':           /* (?<...) */
2162                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2163                 if (*RExC_parse == '!')
2164                     paren = ',';
2165                 if (*RExC_parse != '=' && *RExC_parse != '!')
2166                     goto unknown;
2167                 RExC_parse++;
2168             case '=':           /* (?=...) */
2169             case '!':           /* (?!...) */
2170                 RExC_seen_zerolen++;
2171             case ':':           /* (?:...) */
2172             case '>':           /* (?>...) */
2173                 break;
2174             case '$':           /* (?$...) */
2175             case '@':           /* (?@...) */
2176                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2177                 break;
2178             case '#':           /* (?#...) */
2179                 while (*RExC_parse && *RExC_parse != ')')
2180                     RExC_parse++;
2181                 if (*RExC_parse != ')')
2182                     FAIL("Sequence (?#... not terminated");
2183                 nextchar(pRExC_state);
2184                 *flagp = TRYAGAIN;
2185                 return NULL;
2186             case 'p':           /* (?p...) */
2187                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2188                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2189                 /* FALL THROUGH*/
2190             case '?':           /* (??...) */
2191                 logical = 1;
2192                 if (*RExC_parse != '{')
2193                     goto unknown;
2194                 paren = *RExC_parse++;
2195                 /* FALL THROUGH */
2196             case '{':           /* (?{...}) */
2197             {
2198                 I32 count = 1, n = 0;
2199                 char c;
2200                 char *s = RExC_parse;
2201                 SV *sv;
2202                 OP_4tree *sop, *rop;
2203
2204                 RExC_seen_zerolen++;
2205                 RExC_seen |= REG_SEEN_EVAL;
2206                 while (count && (c = *RExC_parse)) {
2207                     if (c == '\\' && RExC_parse[1])
2208                         RExC_parse++;
2209                     else if (c == '{')
2210                         count++;
2211                     else if (c == '}')
2212                         count--;
2213                     RExC_parse++;
2214                 }
2215                 if (*RExC_parse != ')')
2216                 {
2217                     RExC_parse = s;             
2218                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219                 }
2220                 if (!SIZE_ONLY) {
2221                     AV *av;
2222                 
2223                     if (RExC_parse - 1 - s)
2224                         sv = newSVpvn(s, RExC_parse - 1 - s);
2225                     else
2226                         sv = newSVpvn("", 0);
2227
2228                     ENTER;
2229                     Perl_save_re_context(aTHX);
2230                     rop = sv_compile_2op(sv, &sop, "re", &av);
2231                     sop->op_private |= OPpREFCOUNTED;
2232                     /* re_dup will OpREFCNT_inc */
2233                     OpREFCNT_set(sop, 1);
2234                     LEAVE;
2235
2236                     n = add_data(pRExC_state, 3, "nop");
2237                     RExC_rx->data->data[n] = (void*)rop;
2238                     RExC_rx->data->data[n+1] = (void*)sop;
2239                     RExC_rx->data->data[n+2] = (void*)av;
2240                     SvREFCNT_dec(sv);
2241                 }
2242                 else {                                          /* First pass */
2243                     if (PL_reginterp_cnt < ++RExC_seen_evals
2244                         && PL_curcop != &PL_compiling)
2245                         /* No compiled RE interpolated, has runtime
2246                            components ===> unsafe.  */
2247                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2248                     if (PL_tainting && PL_tainted)
2249                         FAIL("Eval-group in insecure regular expression");
2250                 }
2251                 
2252                 nextchar(pRExC_state);
2253                 if (logical) {
2254                     ret = reg_node(pRExC_state, LOGICAL);
2255                     if (!SIZE_ONLY)
2256                         ret->flags = 2;
2257                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2258                     /* deal with the length of this later - MJD */
2259                     return ret;
2260                 }
2261                 return reganode(pRExC_state, EVAL, n);
2262             }
2263             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2264             {
2265                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2266                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2267                         || RExC_parse[1] == '<'
2268                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2269                         I32 flag;
2270                         
2271                         ret = reg_node(pRExC_state, LOGICAL);
2272                         if (!SIZE_ONLY)
2273                             ret->flags = 1;
2274                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2275                         goto insert_if;
2276                     }
2277                 }
2278                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2279                     /* (?(1)...) */
2280                     parno = atoi(RExC_parse++);
2281
2282                     while (isDIGIT(*RExC_parse))
2283                         RExC_parse++;
2284                     ret = reganode(pRExC_state, GROUPP, parno);
2285                     
2286                     if ((c = *nextchar(pRExC_state)) != ')')
2287                         vFAIL("Switch condition not recognized");
2288                   insert_if:
2289                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2290                     br = regbranch(pRExC_state, &flags, 1);
2291                     if (br == NULL)
2292                         br = reganode(pRExC_state, LONGJMP, 0);
2293                     else
2294                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2295                     c = *nextchar(pRExC_state);
2296                     if (flags&HASWIDTH)
2297                         *flagp |= HASWIDTH;
2298                     if (c == '|') {
2299                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2300                         regbranch(pRExC_state, &flags, 1);
2301                         regtail(pRExC_state, ret, lastbr);
2302                         if (flags&HASWIDTH)
2303                             *flagp |= HASWIDTH;
2304                         c = *nextchar(pRExC_state);
2305                     }
2306                     else
2307                         lastbr = NULL;
2308                     if (c != ')')
2309                         vFAIL("Switch (?(condition)... contains too many branches");
2310                     ender = reg_node(pRExC_state, TAIL);
2311                     regtail(pRExC_state, br, ender);
2312                     if (lastbr) {
2313                         regtail(pRExC_state, lastbr, ender);
2314                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2315                     }
2316                     else
2317                         regtail(pRExC_state, ret, ender);
2318                     return ret;
2319                 }
2320                 else {
2321                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2322                 }
2323             }
2324             case 0:
2325                 RExC_parse--; /* for vFAIL to print correctly */
2326                 vFAIL("Sequence (? incomplete");
2327                 break;
2328             default:
2329                 --RExC_parse;
2330               parse_flags:      /* (?i) */
2331                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2332                     /* (?g), (?gc) and (?o) are useless here
2333                        and must be globally applied -- japhy */
2334
2335                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2336                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2337                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2338                             if (! (wastedflags & wflagbit) ) {
2339                                 wastedflags |= wflagbit;
2340                                 vWARN5(
2341                                     RExC_parse + 1,
2342                                     "Useless (%s%c) - %suse /%c modifier",
2343                                     flagsp == &negflags ? "?-" : "?",
2344                                     *RExC_parse,
2345                                     flagsp == &negflags ? "don't " : "",
2346                                     *RExC_parse
2347                                 );
2348                             }
2349                         }
2350                     }
2351                     else if (*RExC_parse == 'c') {
2352                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2353                             if (! (wastedflags & wasted_c) ) {
2354                                 wastedflags |= wasted_gc;
2355                                 vWARN3(
2356                                     RExC_parse + 1,
2357                                     "Useless (%sc) - %suse /gc modifier",
2358                                     flagsp == &negflags ? "?-" : "?",
2359                                     flagsp == &negflags ? "don't " : ""
2360                                 );
2361                             }
2362                         }
2363                     }
2364                     else { pmflag(flagsp, *RExC_parse); }
2365
2366                     ++RExC_parse;
2367                 }
2368                 if (*RExC_parse == '-') {
2369                     flagsp = &negflags;
2370                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2371                     ++RExC_parse;
2372                     goto parse_flags;
2373                 }
2374                 RExC_flags |= posflags;
2375                 RExC_flags &= ~negflags;
2376                 if (*RExC_parse == ':') {
2377                     RExC_parse++;
2378                     paren = ':';
2379                     break;
2380                 }               
2381               unknown:
2382                 if (*RExC_parse != ')') {
2383                     RExC_parse++;
2384                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2385                 }
2386                 nextchar(pRExC_state);
2387                 *flagp = TRYAGAIN;
2388                 return NULL;
2389             }
2390         }
2391         else {                  /* (...) */
2392             parno = RExC_npar;
2393             RExC_npar++;
2394             ret = reganode(pRExC_state, OPEN, parno);
2395             Set_Node_Length(ret, 1); /* MJD */
2396             Set_Node_Offset(ret, RExC_parse); /* MJD */
2397             open = 1;
2398         }
2399     }
2400     else                        /* ! paren */
2401         ret = NULL;
2402
2403     /* Pick up the branches, linking them together. */
2404     parse_start = RExC_parse;   /* MJD */
2405     br = regbranch(pRExC_state, &flags, 1);
2406     /*     branch_len = (paren != 0); */
2407     
2408     if (br == NULL)
2409         return(NULL);
2410     if (*RExC_parse == '|') {
2411         if (!SIZE_ONLY && RExC_extralen) {
2412             reginsert(pRExC_state, BRANCHJ, br);
2413         }
2414         else {                  /* MJD */
2415             reginsert(pRExC_state, BRANCH, br);
2416             Set_Node_Length(br, paren != 0);
2417             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2418         }
2419         have_branch = 1;
2420         if (SIZE_ONLY)
2421             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2422     }
2423     else if (paren == ':') {
2424         *flagp |= flags&SIMPLE;
2425     }
2426     if (open) {                         /* Starts with OPEN. */
2427         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2428     }
2429     else if (paren != '?')              /* Not Conditional */
2430         ret = br;
2431     *flagp |= flags & (SPSTART | HASWIDTH);
2432     lastbr = br;
2433     while (*RExC_parse == '|') {
2434         if (!SIZE_ONLY && RExC_extralen) {
2435             ender = reganode(pRExC_state, LONGJMP,0);
2436             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2437         }
2438         if (SIZE_ONLY)
2439             RExC_extralen += 2;         /* Account for LONGJMP. */
2440         nextchar(pRExC_state);
2441         br = regbranch(pRExC_state, &flags, 0);
2442         
2443         if (br == NULL)
2444             return(NULL);
2445         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2446         lastbr = br;
2447         if (flags&HASWIDTH)
2448             *flagp |= HASWIDTH;
2449         *flagp |= flags&SPSTART;
2450     }
2451
2452     if (have_branch || paren != ':') {
2453         /* Make a closing node, and hook it on the end. */
2454         switch (paren) {
2455         case ':':
2456             ender = reg_node(pRExC_state, TAIL);
2457             break;
2458         case 1:
2459             ender = reganode(pRExC_state, CLOSE, parno);
2460             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2461             Set_Node_Length(ender,1); /* MJD */
2462             break;
2463         case '<':
2464         case ',':
2465         case '=':
2466         case '!':
2467             *flagp &= ~HASWIDTH;
2468             /* FALL THROUGH */
2469         case '>':
2470             ender = reg_node(pRExC_state, SUCCEED);
2471             break;
2472         case 0:
2473             ender = reg_node(pRExC_state, END);
2474             break;
2475         }
2476         regtail(pRExC_state, lastbr, ender);
2477
2478         if (have_branch) {
2479             /* Hook the tails of the branches to the closing node. */
2480             for (br = ret; br != NULL; br = regnext(br)) {
2481                 regoptail(pRExC_state, br, ender);
2482             }
2483         }
2484     }
2485
2486     {
2487         char *p;
2488         static char parens[] = "=!<,>";
2489
2490         if (paren && (p = strchr(parens, paren))) {
2491             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2492             int flag = (p - parens) > 1;
2493
2494             if (paren == '>')
2495                 node = SUSPEND, flag = 0;
2496             reginsert(pRExC_state, node,ret);
2497             ret->flags = flag;
2498             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2499         }
2500     }
2501
2502     /* Check for proper termination. */
2503     if (paren) {
2504         RExC_flags = oregflags;
2505         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2506             RExC_parse = oregcomp_parse;
2507             vFAIL("Unmatched (");
2508         }
2509     }
2510     else if (!paren && RExC_parse < RExC_end) {
2511         if (*RExC_parse == ')') {
2512             RExC_parse++;
2513             vFAIL("Unmatched )");
2514         }
2515         else
2516             FAIL("Junk on end of regexp");      /* "Can't happen". */
2517         /* NOTREACHED */
2518     }
2519
2520     return(ret);
2521 }
2522
2523 /*
2524  - regbranch - one alternative of an | operator
2525  *
2526  * Implements the concatenation operator.
2527  */
2528 STATIC regnode *
2529 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2530 {
2531     register regnode *ret;
2532     register regnode *chain = NULL;
2533     register regnode *latest;
2534     I32 flags = 0, c = 0;
2535
2536     if (first)
2537         ret = NULL;
2538     else {
2539         if (!SIZE_ONLY && RExC_extralen)
2540             ret = reganode(pRExC_state, BRANCHJ,0);
2541         else {
2542             ret = reg_node(pRExC_state, BRANCH);
2543             Set_Node_Length(ret, 1);
2544         }
2545     }
2546         
2547     if (!first && SIZE_ONLY)
2548         RExC_extralen += 1;                     /* BRANCHJ */
2549
2550     *flagp = WORST;                     /* Tentatively. */
2551
2552     RExC_parse--;
2553     nextchar(pRExC_state);
2554     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2555         flags &= ~TRYAGAIN;
2556         latest = regpiece(pRExC_state, &flags);
2557         if (latest == NULL) {
2558             if (flags & TRYAGAIN)
2559                 continue;
2560             return(NULL);
2561         }
2562         else if (ret == NULL)
2563             ret = latest;
2564         *flagp |= flags&HASWIDTH;
2565         if (chain == NULL)      /* First piece. */
2566             *flagp |= flags&SPSTART;
2567         else {
2568             RExC_naughty++;
2569             regtail(pRExC_state, chain, latest);
2570         }
2571         chain = latest;
2572         c++;
2573     }
2574     if (chain == NULL) {        /* Loop ran zero times. */
2575         chain = reg_node(pRExC_state, NOTHING);
2576         if (ret == NULL)
2577             ret = chain;
2578     }
2579     if (c == 1) {
2580         *flagp |= flags&SIMPLE;
2581     }
2582
2583     return(ret);
2584 }
2585
2586 /*
2587  - regpiece - something followed by possible [*+?]
2588  *
2589  * Note that the branching code sequences used for ? and the general cases
2590  * of * and + are somewhat optimized:  they use the same NOTHING node as
2591  * both the endmarker for their branch list and the body of the last branch.
2592  * It might seem that this node could be dispensed with entirely, but the
2593  * endmarker role is not redundant.
2594  */
2595 STATIC regnode *
2596 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2597 {
2598     register regnode *ret;
2599     register char op;
2600     register char *next;
2601     I32 flags;
2602     char *origparse = RExC_parse;
2603     char *maxpos;
2604     I32 min;
2605     I32 max = REG_INFTY;
2606     char *parse_start;
2607
2608     ret = regatom(pRExC_state, &flags);
2609     if (ret == NULL) {
2610         if (flags & TRYAGAIN)
2611             *flagp |= TRYAGAIN;
2612         return(NULL);
2613     }
2614
2615     op = *RExC_parse;
2616
2617     if (op == '{' && regcurly(RExC_parse)) {
2618         parse_start = RExC_parse; /* MJD */
2619         next = RExC_parse + 1;
2620         maxpos = Nullch;
2621         while (isDIGIT(*next) || *next == ',') {
2622             if (*next == ',') {
2623                 if (maxpos)
2624                     break;
2625                 else
2626                     maxpos = next;
2627             }
2628             next++;
2629         }
2630         if (*next == '}') {             /* got one */
2631             if (!maxpos)
2632                 maxpos = next;
2633             RExC_parse++;
2634             min = atoi(RExC_parse);
2635             if (*maxpos == ',')
2636                 maxpos++;
2637             else
2638                 maxpos = RExC_parse;
2639             max = atoi(maxpos);
2640             if (!max && *maxpos != '0')
2641                 max = REG_INFTY;                /* meaning "infinity" */
2642             else if (max >= REG_INFTY)
2643                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2644             RExC_parse = next;
2645             nextchar(pRExC_state);
2646
2647         do_curly:
2648             if ((flags&SIMPLE)) {
2649                 RExC_naughty += 2 + RExC_naughty / 2;
2650                 reginsert(pRExC_state, CURLY, ret);
2651                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2652                 Set_Node_Cur_Length(ret);
2653             }
2654             else {
2655                 regnode *w = reg_node(pRExC_state, WHILEM);
2656
2657                 w->flags = 0;
2658                 regtail(pRExC_state, ret, w);
2659                 if (!SIZE_ONLY && RExC_extralen) {
2660                     reginsert(pRExC_state, LONGJMP,ret);
2661                     reginsert(pRExC_state, NOTHING,ret);
2662                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2663                 }
2664                 reginsert(pRExC_state, CURLYX,ret);
2665                                 /* MJD hk */
2666                 Set_Node_Offset(ret, parse_start+1);
2667                 Set_Node_Length(ret, 
2668                                 op == '{' ? (RExC_parse - parse_start) : 1);
2669                 
2670                 if (!SIZE_ONLY && RExC_extralen)
2671                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2672                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2673                 if (SIZE_ONLY)
2674                     RExC_whilem_seen++, RExC_extralen += 3;
2675                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2676             }
2677             ret->flags = 0;
2678
2679             if (min > 0)
2680                 *flagp = WORST;
2681             if (max > 0)
2682                 *flagp |= HASWIDTH;
2683             if (max && max < min)
2684                 vFAIL("Can't do {n,m} with n > m");
2685             if (!SIZE_ONLY) {
2686                 ARG1_SET(ret, (U16)min);
2687                 ARG2_SET(ret, (U16)max);
2688             }
2689
2690             goto nest_check;
2691         }
2692     }
2693
2694     if (!ISMULT1(op)) {
2695         *flagp = flags;
2696         return(ret);
2697     }
2698
2699 #if 0                           /* Now runtime fix should be reliable. */
2700
2701     /* if this is reinstated, don't forget to put this back into perldiag:
2702
2703             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2704
2705            (F) The part of the regexp subject to either the * or + quantifier
2706            could match an empty string. The {#} shows in the regular
2707            expression about where the problem was discovered.
2708
2709     */
2710
2711     if (!(flags&HASWIDTH) && op != '?')
2712       vFAIL("Regexp *+ operand could be empty");
2713 #endif
2714
2715     parse_start = RExC_parse;
2716     nextchar(pRExC_state);
2717
2718     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2719
2720     if (op == '*' && (flags&SIMPLE)) {
2721         reginsert(pRExC_state, STAR, ret);
2722         ret->flags = 0;
2723         RExC_naughty += 4;
2724     }
2725     else if (op == '*') {
2726         min = 0;
2727         goto do_curly;
2728     }
2729     else if (op == '+' && (flags&SIMPLE)) {
2730         reginsert(pRExC_state, PLUS, ret);
2731         ret->flags = 0;
2732         RExC_naughty += 3;
2733     }
2734     else if (op == '+') {
2735         min = 1;
2736         goto do_curly;
2737     }
2738     else if (op == '?') {
2739         min = 0; max = 1;
2740         goto do_curly;
2741     }
2742   nest_check:
2743     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2744         vWARN3(RExC_parse,
2745                "%.*s matches null string many times",
2746                RExC_parse - origparse,
2747                origparse);
2748     }
2749
2750     if (*RExC_parse == '?') {
2751         nextchar(pRExC_state);
2752         reginsert(pRExC_state, MINMOD, ret);
2753         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2754     }
2755     if (ISMULT2(RExC_parse)) {
2756         RExC_parse++;
2757         vFAIL("Nested quantifiers");
2758     }
2759
2760     return(ret);
2761 }
2762
2763 /*
2764  - regatom - the lowest level
2765  *
2766  * Optimization:  gobbles an entire sequence of ordinary characters so that
2767  * it can turn them into a single node, which is smaller to store and
2768  * faster to run.  Backslashed characters are exceptions, each becoming a
2769  * separate node; the code is simpler that way and it's not worth fixing.
2770  *
2771  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2772 STATIC regnode *
2773 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2774 {
2775     register regnode *ret = 0;
2776     I32 flags;
2777     char *parse_start = 0;
2778
2779     *flagp = WORST;             /* Tentatively. */
2780
2781 tryagain:
2782     switch (*RExC_parse) {
2783     case '^':
2784         RExC_seen_zerolen++;
2785         nextchar(pRExC_state);
2786         if (RExC_flags & PMf_MULTILINE)
2787             ret = reg_node(pRExC_state, MBOL);
2788         else if (RExC_flags & PMf_SINGLELINE)
2789             ret = reg_node(pRExC_state, SBOL);
2790         else
2791             ret = reg_node(pRExC_state, BOL);
2792         Set_Node_Length(ret, 1); /* MJD */
2793         break;
2794     case '$':
2795         nextchar(pRExC_state);
2796         if (*RExC_parse)
2797             RExC_seen_zerolen++;
2798         if (RExC_flags & PMf_MULTILINE)
2799             ret = reg_node(pRExC_state, MEOL);
2800         else if (RExC_flags & PMf_SINGLELINE)
2801             ret = reg_node(pRExC_state, SEOL);
2802         else
2803             ret = reg_node(pRExC_state, EOL);
2804         Set_Node_Length(ret, 1); /* MJD */
2805         break;
2806     case '.':
2807         nextchar(pRExC_state);
2808         if (RExC_flags & PMf_SINGLELINE)
2809             ret = reg_node(pRExC_state, SANY);
2810         else
2811             ret = reg_node(pRExC_state, REG_ANY);
2812         *flagp |= HASWIDTH|SIMPLE;
2813         RExC_naughty++;
2814         Set_Node_Length(ret, 1); /* MJD */
2815         break;
2816     case '[':
2817     {
2818         char *oregcomp_parse = ++RExC_parse;
2819         ret = regclass(pRExC_state);
2820         if (*RExC_parse != ']') {
2821             RExC_parse = oregcomp_parse;
2822             vFAIL("Unmatched [");
2823         }
2824         nextchar(pRExC_state);
2825         *flagp |= HASWIDTH|SIMPLE;
2826         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2827         break;
2828     }
2829     case '(':
2830         nextchar(pRExC_state);
2831         ret = reg(pRExC_state, 1, &flags);
2832         if (ret == NULL) {
2833                 if (flags & TRYAGAIN) {
2834                     if (RExC_parse == RExC_end) {
2835                          /* Make parent create an empty node if needed. */
2836                         *flagp |= TRYAGAIN;
2837                         return(NULL);
2838                     }
2839                     goto tryagain;
2840                 }
2841                 return(NULL);
2842         }
2843         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2844         break;
2845     case '|':
2846     case ')':
2847         if (flags & TRYAGAIN) {
2848             *flagp |= TRYAGAIN;
2849             return NULL;
2850         }
2851         vFAIL("Internal urp");
2852                                 /* Supposed to be caught earlier. */
2853         break;
2854     case '{':
2855         if (!regcurly(RExC_parse)) {
2856             RExC_parse++;
2857             goto defchar;
2858         }
2859         /* FALL THROUGH */
2860     case '?':
2861     case '+':
2862     case '*':
2863         RExC_parse++;
2864         vFAIL("Quantifier follows nothing");
2865         break;
2866     case '\\':
2867         switch (*++RExC_parse) {
2868         case 'A':
2869             RExC_seen_zerolen++;
2870             ret = reg_node(pRExC_state, SBOL);
2871             *flagp |= SIMPLE;
2872             nextchar(pRExC_state);
2873             Set_Node_Length(ret, 2); /* MJD */
2874             break;
2875         case 'G':
2876             ret = reg_node(pRExC_state, GPOS);
2877             RExC_seen |= REG_SEEN_GPOS;
2878             *flagp |= SIMPLE;
2879             nextchar(pRExC_state);
2880             Set_Node_Length(ret, 2); /* MJD */
2881             break;
2882         case 'Z':
2883             ret = reg_node(pRExC_state, SEOL);
2884             *flagp |= SIMPLE;
2885             RExC_seen_zerolen++;                /* Do not optimize RE away */
2886             nextchar(pRExC_state);
2887             break;
2888         case 'z':
2889             ret = reg_node(pRExC_state, EOS);
2890             *flagp |= SIMPLE;
2891             RExC_seen_zerolen++;                /* Do not optimize RE away */
2892             nextchar(pRExC_state);
2893             Set_Node_Length(ret, 2); /* MJD */
2894             break;
2895         case 'C':
2896             ret = reg_node(pRExC_state, CANY);
2897             RExC_seen |= REG_SEEN_CANY;
2898             *flagp |= HASWIDTH|SIMPLE;
2899             nextchar(pRExC_state);
2900             Set_Node_Length(ret, 2); /* MJD */
2901             break;
2902         case 'X':
2903             ret = reg_node(pRExC_state, CLUMP);
2904             *flagp |= HASWIDTH;
2905             nextchar(pRExC_state);
2906             Set_Node_Length(ret, 2); /* MJD */
2907             break;
2908         case 'w':
2909             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2910             *flagp |= HASWIDTH|SIMPLE;
2911             nextchar(pRExC_state);
2912             Set_Node_Length(ret, 2); /* MJD */
2913             break;
2914         case 'W':
2915             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2916             *flagp |= HASWIDTH|SIMPLE;
2917             nextchar(pRExC_state);
2918             Set_Node_Length(ret, 2); /* MJD */
2919             break;
2920         case 'b':
2921             RExC_seen_zerolen++;
2922             RExC_seen |= REG_SEEN_LOOKBEHIND;
2923             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2924             *flagp |= SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'B':
2929             RExC_seen_zerolen++;
2930             RExC_seen |= REG_SEEN_LOOKBEHIND;
2931             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2932             *flagp |= SIMPLE;
2933             nextchar(pRExC_state);
2934             Set_Node_Length(ret, 2); /* MJD */
2935             break;
2936         case 's':
2937             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2938             *flagp |= HASWIDTH|SIMPLE;
2939             nextchar(pRExC_state);
2940             Set_Node_Length(ret, 2); /* MJD */
2941             break;
2942         case 'S':
2943             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2944             *flagp |= HASWIDTH|SIMPLE;
2945             nextchar(pRExC_state);
2946             Set_Node_Length(ret, 2); /* MJD */
2947             break;
2948         case 'd':
2949             ret = reg_node(pRExC_state, DIGIT);
2950             *flagp |= HASWIDTH|SIMPLE;
2951             nextchar(pRExC_state);
2952             Set_Node_Length(ret, 2); /* MJD */
2953             break;
2954         case 'D':
2955             ret = reg_node(pRExC_state, NDIGIT);
2956             *flagp |= HASWIDTH|SIMPLE;
2957             nextchar(pRExC_state);
2958             Set_Node_Length(ret, 2); /* MJD */
2959             break;
2960         case 'p':
2961         case 'P':
2962             {   
2963                 char* oldregxend = RExC_end;
2964                 char* parse_start = RExC_parse;
2965
2966                 if (RExC_parse[1] == '{') {
2967                   /* a lovely hack--pretend we saw [\pX] instead */
2968                     RExC_end = strchr(RExC_parse, '}');
2969                     if (!RExC_end) {
2970                         U8 c = (U8)*RExC_parse;
2971                         RExC_parse += 2;
2972                         RExC_end = oldregxend;
2973                         vFAIL2("Missing right brace on \\%c{}", c);
2974                     }
2975                     RExC_end++;
2976                 }
2977                 else {
2978                     RExC_end = RExC_parse + 2;
2979                     if (RExC_end > oldregxend)
2980                         RExC_end = oldregxend;
2981                 }
2982                 RExC_parse--;
2983
2984                 ret = regclass(pRExC_state);
2985
2986                 RExC_end = oldregxend;
2987                 RExC_parse--;
2988                 Set_Node_Cur_Length(ret); /* MJD */
2989                 nextchar(pRExC_state);
2990                 *flagp |= HASWIDTH|SIMPLE;
2991             }
2992             break;
2993         case 'n':
2994         case 'r':
2995         case 't':
2996         case 'f':
2997         case 'e':
2998         case 'a':
2999         case 'x':
3000         case 'c':
3001         case '0':
3002             goto defchar;
3003         case '1': case '2': case '3': case '4':
3004         case '5': case '6': case '7': case '8': case '9':
3005             {
3006                 I32 num = atoi(RExC_parse);
3007
3008                 if (num > 9 && num >= RExC_npar)
3009                     goto defchar;
3010                 else {
3011                     char * parse_start = RExC_parse - 1; /* MJD */
3012                     while (isDIGIT(*RExC_parse))
3013                         RExC_parse++;
3014
3015                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3016                         vFAIL("Reference to nonexistent group");
3017                     RExC_sawback = 1;
3018                     ret = reganode(pRExC_state,
3019                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3020                                    num);
3021                     *flagp |= HASWIDTH;
3022                     
3023                     /* override incorrect value set in reganode MJD */
3024                     Set_Node_Offset(ret, parse_start+1); 
3025                     Set_Node_Cur_Length(ret); /* MJD */
3026                     RExC_parse--;
3027                     nextchar(pRExC_state);
3028                 }
3029             }
3030             break;
3031         case '\0':
3032             if (RExC_parse >= RExC_end)
3033                 FAIL("Trailing \\");
3034             /* FALL THROUGH */
3035         default:
3036             /* Do not generate `unrecognized' warnings here, we fall
3037                back into the quick-grab loop below */
3038             goto defchar;
3039         }
3040         break;
3041
3042     case '#':
3043         if (RExC_flags & PMf_EXTENDED) {
3044             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3045             if (RExC_parse < RExC_end)
3046                 goto tryagain;
3047         }
3048         /* FALL THROUGH */
3049
3050     default: {
3051             register STRLEN len;
3052             register UV ender;
3053             register char *p;
3054             char *oldp, *s;
3055             STRLEN numlen;
3056             STRLEN foldlen;
3057             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3058
3059             parse_start = RExC_parse - 1;
3060
3061             RExC_parse++;
3062
3063         defchar:
3064             ender = 0;
3065             ret = reg_node(pRExC_state,
3066                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3067             s = STRING(ret);
3068             for (len = 0, p = RExC_parse - 1;
3069               len < 127 && p < RExC_end;
3070               len++)
3071             {
3072                 oldp = p;
3073
3074                 if (RExC_flags & PMf_EXTENDED)
3075                     p = regwhite(p, RExC_end);
3076                 switch (*p) {
3077                 case '^':
3078                 case '$':
3079                 case '.':
3080                 case '[':
3081                 case '(':
3082                 case ')':
3083                 case '|':
3084                     goto loopdone;
3085                 case '\\':
3086                     switch (*++p) {
3087                     case 'A':
3088                     case 'C':
3089                     case 'X':
3090                     case 'G':
3091                     case 'Z':
3092                     case 'z':
3093                     case 'w':
3094                     case 'W':
3095                     case 'b':
3096                     case 'B':
3097                     case 's':
3098                     case 'S':
3099                     case 'd':
3100                     case 'D':
3101                     case 'p':
3102                     case 'P':
3103                         --p;
3104                         goto loopdone;
3105                     case 'n':
3106                         ender = '\n';
3107                         p++;
3108                         break;
3109                     case 'r':
3110                         ender = '\r';
3111                         p++;
3112                         break;
3113                     case 't':
3114                         ender = '\t';
3115                         p++;
3116                         break;
3117                     case 'f':
3118                         ender = '\f';
3119                         p++;
3120                         break;
3121                     case 'e':
3122                           ender = ASCII_TO_NATIVE('\033');
3123                         p++;
3124                         break;
3125                     case 'a':
3126                           ender = ASCII_TO_NATIVE('\007');
3127                         p++;
3128                         break;
3129                     case 'x':
3130                         if (*++p == '{') {
3131                             char* e = strchr(p, '}');
3132         
3133                             if (!e) {
3134                                 RExC_parse = p + 1;
3135                                 vFAIL("Missing right brace on \\x{}");
3136                             }
3137                             else {
3138                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3139                                     | PERL_SCAN_DISALLOW_PREFIX;
3140                                 numlen = e - p - 1;
3141                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3142                                 if (ender > 0xff)
3143                                     RExC_utf8 = 1;
3144                                 /* numlen is generous */
3145                                 if (numlen + len >= 127) {
3146                                     p--;
3147                                     goto loopdone;
3148                                 }
3149                                 p = e + 1;
3150                             }
3151                         }
3152                         else {
3153                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3154                             numlen = 2;
3155                             ender = grok_hex(p, &numlen, &flags, NULL);
3156                             p += numlen;
3157                         }
3158                         break;
3159                     case 'c':
3160                         p++;
3161                         ender = UCHARAT(p++);
3162                         ender = toCTRL(ender);
3163                         break;
3164                     case '0': case '1': case '2': case '3':case '4':
3165                     case '5': case '6': case '7': case '8':case '9':
3166                         if (*p == '0' ||
3167                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3168                             I32 flags = 0;
3169                             numlen = 3;
3170                             ender = grok_oct(p, &numlen, &flags, NULL);
3171                             p += numlen;
3172                         }
3173                         else {
3174                             --p;
3175                             goto loopdone;
3176                         }
3177                         break;
3178                     case '\0':
3179                         if (p >= RExC_end)
3180                             FAIL("Trailing \\");
3181                         /* FALL THROUGH */
3182                     default:
3183                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3184                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3185                         goto normal_default;
3186                     }
3187                     break;
3188                 default:
3189                   normal_default:
3190                     if (UTF8_IS_START(*p) && UTF) {
3191                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3192                                                &numlen, 0);
3193                         p += numlen;
3194                     }
3195                     else
3196                         ender = *p++;
3197                     break;
3198                 }
3199                 if (RExC_flags & PMf_EXTENDED)
3200                     p = regwhite(p, RExC_end);
3201                 if (UTF && FOLD) {
3202                     /* Prime the casefolded buffer. */
3203                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3204                 }
3205                 if (ISMULT2(p)) { /* Back off on ?+*. */
3206                     if (len)
3207                         p = oldp;
3208                     else if (UTF) {
3209                          STRLEN unilen;
3210
3211                          if (FOLD) {
3212                               /* Emit all the Unicode characters. */
3213                               for (foldbuf = tmpbuf;
3214                                    foldlen;
3215                                    foldlen -= numlen) {
3216                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3217                                    if (numlen > 0) {
3218                                         reguni(pRExC_state, ender, s, &unilen);
3219                                         s       += unilen;
3220                                         len     += unilen;
3221                                         /* In EBCDIC the numlen
3222                                          * and unilen can differ. */
3223                                         foldbuf += numlen;
3224                                         if (numlen >= foldlen)
3225                                              break;
3226                                    }
3227                                    else
3228                                         break; /* "Can't happen." */
3229                               }
3230                          }
3231                          else {
3232                               reguni(pRExC_state, ender, s, &unilen);
3233                               if (unilen > 0) {
3234                                    s   += unilen;
3235                                    len += unilen;
3236                               }
3237                          }
3238                     }
3239                     else {
3240                         len++;
3241                         REGC((char)ender, s++);
3242                     }
3243                     break;
3244                 }
3245                 if (UTF) {
3246                      STRLEN unilen;
3247
3248                      if (FOLD) {
3249                           /* Emit all the Unicode characters. */
3250                           for (foldbuf = tmpbuf;
3251                                foldlen;
3252                                foldlen -= numlen) {
3253                                ender = utf8_to_uvchr(foldbuf, &numlen);
3254                                if (numlen > 0) {
3255                                     reguni(pRExC_state, ender, s, &unilen);
3256                                     len     += unilen;
3257                                     s       += unilen;
3258                                     /* In EBCDIC the numlen
3259                                      * and unilen can differ. */
3260                                     foldbuf += numlen;
3261                                     if (numlen >= foldlen)
3262                                          break;
3263                                }
3264                                else
3265                                     break;
3266                           }
3267                      }
3268                      else {
3269                           reguni(pRExC_state, ender, s, &unilen);
3270                           if (unilen > 0) {
3271                                s   += unilen;
3272                                len += unilen;
3273                           }
3274                      }
3275                      len--;
3276                 }
3277                 else
3278                     REGC((char)ender, s++);
3279             }
3280         loopdone:
3281             RExC_parse = p - 1;
3282             Set_Node_Cur_Length(ret); /* MJD */
3283             nextchar(pRExC_state);
3284             {
3285                 /* len is STRLEN which is unsigned, need to copy to signed */
3286                 IV iv = len;
3287                 if (iv < 0)
3288                     vFAIL("Internal disaster");
3289             }
3290             if (len > 0)
3291                 *flagp |= HASWIDTH;
3292             if (len == 1)
3293                 *flagp |= SIMPLE;
3294             if (!SIZE_ONLY)
3295                 STR_LEN(ret) = len;
3296             if (SIZE_ONLY)
3297                 RExC_size += STR_SZ(len);
3298             else
3299                 RExC_emit += STR_SZ(len);
3300         }
3301         break;
3302     }
3303
3304     /* If the encoding pragma is in effect recode the text of
3305      * any EXACT-kind nodes. */
3306     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3307          STRLEN oldlen = STR_LEN(ret);
3308          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3309
3310          if (RExC_utf8)
3311               SvUTF8_on(sv);
3312          if (sv_utf8_downgrade(sv, TRUE)) {
3313               char *s       = sv_recode_to_utf8(sv, PL_encoding);
3314               STRLEN newlen = SvCUR(sv);
3315          
3316               if (!SIZE_ONLY) {
3317                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3318                                          (int)oldlen, STRING(ret),
3319                                          (int)newlen, s));
3320                    Copy(s, STRING(ret), newlen, char);
3321                    STR_LEN(ret) += newlen - oldlen;
3322                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3323               } else
3324                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3325          }
3326     }
3327
3328     return(ret);
3329 }
3330
3331 STATIC char *
3332 S_regwhite(pTHX_ char *p, char *e)
3333 {
3334     while (p < e) {
3335         if (isSPACE(*p))
3336             ++p;
3337         else if (*p == '#') {
3338             do {
3339                 p++;
3340             } while (p < e && *p != '\n');
3341         }
3342         else
3343             break;
3344     }
3345     return p;
3346 }
3347
3348 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3349    Character classes ([:foo:]) can also be negated ([:^foo:]).
3350    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3351    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3352    but trigger failures because they are currently unimplemented. */
3353
3354 #define POSIXCC_DONE(c)   ((c) == ':')
3355 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3356 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3357
3358 STATIC I32
3359 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3360 {
3361     char *posixcc = 0;
3362     I32 namedclass = OOB_NAMEDCLASS;
3363
3364     if (value == '[' && RExC_parse + 1 < RExC_end &&
3365         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3366         POSIXCC(UCHARAT(RExC_parse))) {
3367         char  c = UCHARAT(RExC_parse);
3368         char* s = RExC_parse++;
3369         
3370         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3371             RExC_parse++;
3372         if (RExC_parse == RExC_end)
3373             /* Grandfather lone [:, [=, [. */
3374             RExC_parse = s;
3375         else {
3376             char* t = RExC_parse++; /* skip over the c */
3377
3378             if (UCHARAT(RExC_parse) == ']') {
3379                 RExC_parse++; /* skip over the ending ] */
3380                 posixcc = s + 1;
3381                 if (*s == ':') {
3382                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3383                     I32 skip = 5; /* the most common skip */
3384
3385                     switch (*posixcc) {
3386                     case 'a':
3387                         if (strnEQ(posixcc, "alnum", 5))
3388                             namedclass =
3389                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3390                         else if (strnEQ(posixcc, "alpha", 5))
3391                             namedclass =
3392                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3393                         else if (strnEQ(posixcc, "ascii", 5))
3394                             namedclass =
3395                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3396                         break;
3397                     case 'b':
3398                         if (strnEQ(posixcc, "blank", 5))
3399                             namedclass =
3400                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3401                         break;
3402                     case 'c':
3403                         if (strnEQ(posixcc, "cntrl", 5))
3404                             namedclass =
3405                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3406                         break;
3407                     case 'd':
3408                         if (strnEQ(posixcc, "digit", 5))
3409                             namedclass =
3410                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3411                         break;
3412                     case 'g':
3413                         if (strnEQ(posixcc, "graph", 5))
3414                             namedclass =
3415                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3416                         break;
3417                     case 'l':
3418                         if (strnEQ(posixcc, "lower", 5))
3419                             namedclass =
3420                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3421                         break;
3422                     case 'p':
3423                         if (strnEQ(posixcc, "print", 5))
3424                             namedclass =
3425                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3426                         else if (strnEQ(posixcc, "punct", 5))
3427                             namedclass =
3428                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3429                         break;
3430                     case 's':
3431                         if (strnEQ(posixcc, "space", 5))
3432                             namedclass =
3433                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3434                         break;
3435                     case 'u':
3436                         if (strnEQ(posixcc, "upper", 5))
3437                             namedclass =
3438                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3439                         break;
3440                     case 'w': /* this is not POSIX, this is the Perl \w */
3441                         if (strnEQ(posixcc, "word", 4)) {
3442                             namedclass =
3443                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3444                             skip = 4;
3445                         }
3446                         break;
3447                     case 'x':
3448                         if (strnEQ(posixcc, "xdigit", 6)) {
3449                             namedclass =
3450                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3451                             skip = 6;
3452                         }
3453                         break;
3454                     }
3455                     if (namedclass == OOB_NAMEDCLASS ||
3456                         posixcc[skip] != ':' ||
3457                         posixcc[skip+1] != ']')
3458                     {
3459                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3460                                       t - s - 1, s + 1);
3461                     }
3462                 } else if (!SIZE_ONLY) {
3463                     /* [[=foo=]] and [[.foo.]] are still future. */
3464
3465                     /* adjust RExC_parse so the warning shows after
3466                        the class closes */
3467                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3468                         RExC_parse++;
3469                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3470                 }
3471             } else {
3472                 /* Maternal grandfather:
3473                  * "[:" ending in ":" but not in ":]" */
3474                 RExC_parse = s;
3475             }
3476         }
3477     }
3478
3479     return namedclass;
3480 }
3481
3482 STATIC void
3483 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3484 {
3485     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3486         char *s = RExC_parse;
3487         char  c = *s++;
3488
3489         while(*s && isALNUM(*s))
3490             s++;
3491         if (*s && c == *s && s[1] == ']') {
3492             if (ckWARN(WARN_REGEXP))
3493                 vWARN3(s+2,
3494                         "POSIX syntax [%c %c] belongs inside character classes",
3495                         c, c);
3496
3497             /* [[=foo=]] and [[.foo.]] are still future. */
3498             if (POSIXCC_NOTYET(c)) {
3499                 /* adjust RExC_parse so the error shows after
3500                    the class closes */
3501                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3502                     ;
3503                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3504             }
3505         }
3506     }
3507 }
3508
3509 STATIC regnode *
3510 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3511 {
3512     register UV value;
3513     register UV nextvalue;
3514     register IV prevvalue = OOB_UNICODE;
3515     register IV range = 0;
3516     register regnode *ret;
3517     STRLEN numlen;
3518     IV namedclass;
3519     char *rangebegin = 0;
3520     bool need_class = 0;
3521     SV *listsv = Nullsv;
3522     register char *e;
3523     UV n;
3524     bool optimize_invert   = TRUE;
3525     AV* unicode_alternate  = 0;
3526 #ifdef EBCDIC
3527     UV literal_endpoint = 0;
3528 #endif
3529
3530     ret = reganode(pRExC_state, ANYOF, 0);
3531
3532     if (!SIZE_ONLY)
3533         ANYOF_FLAGS(ret) = 0;
3534
3535     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3536         RExC_naughty++;
3537         RExC_parse++;
3538         if (!SIZE_ONLY)
3539             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3540     }
3541
3542     if (SIZE_ONLY)
3543         RExC_size += ANYOF_SKIP;
3544     else {
3545         RExC_emit += ANYOF_SKIP;
3546         if (FOLD)
3547             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3548         if (LOC)
3549             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3550         ANYOF_BITMAP_ZERO(ret);
3551         listsv = newSVpvn("# comment\n", 10);
3552     }
3553
3554     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3555
3556     if (!SIZE_ONLY && POSIXCC(nextvalue))
3557         checkposixcc(pRExC_state);
3558
3559     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3560     if (UCHARAT(RExC_parse) == ']')
3561         goto charclassloop;
3562
3563     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3564
3565     charclassloop:
3566
3567         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3568
3569         if (!range)
3570             rangebegin = RExC_parse;
3571         if (UTF) {
3572             value = utf8n_to_uvchr((U8*)RExC_parse,
3573                                    RExC_end - RExC_parse,
3574                                    &numlen, 0);
3575             RExC_parse += numlen;
3576         }
3577         else
3578             value = UCHARAT(RExC_parse++);
3579         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3580         if (value == '[' && POSIXCC(nextvalue))
3581             namedclass = regpposixcc(pRExC_state, value);
3582         else if (value == '\\') {
3583             if (UTF) {
3584                 value = utf8n_to_uvchr((U8*)RExC_parse,
3585                                    RExC_end - RExC_parse,
3586                                    &numlen, 0);
3587                 RExC_parse += numlen;
3588             }
3589             else
3590                 value = UCHARAT(RExC_parse++);
3591             /* Some compilers cannot handle switching on 64-bit integer
3592              * values, therefore value cannot be an UV.  Yes, this will
3593              * be a problem later if we want switch on Unicode.
3594              * A similar issue a little bit later when switching on
3595              * namedclass. --jhi */
3596             switch ((I32)value) {
3597             case 'w':   namedclass = ANYOF_ALNUM;       break;
3598             case 'W':   namedclass = ANYOF_NALNUM;      break;
3599             case 's':   namedclass = ANYOF_SPACE;       break;
3600             case 'S':   namedclass = ANYOF_NSPACE;      break;
3601             case 'd':   namedclass = ANYOF_DIGIT;       break;
3602             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3603             case 'p':
3604             case 'P':
3605                 if (RExC_parse >= RExC_end)
3606                     vFAIL2("Empty \\%c{}", (U8)value);
3607                 if (*RExC_parse == '{') {
3608                     U8 c = (U8)value;
3609                     e = strchr(RExC_parse++, '}');
3610                     if (!e)
3611                         vFAIL2("Missing right brace on \\%c{}", c);
3612                     while (isSPACE(UCHARAT(RExC_parse)))
3613                         RExC_parse++;
3614                     if (e == RExC_parse)
3615                         vFAIL2("Empty \\%c{}", c);
3616                     n = e - RExC_parse;
3617                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3618                         n--;
3619                 }
3620                 else {
3621                     e = RExC_parse;
3622                     n = 1;
3623                 }
3624                 if (!SIZE_ONLY) {
3625                     if (UCHARAT(RExC_parse) == '^') {
3626                          RExC_parse++;
3627                          n--;
3628                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3629                          while (isSPACE(UCHARAT(RExC_parse))) {
3630                               RExC_parse++;
3631                               n--;
3632                          }
3633                     }
3634                     if (value == 'p')
3635                          Perl_sv_catpvf(aTHX_ listsv,
3636                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3637                     else
3638                          Perl_sv_catpvf(aTHX_ listsv,
3639                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3640                 }
3641                 RExC_parse = e + 1;
3642                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3643                 continue;
3644             case 'n':   value = '\n';                   break;
3645             case 'r':   value = '\r';                   break;
3646             case 't':   value = '\t';                   break;
3647             case 'f':   value = '\f';                   break;
3648             case 'b':   value = '\b';                   break;
3649             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3650             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3651             case 'x':
3652                 if (*RExC_parse == '{') {
3653                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3654                         | PERL_SCAN_DISALLOW_PREFIX;
3655                     e = strchr(RExC_parse++, '}');
3656                     if (!e)
3657                         vFAIL("Missing right brace on \\x{}");
3658
3659                     numlen = e - RExC_parse;
3660                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3661                     RExC_parse = e + 1;
3662                 }
3663                 else {
3664                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3665                     numlen = 2;
3666                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3667                     RExC_parse += numlen;
3668                 }
3669                 break;
3670             case 'c':
3671                 value = UCHARAT(RExC_parse++);
3672                 value = toCTRL(value);
3673                 break;
3674             case '0': case '1': case '2': case '3': case '4':
3675             case '5': case '6': case '7': case '8': case '9':
3676             {
3677                 I32 flags = 0;
3678                 numlen = 3;
3679                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3680                 RExC_parse += numlen;
3681                 break;
3682             }
3683             default:
3684                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3685                     vWARN2(RExC_parse,
3686                            "Unrecognized escape \\%c in character class passed through",
3687                            (int)value);
3688                 break;
3689             }
3690         } /* end of \blah */
3691 #ifdef EBCDIC
3692         else
3693             literal_endpoint++;
3694 #endif
3695
3696         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3697
3698             if (!SIZE_ONLY && !need_class)
3699                 ANYOF_CLASS_ZERO(ret);
3700
3701             need_class = 1;
3702
3703             /* a bad range like a-\d, a-[:digit:] ? */
3704             if (range) {
3705                 if (!SIZE_ONLY) {
3706                     if (ckWARN(WARN_REGEXP))
3707                         vWARN4(RExC_parse,
3708                                "False [] range \"%*.*s\"",
3709                                RExC_parse - rangebegin,
3710                                RExC_parse - rangebegin,
3711                                rangebegin);
3712                     if (prevvalue < 256) {
3713                         ANYOF_BITMAP_SET(ret, prevvalue);
3714                         ANYOF_BITMAP_SET(ret, '-');
3715                     }
3716                     else {
3717                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3718                         Perl_sv_catpvf(aTHX_ listsv,
3719                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3720                     }
3721                 }
3722
3723                 range = 0; /* this was not a true range */
3724             }
3725
3726             if (!SIZE_ONLY) {
3727                 if (namedclass > OOB_NAMEDCLASS)
3728                     optimize_invert = FALSE;
3729                 /* Possible truncation here but in some 64-bit environments
3730                  * the compiler gets heartburn about switch on 64-bit values.
3731                  * A similar issue a little earlier when switching on value.
3732                  * --jhi */
3733                 switch ((I32)namedclass) {
3734                 case ANYOF_ALNUM:
3735                     if (LOC)
3736                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3737                     else {
3738                         for (value = 0; value < 256; value++)
3739                             if (isALNUM(value))
3740                                 ANYOF_BITMAP_SET(ret, value);
3741                     }
3742                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3743                     break;
3744                 case ANYOF_NALNUM:
3745                     if (LOC)
3746                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3747                     else {
3748                         for (value = 0; value < 256; value++)
3749                             if (!isALNUM(value))
3750                                 ANYOF_BITMAP_SET(ret, value);
3751                     }
3752                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3753                     break;
3754                 case ANYOF_ALNUMC:
3755                     if (LOC)
3756                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3757                     else {
3758                         for (value = 0; value < 256; value++)
3759                             if (isALNUMC(value))
3760                                 ANYOF_BITMAP_SET(ret, value);
3761                     }
3762                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3763                     break;
3764                 case ANYOF_NALNUMC:
3765                     if (LOC)
3766                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3767                     else {
3768                         for (value = 0; value < 256; value++)
3769                             if (!isALNUMC(value))
3770                                 ANYOF_BITMAP_SET(ret, value);
3771                     }
3772                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3773                     break;
3774                 case ANYOF_ALPHA:
3775                     if (LOC)
3776                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3777                     else {
3778                         for (value = 0; value < 256; value++)
3779                             if (isALPHA(value))
3780                                 ANYOF_BITMAP_SET(ret, value);
3781                     }
3782                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3783                     break;
3784                 case ANYOF_NALPHA:
3785                     if (LOC)
3786                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3787                     else {
3788                         for (value = 0; value < 256; value++)
3789                             if (!isALPHA(value))
3790                                 ANYOF_BITMAP_SET(ret, value);
3791                     }
3792                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3793                     break;
3794                 case ANYOF_ASCII:
3795                     if (LOC)
3796                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3797                     else {
3798 #ifndef EBCDIC
3799                         for (value = 0; value < 128; value++)
3800                             ANYOF_BITMAP_SET(ret, value);
3801 #else  /* EBCDIC */
3802                         for (value = 0; value < 256; value++) {
3803                             if (isASCII(value))
3804                                 ANYOF_BITMAP_SET(ret, value);
3805                         }
3806 #endif /* EBCDIC */
3807                     }
3808                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3809                     break;
3810                 case ANYOF_NASCII:
3811                     if (LOC)
3812                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3813                     else {
3814 #ifndef EBCDIC
3815                         for (value = 128; value < 256; value++)
3816                             ANYOF_BITMAP_SET(ret, value);
3817 #else  /* EBCDIC */
3818                         for (value = 0; value < 256; value++) {
3819                             if (!isASCII(value))
3820                                 ANYOF_BITMAP_SET(ret, value);
3821                         }
3822 #endif /* EBCDIC */
3823                     }
3824                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3825                     break;
3826                 case ANYOF_BLANK:
3827                     if (LOC)
3828                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3829                     else {
3830                         for (value = 0; value < 256; value++)
3831                             if (isBLANK(value))
3832                                 ANYOF_BITMAP_SET(ret, value);
3833                     }
3834                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3835                     break;
3836                 case ANYOF_NBLANK:
3837                     if (LOC)
3838                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3839                     else {
3840                         for (value = 0; value < 256; value++)
3841                             if (!isBLANK(value))
3842                                 ANYOF_BITMAP_SET(ret, value);
3843                     }
3844                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3845                     break;
3846                 case ANYOF_CNTRL:
3847                     if (LOC)
3848                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3849                     else {
3850                         for (value = 0; value < 256; value++)
3851                             if (isCNTRL(value))
3852                                 ANYOF_BITMAP_SET(ret, value);
3853                     }
3854                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3855                     break;
3856                 case ANYOF_NCNTRL:
3857                     if (LOC)
3858                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3859                     else {
3860                         for (value = 0; value < 256; value++)
3861                             if (!isCNTRL(value))
3862                                 ANYOF_BITMAP_SET(ret, value);
3863                     }
3864                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3865                     break;
3866                 case ANYOF_DIGIT:
3867                     if (LOC)
3868                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3869                     else {
3870                         /* consecutive digits assumed */
3871                         for (value = '0'; value <= '9'; value++)
3872                             ANYOF_BITMAP_SET(ret, value);
3873                     }
3874                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3875                     break;
3876                 case ANYOF_NDIGIT:
3877                     if (LOC)
3878                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3879                     else {
3880                         /* consecutive digits assumed */
3881                         for (value = 0; value < '0'; value++)
3882                             ANYOF_BITMAP_SET(ret, value);
3883                         for (value = '9' + 1; value < 256; value++)
3884                             ANYOF_BITMAP_SET(ret, value);
3885                     }
3886                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3887                     break;
3888                 case ANYOF_GRAPH:
3889                     if (LOC)
3890                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3891                     else {
3892                         for (value = 0; value < 256; value++)
3893                             if (isGRAPH(value))
3894                                 ANYOF_BITMAP_SET(ret, value);
3895                     }
3896                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3897                     break;
3898                 case ANYOF_NGRAPH:
3899                     if (LOC)
3900                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3901                     else {