This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
6ecb9787a98d813cff5f591868d95b27dd3733f7
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg) STMT_START {                                          \
260     char *ellipses = "";                                                \
261     IV len = RExC_end - RExC_precomp;                                   \
262                                                                         \
263     if (!SIZE_ONLY)                                                     \
264         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
265     if (len > RegexLengthToShowInErrorMessages) {                       \
266         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
267         len = RegexLengthToShowInErrorMessages - 10;                    \
268         ellipses = "...";                                               \
269     }                                                                   \
270     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
271             msg, (int)len, RExC_precomp, ellipses);                     \
272 } STMT_END
273
274 /*
275  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276  * args. Show regex, up to a maximum length. If it's too long, chop and add
277  * "...".
278  */
279 #define FAIL2(pat,msg) STMT_START {                                     \
280     char *ellipses = "";                                                \
281     IV len = RExC_end - RExC_precomp;                                   \
282                                                                         \
283     if (!SIZE_ONLY)                                                     \
284         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
285     if (len > RegexLengthToShowInErrorMessages) {                       \
286         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
287         len = RegexLengthToShowInErrorMessages - 10;                    \
288         ellipses = "...";                                               \
289     }                                                                   \
290     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
291             msg, (int)len, RExC_precomp, ellipses);                     \
292 } STMT_END
293
294
295 /*
296  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
297  */
298 #define Simple_vFAIL(m) STMT_START {                                    \
299     IV offset = RExC_parse - RExC_precomp;                              \
300     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
301             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
302 } STMT_END
303
304 /*
305  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
306  */
307 #define vFAIL(m) STMT_START {                           \
308     if (!SIZE_ONLY)                                     \
309         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
310     Simple_vFAIL(m);                                    \
311 } STMT_END
312
313 /*
314  * Like Simple_vFAIL(), but accepts two arguments.
315  */
316 #define Simple_vFAIL2(m,a1) STMT_START {                        \
317     IV offset = RExC_parse - RExC_precomp;                      \
318     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
319             (int)offset, RExC_precomp, RExC_precomp + offset);  \
320 } STMT_END
321
322 /*
323  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
324  */
325 #define vFAIL2(m,a1) STMT_START {                       \
326     if (!SIZE_ONLY)                                     \
327         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
328     Simple_vFAIL2(m, a1);                               \
329 } STMT_END
330
331
332 /*
333  * Like Simple_vFAIL(), but accepts three arguments.
334  */
335 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
336     IV offset = RExC_parse - RExC_precomp;                      \
337     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
338             (int)offset, RExC_precomp, RExC_precomp + offset);  \
339 } STMT_END
340
341 /*
342  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
343  */
344 #define vFAIL3(m,a1,a2) STMT_START {                    \
345     if (!SIZE_ONLY)                                     \
346         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
347     Simple_vFAIL3(m, a1, a2);                           \
348 } STMT_END
349
350 /*
351  * Like Simple_vFAIL(), but accepts four arguments.
352  */
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
354     IV offset = RExC_parse - RExC_precomp;                      \
355     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
356             (int)offset, RExC_precomp, RExC_precomp + offset);  \
357 } STMT_END
358
359 /*
360  * Like Simple_vFAIL(), but accepts five arguments.
361  */
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
363     IV offset = RExC_parse - RExC_precomp;                      \
364     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
365             (int)offset, RExC_precomp, RExC_precomp + offset);  \
366 } STMT_END
367
368
369 #define vWARN(loc,m) STMT_START {                                       \
370     IV offset = loc - RExC_precomp;                                     \
371     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
372             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
373 } STMT_END
374
375 #define vWARNdep(loc,m) STMT_START {                                    \
376     IV offset = loc - RExC_precomp;                                     \
377     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
378             "%s" REPORT_LOCATION,                                       \
379             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
380 } STMT_END
381
382
383 #define vWARN2(loc, m, a1) STMT_START {                                 \
384     IV offset = loc - RExC_precomp;                                     \
385     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
386             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
387 } STMT_END
388
389 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
390     IV offset = loc - RExC_precomp;                                     \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
393 } STMT_END
394
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
396     IV offset = loc - RExC_precomp;                                     \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399 } STMT_END
400
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
402     IV offset = loc - RExC_precomp;                                     \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START {                  \
410     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411 } STMT_END
412
413 /* Macros for recording node offsets.   20001227 mjd@plover.com 
414  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
415  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
416  * Element 0 holds the number n.
417  */
418
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
424     if (! SIZE_ONLY) {                                                  \
425         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
426                 __LINE__, (node), (byte)));                             \
427         if((node) < 0) {                                                \
428             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
429         } else {                                                        \
430             RExC_offsets[2*(node)-1] = (byte);                          \
431         }                                                               \
432     }                                                                   \
433 } STMT_END
434
435 #define Set_Node_Offset(node,byte) \
436     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
440     if (! SIZE_ONLY) {                                                  \
441         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
442                 __LINE__, (node), (len)));                              \
443         if((node) < 0) {                                                \
444             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
445         } else {                                                        \
446             RExC_offsets[2*(node)] = (len);                             \
447         }                                                               \
448     }                                                                   \
449 } STMT_END
450
451 #define Set_Node_Length(node,len) \
452     Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455     Set_Node_Length(node, RExC_parse - parse_start)
456
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461 static void clear_re(pTHX_ void *r);
462
463 /* Mark that we cannot extend a found fixed substring at this point.
464    Updata the longest found anchored substring and the longest found
465    floating substrings if needed. */
466
467 STATIC void
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469 {
470     STRLEN l = CHR_SVLEN(data->last_found);
471     STRLEN old_l = CHR_SVLEN(*data->longest);
472
473     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474         sv_setsv(*data->longest, data->last_found);
475         if (*data->longest == data->longest_fixed) {
476             data->offset_fixed = l ? data->last_start_min : data->pos_min;
477             if (data->flags & SF_BEFORE_EOL)
478                 data->flags
479                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480             else
481                 data->flags &= ~SF_FIX_BEFORE_EOL;
482         }
483         else {
484             data->offset_float_min = l ? data->last_start_min : data->pos_min;
485             data->offset_float_max = (l
486                                       ? data->last_start_max
487                                       : data->pos_min + data->pos_delta);
488             if ((U32)data->offset_float_max > (U32)I32_MAX)
489                 data->offset_float_max = I32_MAX;
490             if (data->flags & SF_BEFORE_EOL)
491                 data->flags
492                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493             else
494                 data->flags &= ~SF_FL_BEFORE_EOL;
495         }
496     }
497     SvCUR_set(data->last_found, 0);
498     data->last_end = -1;
499     data->flags &= ~SF_BEFORE_EOL;
500 }
501
502 /* Can match anything (initialization) */
503 STATIC void
504 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
505 {
506     ANYOF_CLASS_ZERO(cl);
507     ANYOF_BITMAP_SETALL(cl);
508     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
509     if (LOC)
510         cl->flags |= ANYOF_LOCALE;
511 }
512
513 /* Can match anything (initialization) */
514 STATIC int
515 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
516 {
517     int value;
518
519     for (value = 0; value <= ANYOF_MAX; value += 2)
520         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
521             return 1;
522     if (!(cl->flags & ANYOF_UNICODE_ALL))
523         return 0;
524     if (!ANYOF_BITMAP_TESTALLSET(cl))
525         return 0;
526     return 1;
527 }
528
529 /* Can match anything (initialization) */
530 STATIC void
531 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
532 {
533     Zero(cl, 1, struct regnode_charclass_class);
534     cl->type = ANYOF;
535     cl_anything(pRExC_state, cl);
536 }
537
538 STATIC void
539 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544     if (LOC)
545         cl->flags |= ANYOF_LOCALE;
546 }
547
548 /* 'And' a given class with another one.  Can create false positives */
549 /* We assume that cl is not inverted */
550 STATIC void
551 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
552          struct regnode_charclass_class *and_with)
553 {
554     if (!(and_with->flags & ANYOF_CLASS)
555         && !(cl->flags & ANYOF_CLASS)
556         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
557         && !(and_with->flags & ANYOF_FOLD)
558         && !(cl->flags & ANYOF_FOLD)) {
559         int i;
560
561         if (and_with->flags & ANYOF_INVERT)
562             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
563                 cl->bitmap[i] &= ~and_with->bitmap[i];
564         else
565             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
566                 cl->bitmap[i] &= and_with->bitmap[i];
567     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
568     if (!(and_with->flags & ANYOF_EOS))
569         cl->flags &= ~ANYOF_EOS;
570
571     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
572         cl->flags &= ~ANYOF_UNICODE_ALL;
573         cl->flags |= ANYOF_UNICODE;
574         ARG_SET(cl, ARG(and_with));
575     }
576     if (!(and_with->flags & ANYOF_UNICODE_ALL))
577         cl->flags &= ~ANYOF_UNICODE_ALL;
578     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
579         cl->flags &= ~ANYOF_UNICODE;
580 }
581
582 /* 'OR' a given class with another one.  Can create false positives */
583 /* We assume that cl is not inverted */
584 STATIC void
585 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
586 {
587     if (or_with->flags & ANYOF_INVERT) {
588         /* We do not use
589          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
590          *   <= (B1 | !B2) | (CL1 | !CL2)
591          * which is wasteful if CL2 is small, but we ignore CL2:
592          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
593          * XXXX Can we handle case-fold?  Unclear:
594          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
595          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
596          */
597         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
598              && !(or_with->flags & ANYOF_FOLD)
599              && !(cl->flags & ANYOF_FOLD) ) {
600             int i;
601
602             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
603                 cl->bitmap[i] |= ~or_with->bitmap[i];
604         } /* XXXX: logic is complicated otherwise */
605         else {
606             cl_anything(pRExC_state, cl);
607         }
608     } else {
609         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
610         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
611              && (!(or_with->flags & ANYOF_FOLD)
612                  || (cl->flags & ANYOF_FOLD)) ) {
613             int i;
614
615             /* OR char bitmap and class bitmap separately */
616             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
617                 cl->bitmap[i] |= or_with->bitmap[i];
618             if (or_with->flags & ANYOF_CLASS) {
619                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
620                     cl->classflags[i] |= or_with->classflags[i];
621                 cl->flags |= ANYOF_CLASS;
622             }
623         }
624         else { /* XXXX: logic is complicated, leave it along for a moment. */
625             cl_anything(pRExC_state, cl);
626         }
627     }
628     if (or_with->flags & ANYOF_EOS)
629         cl->flags |= ANYOF_EOS;
630
631     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
632         ARG(cl) != ARG(or_with)) {
633         cl->flags |= ANYOF_UNICODE_ALL;
634         cl->flags &= ~ANYOF_UNICODE;
635     }
636     if (or_with->flags & ANYOF_UNICODE_ALL) {
637         cl->flags |= ANYOF_UNICODE_ALL;
638         cl->flags &= ~ANYOF_UNICODE;
639     }
640 }
641
642 /*
643  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
644  * These need to be revisited when a newer toolchain becomes available.
645  */
646 #if defined(__sparc64__) && defined(__GNUC__)
647 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
648 #       undef  SPARC64_GCC_WORKAROUND
649 #       define SPARC64_GCC_WORKAROUND 1
650 #   endif
651 #endif
652
653 /* REx optimizer.  Converts nodes into quickier variants "in place".
654    Finds fixed substrings.  */
655
656 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
657    to the position after last scanned or to NULL. */
658
659 STATIC I32
660 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
661                         /* scanp: Start here (read-write). */
662                         /* deltap: Write maxlen-minlen here. */
663                         /* last: Stop before this one. */
664 {
665     I32 min = 0, pars = 0, code;
666     regnode *scan = *scanp, *next;
667     I32 delta = 0;
668     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
669     int is_inf_internal = 0;            /* The studied chunk is infinite */
670     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
671     scan_data_t data_fake;
672     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
673
674     while (scan && OP(scan) != END && scan < last) {
675         /* Peephole optimizer: */
676
677         if (PL_regkind[(U8)OP(scan)] == EXACT) {
678             /* Merge several consecutive EXACTish nodes into one. */
679             regnode *n = regnext(scan);
680             U32 stringok = 1;
681 #ifdef DEBUGGING
682             regnode *stop = scan;
683 #endif
684
685             next = scan + NODE_SZ_STR(scan);
686             /* Skip NOTHING, merge EXACT*. */
687             while (n &&
688                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
689                      (stringok && (OP(n) == OP(scan))))
690                    && NEXT_OFF(n)
691                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
692                 if (OP(n) == TAIL || n > next)
693                     stringok = 0;
694                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
695                     NEXT_OFF(scan) += NEXT_OFF(n);
696                     next = n + NODE_STEP_REGNODE;
697 #ifdef DEBUGGING
698                     if (stringok)
699                         stop = n;
700 #endif
701                     n = regnext(n);
702                 }
703                 else if (stringok) {
704                     int oldl = STR_LEN(scan);
705                     regnode *nnext = regnext(n);
706
707                     if (oldl + STR_LEN(n) > U8_MAX)
708                         break;
709                     NEXT_OFF(scan) += NEXT_OFF(n);
710                     STR_LEN(scan) += STR_LEN(n);
711                     next = n + NODE_SZ_STR(n);
712                     /* Now we can overwrite *n : */
713                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
714 #ifdef DEBUGGING
715                     stop = next - 1;
716 #endif
717                     n = nnext;
718                 }
719             }
720
721             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
722 /*
723   Two problematic code points in Unicode casefolding of EXACT nodes:
724
725    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
726    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
727
728    which casefold to
729
730    Unicode                      UTF-8
731
732    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
733    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
734
735    This means that in case-insensitive matching (or "loose matching",
736    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
737    length of the above casefolded versions) can match a target string
738    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
739    This would rather mess up the minimum length computation.
740
741    What we'll do is to look for the tail four bytes, and then peek
742    at the preceding two bytes to see whether we need to decrease
743    the minimum length by four (six minus two).
744
745    Thanks to the design of UTF-8, there cannot be false matches:
746    A sequence of valid UTF-8 bytes cannot be a subsequence of
747    another valid sequence of UTF-8 bytes.
748
749 */
750                  char *s0 = STRING(scan), *s, *t;
751                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
752                  char *t0 = "\xcc\x88\xcc\x81";
753                  char *t1 = t0 + 3;
754                  
755                  for (s = s0 + 2;
756                       s < s2 && (t = ninstr(s, s1, t0, t1));
757                       s = t + 4) {
758                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
759                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
760                            min -= 4;
761                  }
762             }
763
764 #ifdef DEBUGGING
765             /* Allow dumping */
766             n = scan + NODE_SZ_STR(scan);
767             while (n <= stop) {
768                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
769                     OP(n) = OPTIMIZED;
770                     NEXT_OFF(n) = 0;
771                 }
772                 n++;
773             }
774 #endif
775         }
776         /* Follow the next-chain of the current node and optimize
777            away all the NOTHINGs from it.  */
778         if (OP(scan) != CURLYX) {
779             int max = (reg_off_by_arg[OP(scan)]
780                        ? I32_MAX
781                        /* I32 may be smaller than U16 on CRAYs! */
782                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
783             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
784             int noff;
785             regnode *n = scan;
786         
787             /* Skip NOTHING and LONGJMP. */
788             while ((n = regnext(n))
789                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
790                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
791                    && off + noff < max)
792                 off += noff;
793             if (reg_off_by_arg[OP(scan)])
794                 ARG(scan) = off;
795             else
796                 NEXT_OFF(scan) = off;
797         }
798         /* The principal pseudo-switch.  Cannot be a switch, since we
799            look into several different things.  */
800         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
801                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
802             next = regnext(scan);
803             code = OP(scan);
804         
805             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
806                 I32 max1 = 0, min1 = I32_MAX, num = 0;
807                 struct regnode_charclass_class accum;
808                 
809                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
810                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
811                 if (flags & SCF_DO_STCLASS)
812                     cl_init_zero(pRExC_state, &accum);
813                 while (OP(scan) == code) {
814                     I32 deltanext, minnext, f = 0, fake;
815                     struct regnode_charclass_class this_class;
816
817                     num++;
818                     data_fake.flags = 0;
819                     if (data) {         
820                         data_fake.whilem_c = data->whilem_c;
821                         data_fake.last_closep = data->last_closep;
822                     }
823                     else
824                         data_fake.last_closep = &fake;
825                     next = regnext(scan);
826                     scan = NEXTOPER(scan);
827                     if (code != BRANCH)
828                         scan = NEXTOPER(scan);
829                     if (flags & SCF_DO_STCLASS) {
830                         cl_init(pRExC_state, &this_class);
831                         data_fake.start_class = &this_class;
832                         f = SCF_DO_STCLASS_AND;
833                     }           
834                     if (flags & SCF_WHILEM_VISITED_POS)
835                         f |= SCF_WHILEM_VISITED_POS;
836                     /* we suppose the run is continuous, last=next...*/
837                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
838                                           next, &data_fake, f);
839                     if (min1 > minnext)
840                         min1 = minnext;
841                     if (max1 < minnext + deltanext)
842                         max1 = minnext + deltanext;
843                     if (deltanext == I32_MAX)
844                         is_inf = is_inf_internal = 1;
845                     scan = next;
846                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
847                         pars++;
848                     if (data && (data_fake.flags & SF_HAS_EVAL))
849                         data->flags |= SF_HAS_EVAL;
850                     if (data)
851                         data->whilem_c = data_fake.whilem_c;
852                     if (flags & SCF_DO_STCLASS)
853                         cl_or(pRExC_state, &accum, &this_class);
854                     if (code == SUSPEND)
855                         break;
856                 }
857                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
858                     min1 = 0;
859                 if (flags & SCF_DO_SUBSTR) {
860                     data->pos_min += min1;
861                     data->pos_delta += max1 - min1;
862                     if (max1 != min1 || is_inf)
863                         data->longest = &(data->longest_float);
864                 }
865                 min += min1;
866                 delta += max1 - min1;
867                 if (flags & SCF_DO_STCLASS_OR) {
868                     cl_or(pRExC_state, data->start_class, &accum);
869                     if (min1) {
870                         cl_and(data->start_class, &and_with);
871                         flags &= ~SCF_DO_STCLASS;
872                     }
873                 }
874                 else if (flags & SCF_DO_STCLASS_AND) {
875                     if (min1) {
876                         cl_and(data->start_class, &accum);
877                         flags &= ~SCF_DO_STCLASS;
878                     }
879                     else {
880                         /* Switch to OR mode: cache the old value of
881                          * data->start_class */
882                         StructCopy(data->start_class, &and_with,
883                                    struct regnode_charclass_class);
884                         flags &= ~SCF_DO_STCLASS_AND;
885                         StructCopy(&accum, data->start_class,
886                                    struct regnode_charclass_class);
887                         flags |= SCF_DO_STCLASS_OR;
888                         data->start_class->flags |= ANYOF_EOS;
889                     }
890                 }
891             }
892             else if (code == BRANCHJ)   /* single branch is optimized. */
893                 scan = NEXTOPER(NEXTOPER(scan));
894             else                        /* single branch is optimized. */
895                 scan = NEXTOPER(scan);
896             continue;
897         }
898         else if (OP(scan) == EXACT) {
899             I32 l = STR_LEN(scan);
900             UV uc = *((U8*)STRING(scan));
901             if (UTF) {
902                 U8 *s = (U8*)STRING(scan);
903                 l = utf8_length(s, s + l);
904                 uc = utf8_to_uvchr(s, NULL);
905             }
906             min += l;
907             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
908                 /* The code below prefers earlier match for fixed
909                    offset, later match for variable offset.  */
910                 if (data->last_end == -1) { /* Update the start info. */
911                     data->last_start_min = data->pos_min;
912                     data->last_start_max = is_inf
913                         ? I32_MAX : data->pos_min + data->pos_delta;
914                 }
915                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
916                 if (UTF)
917                     SvUTF8_on(data->last_found);
918                 data->last_end = data->pos_min + l;
919                 data->pos_min += l; /* As in the first entry. */
920                 data->flags &= ~SF_BEFORE_EOL;
921             }
922             if (flags & SCF_DO_STCLASS_AND) {
923                 /* Check whether it is compatible with what we know already! */
924                 int compat = 1;
925
926                 if (uc >= 0x100 ||
927                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
928                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
929                     && (!(data->start_class->flags & ANYOF_FOLD)
930                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
931                     )
932                     compat = 0;
933                 ANYOF_CLASS_ZERO(data->start_class);
934                 ANYOF_BITMAP_ZERO(data->start_class);
935                 if (compat)
936                     ANYOF_BITMAP_SET(data->start_class, uc);
937                 data->start_class->flags &= ~ANYOF_EOS;
938                 if (uc < 0x100)
939                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
940             }
941             else if (flags & SCF_DO_STCLASS_OR) {
942                 /* false positive possible if the class is case-folded */
943                 if (uc < 0x100)
944                     ANYOF_BITMAP_SET(data->start_class, uc);
945                 else
946                     data->start_class->flags |= ANYOF_UNICODE_ALL;
947                 data->start_class->flags &= ~ANYOF_EOS;
948                 cl_and(data->start_class, &and_with);
949             }
950             flags &= ~SCF_DO_STCLASS;
951         }
952         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
953             I32 l = STR_LEN(scan);
954             UV uc = *((U8*)STRING(scan));
955
956             /* Search for fixed substrings supports EXACT only. */
957             if (flags & SCF_DO_SUBSTR)
958                 scan_commit(pRExC_state, data);
959             if (UTF) {
960                 U8 *s = (U8 *)STRING(scan);
961                 l = utf8_length(s, s + l);
962                 uc = utf8_to_uvchr(s, NULL);
963             }
964             min += l;
965             if (data && (flags & SCF_DO_SUBSTR))
966                 data->pos_min += l;
967             if (flags & SCF_DO_STCLASS_AND) {
968                 /* Check whether it is compatible with what we know already! */
969                 int compat = 1;
970
971                 if (uc >= 0x100 ||
972                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
973                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
974                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
975                     compat = 0;
976                 ANYOF_CLASS_ZERO(data->start_class);
977                 ANYOF_BITMAP_ZERO(data->start_class);
978                 if (compat) {
979                     ANYOF_BITMAP_SET(data->start_class, uc);
980                     data->start_class->flags &= ~ANYOF_EOS;
981                     data->start_class->flags |= ANYOF_FOLD;
982                     if (OP(scan) == EXACTFL)
983                         data->start_class->flags |= ANYOF_LOCALE;
984                 }
985             }
986             else if (flags & SCF_DO_STCLASS_OR) {
987                 if (data->start_class->flags & ANYOF_FOLD) {
988                     /* false positive possible if the class is case-folded.
989                        Assume that the locale settings are the same... */
990                     if (uc < 0x100)
991                         ANYOF_BITMAP_SET(data->start_class, uc);
992                     data->start_class->flags &= ~ANYOF_EOS;
993                 }
994                 cl_and(data->start_class, &and_with);
995             }
996             flags &= ~SCF_DO_STCLASS;
997         }
998         else if (strchr((char*)PL_varies,OP(scan))) {
999             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1000             I32 f = flags, pos_before = 0;
1001             regnode *oscan = scan;
1002             struct regnode_charclass_class this_class;
1003             struct regnode_charclass_class *oclass = NULL;
1004             I32 next_is_eval = 0;
1005
1006             switch (PL_regkind[(U8)OP(scan)]) {
1007             case WHILEM:                /* End of (?:...)* . */
1008                 scan = NEXTOPER(scan);
1009                 goto finish;
1010             case PLUS:
1011                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1012                     next = NEXTOPER(scan);
1013                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1014                         mincount = 1;
1015                         maxcount = REG_INFTY;
1016                         next = regnext(scan);
1017                         scan = NEXTOPER(scan);
1018                         goto do_curly;
1019                     }
1020                 }
1021                 if (flags & SCF_DO_SUBSTR)
1022                     data->pos_min++;
1023                 min++;
1024                 /* Fall through. */
1025             case STAR:
1026                 if (flags & SCF_DO_STCLASS) {
1027                     mincount = 0;
1028                     maxcount = REG_INFTY;
1029                     next = regnext(scan);
1030                     scan = NEXTOPER(scan);
1031                     goto do_curly;
1032                 }
1033                 is_inf = is_inf_internal = 1;
1034                 scan = regnext(scan);
1035                 if (flags & SCF_DO_SUBSTR) {
1036                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1037                     data->longest = &(data->longest_float);
1038                 }
1039                 goto optimize_curly_tail;
1040             case CURLY:
1041                 mincount = ARG1(scan);
1042                 maxcount = ARG2(scan);
1043                 next = regnext(scan);
1044                 if (OP(scan) == CURLYX) {
1045                     I32 lp = (data ? *(data->last_closep) : 0);
1046
1047                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1048                 }
1049                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1050                 next_is_eval = (OP(scan) == EVAL);
1051               do_curly:
1052                 if (flags & SCF_DO_SUBSTR) {
1053                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1054                     pos_before = data->pos_min;
1055                 }
1056                 if (data) {
1057                     fl = data->flags;
1058                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1059                     if (is_inf)
1060                         data->flags |= SF_IS_INF;
1061                 }
1062                 if (flags & SCF_DO_STCLASS) {
1063                     cl_init(pRExC_state, &this_class);
1064                     oclass = data->start_class;
1065                     data->start_class = &this_class;
1066                     f |= SCF_DO_STCLASS_AND;
1067                     f &= ~SCF_DO_STCLASS_OR;
1068                 }
1069                 /* These are the cases when once a subexpression
1070                    fails at a particular position, it cannot succeed
1071                    even after backtracking at the enclosing scope.
1072                 
1073                    XXXX what if minimal match and we are at the
1074                         initial run of {n,m}? */
1075                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1076                     f &= ~SCF_WHILEM_VISITED_POS;
1077
1078                 /* This will finish on WHILEM, setting scan, or on NULL: */
1079                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1080                                       mincount == 0
1081                                         ? (f & ~SCF_DO_SUBSTR) : f);
1082
1083                 if (flags & SCF_DO_STCLASS)
1084                     data->start_class = oclass;
1085                 if (mincount == 0 || minnext == 0) {
1086                     if (flags & SCF_DO_STCLASS_OR) {
1087                         cl_or(pRExC_state, data->start_class, &this_class);
1088                     }
1089                     else if (flags & SCF_DO_STCLASS_AND) {
1090                         /* Switch to OR mode: cache the old value of
1091                          * data->start_class */
1092                         StructCopy(data->start_class, &and_with,
1093                                    struct regnode_charclass_class);
1094                         flags &= ~SCF_DO_STCLASS_AND;
1095                         StructCopy(&this_class, data->start_class,
1096                                    struct regnode_charclass_class);
1097                         flags |= SCF_DO_STCLASS_OR;
1098                         data->start_class->flags |= ANYOF_EOS;
1099                     }
1100                 } else {                /* Non-zero len */
1101                     if (flags & SCF_DO_STCLASS_OR) {
1102                         cl_or(pRExC_state, data->start_class, &this_class);
1103                         cl_and(data->start_class, &and_with);
1104                     }
1105                     else if (flags & SCF_DO_STCLASS_AND)
1106                         cl_and(data->start_class, &this_class);
1107                     flags &= ~SCF_DO_STCLASS;
1108                 }
1109                 if (!scan)              /* It was not CURLYX, but CURLY. */
1110                     scan = next;
1111                 if (ckWARN(WARN_REGEXP)
1112                        /* ? quantifier ok, except for (?{ ... }) */
1113                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1114                     && (minnext == 0) && (deltanext == 0)
1115                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1116                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1117                 {
1118                     vWARN(RExC_parse,
1119                           "Quantifier unexpected on zero-length expression");
1120                 }
1121
1122                 min += minnext * mincount;
1123                 is_inf_internal |= ((maxcount == REG_INFTY
1124                                      && (minnext + deltanext) > 0)
1125                                     || deltanext == I32_MAX);
1126                 is_inf |= is_inf_internal;
1127                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1128
1129                 /* Try powerful optimization CURLYX => CURLYN. */
1130                 if (  OP(oscan) == CURLYX && data
1131                       && data->flags & SF_IN_PAR
1132                       && !(data->flags & SF_HAS_EVAL)
1133                       && !deltanext && minnext == 1 ) {
1134                     /* Try to optimize to CURLYN.  */
1135                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1136                     regnode *nxt1 = nxt;
1137 #ifdef DEBUGGING
1138                     regnode *nxt2;
1139 #endif
1140
1141                     /* Skip open. */
1142                     nxt = regnext(nxt);
1143                     if (!strchr((char*)PL_simple,OP(nxt))
1144                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1145                              && STR_LEN(nxt) == 1))
1146                         goto nogo;
1147 #ifdef DEBUGGING
1148                     nxt2 = nxt;
1149 #endif
1150                     nxt = regnext(nxt);
1151                     if (OP(nxt) != CLOSE)
1152                         goto nogo;
1153                     /* Now we know that nxt2 is the only contents: */
1154                     oscan->flags = (U8)ARG(nxt);
1155                     OP(oscan) = CURLYN;
1156                     OP(nxt1) = NOTHING; /* was OPEN. */
1157 #ifdef DEBUGGING
1158                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1159                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1160                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1161                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1162                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1163                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1164 #endif
1165                 }
1166               nogo:
1167
1168                 /* Try optimization CURLYX => CURLYM. */
1169                 if (  OP(oscan) == CURLYX && data
1170                       && !(data->flags & SF_HAS_PAR)
1171                       && !(data->flags & SF_HAS_EVAL)
1172                       && !deltanext  ) {
1173                     /* XXXX How to optimize if data == 0? */
1174                     /* Optimize to a simpler form.  */
1175                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1176                     regnode *nxt2;
1177
1178                     OP(oscan) = CURLYM;
1179                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1180                             && (OP(nxt2) != WHILEM))
1181                         nxt = nxt2;
1182                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1183                     /* Need to optimize away parenths. */
1184                     if (data->flags & SF_IN_PAR) {
1185                         /* Set the parenth number.  */
1186                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1187
1188                         if (OP(nxt) != CLOSE)
1189                             FAIL("Panic opt close");
1190                         oscan->flags = (U8)ARG(nxt);
1191                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1192                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1193 #ifdef DEBUGGING
1194                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1195                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1196                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1197                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1198 #endif
1199 #if 0
1200                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1201                             regnode *nnxt = regnext(nxt1);
1202                         
1203                             if (nnxt == nxt) {
1204                                 if (reg_off_by_arg[OP(nxt1)])
1205                                     ARG_SET(nxt1, nxt2 - nxt1);
1206                                 else if (nxt2 - nxt1 < U16_MAX)
1207                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1208                                 else
1209                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1210                             }
1211                             nxt1 = nnxt;
1212                         }
1213 #endif
1214                         /* Optimize again: */
1215                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1216                                     NULL, 0);
1217                     }
1218                     else
1219                         oscan->flags = 0;
1220                 }
1221                 else if ((OP(oscan) == CURLYX)
1222                          && (flags & SCF_WHILEM_VISITED_POS)
1223                          /* See the comment on a similar expression above.
1224                             However, this time it not a subexpression
1225                             we care about, but the expression itself. */
1226                          && (maxcount == REG_INFTY)
1227                          && data && ++data->whilem_c < 16) {
1228                     /* This stays as CURLYX, we can put the count/of pair. */
1229                     /* Find WHILEM (as in regexec.c) */
1230                     regnode *nxt = oscan + NEXT_OFF(oscan);
1231
1232                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1233                         nxt += ARG(nxt);
1234                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1235                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1236                 }
1237                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1238                     pars++;
1239                 if (flags & SCF_DO_SUBSTR) {
1240                     SV *last_str = Nullsv;
1241                     int counted = mincount != 0;
1242
1243                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1244 #if defined(SPARC64_GCC_WORKAROUND)
1245                         I32 b = 0;
1246                         STRLEN l = 0;
1247                         char *s = NULL;
1248                         I32 old = 0;
1249
1250                         if (pos_before >= data->last_start_min)
1251                             b = pos_before;
1252                         else
1253                             b = data->last_start_min;
1254
1255                         l = 0;
1256                         s = SvPV(data->last_found, l);
1257                         old = b - data->last_start_min;
1258
1259 #else
1260                         I32 b = pos_before >= data->last_start_min
1261                             ? pos_before : data->last_start_min;
1262                         STRLEN l;
1263                         char *s = SvPV(data->last_found, l);
1264                         I32 old = b - data->last_start_min;
1265 #endif
1266
1267                         if (UTF)
1268                             old = utf8_hop((U8*)s, old) - (U8*)s;
1269                         
1270                         l -= old;
1271                         /* Get the added string: */
1272                         last_str = newSVpvn(s  + old, l);
1273                         if (UTF)
1274                             SvUTF8_on(last_str);
1275                         if (deltanext == 0 && pos_before == b) {
1276                             /* What was added is a constant string */
1277                             if (mincount > 1) {
1278                                 SvGROW(last_str, (mincount * l) + 1);
1279                                 repeatcpy(SvPVX(last_str) + l,
1280                                           SvPVX(last_str), l, mincount - 1);
1281                                 SvCUR(last_str) *= mincount;
1282                                 /* Add additional parts. */
1283                                 SvCUR_set(data->last_found,
1284                                           SvCUR(data->last_found) - l);
1285                                 sv_catsv(data->last_found, last_str);
1286                                 data->last_end += l * (mincount - 1);
1287                             }
1288                         } else {
1289                             /* start offset must point into the last copy */
1290                             data->last_start_min += minnext * (mincount - 1);
1291                             data->last_start_max += is_inf ? I32_MAX
1292                                 : (maxcount - 1) * (minnext + data->pos_delta);
1293                         }
1294                     }
1295                     /* It is counted once already... */
1296                     data->pos_min += minnext * (mincount - counted);
1297                     data->pos_delta += - counted * deltanext +
1298                         (minnext + deltanext) * maxcount - minnext * mincount;
1299                     if (mincount != maxcount) {
1300                          /* Cannot extend fixed substrings found inside
1301                             the group.  */
1302                         scan_commit(pRExC_state,data);
1303                         if (mincount && last_str) {
1304                             sv_setsv(data->last_found, last_str);
1305                             data->last_end = data->pos_min;
1306                             data->last_start_min =
1307                                 data->pos_min - CHR_SVLEN(last_str);
1308                             data->last_start_max = is_inf
1309                                 ? I32_MAX
1310                                 : data->pos_min + data->pos_delta
1311                                 - CHR_SVLEN(last_str);
1312                         }
1313                         data->longest = &(data->longest_float);
1314                     }
1315                     SvREFCNT_dec(last_str);
1316                 }
1317                 if (data && (fl & SF_HAS_EVAL))
1318                     data->flags |= SF_HAS_EVAL;
1319               optimize_curly_tail:
1320                 if (OP(oscan) != CURLYX) {
1321                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1322                            && NEXT_OFF(next))
1323                         NEXT_OFF(oscan) += NEXT_OFF(next);
1324                 }
1325                 continue;
1326             default:                    /* REF and CLUMP only? */
1327                 if (flags & SCF_DO_SUBSTR) {
1328                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1329                     data->longest = &(data->longest_float);
1330                 }
1331                 is_inf = is_inf_internal = 1;
1332                 if (flags & SCF_DO_STCLASS_OR)
1333                     cl_anything(pRExC_state, data->start_class);
1334                 flags &= ~SCF_DO_STCLASS;
1335                 break;
1336             }
1337         }
1338         else if (strchr((char*)PL_simple,OP(scan))) {
1339             int value = 0;
1340
1341             if (flags & SCF_DO_SUBSTR) {
1342                 scan_commit(pRExC_state,data);
1343                 data->pos_min++;
1344             }
1345             min++;
1346             if (flags & SCF_DO_STCLASS) {
1347                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1348
1349                 /* Some of the logic below assumes that switching
1350                    locale on will only add false positives. */
1351                 switch (PL_regkind[(U8)OP(scan)]) {
1352                 case SANY:
1353                 default:
1354                   do_default:
1355                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1356                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1357                         cl_anything(pRExC_state, data->start_class);
1358                     break;
1359                 case REG_ANY:
1360                     if (OP(scan) == SANY)
1361                         goto do_default;
1362                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1363                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1364                                  || (data->start_class->flags & ANYOF_CLASS));
1365                         cl_anything(pRExC_state, data->start_class);
1366                     }
1367                     if (flags & SCF_DO_STCLASS_AND || !value)
1368                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1369                     break;
1370                 case ANYOF:
1371                     if (flags & SCF_DO_STCLASS_AND)
1372                         cl_and(data->start_class,
1373                                (struct regnode_charclass_class*)scan);
1374                     else
1375                         cl_or(pRExC_state, data->start_class,
1376                               (struct regnode_charclass_class*)scan);
1377                     break;
1378                 case ALNUM:
1379                     if (flags & SCF_DO_STCLASS_AND) {
1380                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1381                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1382                             for (value = 0; value < 256; value++)
1383                                 if (!isALNUM(value))
1384                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1385                         }
1386                     }
1387                     else {
1388                         if (data->start_class->flags & ANYOF_LOCALE)
1389                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1390                         else {
1391                             for (value = 0; value < 256; value++)
1392                                 if (isALNUM(value))
1393                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1394                         }
1395                     }
1396                     break;
1397                 case ALNUML:
1398                     if (flags & SCF_DO_STCLASS_AND) {
1399                         if (data->start_class->flags & ANYOF_LOCALE)
1400                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1401                     }
1402                     else {
1403                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1404                         data->start_class->flags |= ANYOF_LOCALE;
1405                     }
1406                     break;
1407                 case NALNUM:
1408                     if (flags & SCF_DO_STCLASS_AND) {
1409                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1411                             for (value = 0; value < 256; value++)
1412                                 if (isALNUM(value))
1413                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1414                         }
1415                     }
1416                     else {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1419                         else {
1420                             for (value = 0; value < 256; value++)
1421                                 if (!isALNUM(value))
1422                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1423                         }
1424                     }
1425                     break;
1426                 case NALNUML:
1427                     if (flags & SCF_DO_STCLASS_AND) {
1428                         if (data->start_class->flags & ANYOF_LOCALE)
1429                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1430                     }
1431                     else {
1432                         data->start_class->flags |= ANYOF_LOCALE;
1433                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434                     }
1435                     break;
1436                 case SPACE:
1437                     if (flags & SCF_DO_STCLASS_AND) {
1438                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1440                             for (value = 0; value < 256; value++)
1441                                 if (!isSPACE(value))
1442                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1443                         }
1444                     }
1445                     else {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1448                         else {
1449                             for (value = 0; value < 256; value++)
1450                                 if (isSPACE(value))
1451                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1452                         }
1453                     }
1454                     break;
1455                 case SPACEL:
1456                     if (flags & SCF_DO_STCLASS_AND) {
1457                         if (data->start_class->flags & ANYOF_LOCALE)
1458                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1459                     }
1460                     else {
1461                         data->start_class->flags |= ANYOF_LOCALE;
1462                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463                     }
1464                     break;
1465                 case NSPACE:
1466                     if (flags & SCF_DO_STCLASS_AND) {
1467                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1469                             for (value = 0; value < 256; value++)
1470                                 if (isSPACE(value))
1471                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1472                         }
1473                     }
1474                     else {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1477                         else {
1478                             for (value = 0; value < 256; value++)
1479                                 if (!isSPACE(value))
1480                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1481                         }
1482                     }
1483                     break;
1484                 case NSPACEL:
1485                     if (flags & SCF_DO_STCLASS_AND) {
1486                         if (data->start_class->flags & ANYOF_LOCALE) {
1487                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1488                             for (value = 0; value < 256; value++)
1489                                 if (!isSPACE(value))
1490                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1491                         }
1492                     }
1493                     else {
1494                         data->start_class->flags |= ANYOF_LOCALE;
1495                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1496                     }
1497                     break;
1498                 case DIGIT:
1499                     if (flags & SCF_DO_STCLASS_AND) {
1500                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1501                         for (value = 0; value < 256; value++)
1502                             if (!isDIGIT(value))
1503                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1504                     }
1505                     else {
1506                         if (data->start_class->flags & ANYOF_LOCALE)
1507                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1508                         else {
1509                             for (value = 0; value < 256; value++)
1510                                 if (isDIGIT(value))
1511                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1512                         }
1513                     }
1514                     break;
1515                 case NDIGIT:
1516                     if (flags & SCF_DO_STCLASS_AND) {
1517                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1518                         for (value = 0; value < 256; value++)
1519                             if (isDIGIT(value))
1520                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521                     }
1522                     else {
1523                         if (data->start_class->flags & ANYOF_LOCALE)
1524                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1525                         else {
1526                             for (value = 0; value < 256; value++)
1527                                 if (!isDIGIT(value))
1528                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1529                         }
1530                     }
1531                     break;
1532                 }
1533                 if (flags & SCF_DO_STCLASS_OR)
1534                     cl_and(data->start_class, &and_with);
1535                 flags &= ~SCF_DO_STCLASS;
1536             }
1537         }
1538         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1539             data->flags |= (OP(scan) == MEOL
1540                             ? SF_BEFORE_MEOL
1541                             : SF_BEFORE_SEOL);
1542         }
1543         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1544                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1545                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1546                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1547             /* Lookahead/lookbehind */
1548             I32 deltanext, minnext, fake = 0;
1549             regnode *nscan;
1550             struct regnode_charclass_class intrnl;
1551             int f = 0;
1552
1553             data_fake.flags = 0;
1554             if (data) {         
1555                 data_fake.whilem_c = data->whilem_c;
1556                 data_fake.last_closep = data->last_closep;
1557             }
1558             else
1559                 data_fake.last_closep = &fake;
1560             if ( flags & SCF_DO_STCLASS && !scan->flags
1561                  && OP(scan) == IFMATCH ) { /* Lookahead */
1562                 cl_init(pRExC_state, &intrnl);
1563                 data_fake.start_class = &intrnl;
1564                 f |= SCF_DO_STCLASS_AND;
1565             }
1566             if (flags & SCF_WHILEM_VISITED_POS)
1567                 f |= SCF_WHILEM_VISITED_POS;
1568             next = regnext(scan);
1569             nscan = NEXTOPER(NEXTOPER(scan));
1570             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1571             if (scan->flags) {
1572                 if (deltanext) {
1573                     vFAIL("Variable length lookbehind not implemented");
1574                 }
1575                 else if (minnext > U8_MAX) {
1576                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1577                 }
1578                 scan->flags = (U8)minnext;
1579             }
1580             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1581                 pars++;
1582             if (data && (data_fake.flags & SF_HAS_EVAL))
1583                 data->flags |= SF_HAS_EVAL;
1584             if (data)
1585                 data->whilem_c = data_fake.whilem_c;
1586             if (f & SCF_DO_STCLASS_AND) {
1587                 int was = (data->start_class->flags & ANYOF_EOS);
1588
1589                 cl_and(data->start_class, &intrnl);
1590                 if (was)
1591                     data->start_class->flags |= ANYOF_EOS;
1592             }
1593         }
1594         else if (OP(scan) == OPEN) {
1595             pars++;
1596         }
1597         else if (OP(scan) == CLOSE) {
1598             if ((I32)ARG(scan) == is_par) {
1599                 next = regnext(scan);
1600
1601                 if ( next && (OP(next) != WHILEM) && next < last)
1602                     is_par = 0;         /* Disable optimization */
1603             }
1604             if (data)
1605                 *(data->last_closep) = ARG(scan);
1606         }
1607         else if (OP(scan) == EVAL) {
1608                 if (data)
1609                     data->flags |= SF_HAS_EVAL;
1610         }
1611         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1612                 if (flags & SCF_DO_SUBSTR) {
1613                     scan_commit(pRExC_state,data);
1614                     data->longest = &(data->longest_float);
1615                 }
1616                 is_inf = is_inf_internal = 1;
1617                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1618                     cl_anything(pRExC_state, data->start_class);
1619                 flags &= ~SCF_DO_STCLASS;
1620         }
1621         /* Else: zero-length, ignore. */
1622         scan = regnext(scan);
1623     }
1624
1625   finish:
1626     *scanp = scan;
1627     *deltap = is_inf_internal ? I32_MAX : delta;
1628     if (flags & SCF_DO_SUBSTR && is_inf)
1629         data->pos_delta = I32_MAX - data->pos_min;
1630     if (is_par > U8_MAX)
1631         is_par = 0;
1632     if (is_par && pars==1 && data) {
1633         data->flags |= SF_IN_PAR;
1634         data->flags &= ~SF_HAS_PAR;
1635     }
1636     else if (pars && data) {
1637         data->flags |= SF_HAS_PAR;
1638         data->flags &= ~SF_IN_PAR;
1639     }
1640     if (flags & SCF_DO_STCLASS_OR)
1641         cl_and(data->start_class, &and_with);
1642     return min;
1643 }
1644
1645 STATIC I32
1646 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1647 {
1648     if (RExC_rx->data) {
1649         Renewc(RExC_rx->data,
1650                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1651                char, struct reg_data);
1652         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1653         RExC_rx->data->count += n;
1654     }
1655     else {
1656         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1657              char, struct reg_data);
1658         New(1208, RExC_rx->data->what, n, U8);
1659         RExC_rx->data->count = n;
1660     }
1661     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1662     return RExC_rx->data->count - n;
1663 }
1664
1665 void
1666 Perl_reginitcolors(pTHX)
1667 {
1668     int i = 0;
1669     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1670         
1671     if (s) {
1672         PL_colors[0] = s = savepv(s);
1673         while (++i < 6) {
1674             s = strchr(s, '\t');
1675             if (s) {
1676                 *s = '\0';
1677                 PL_colors[i] = ++s;
1678             }
1679             else
1680                 PL_colors[i] = s = "";
1681         }
1682     } else {
1683         while (i < 6)
1684             PL_colors[i++] = "";
1685     }
1686     PL_colorset = 1;
1687 }
1688
1689
1690 /*
1691  - pregcomp - compile a regular expression into internal code
1692  *
1693  * We can't allocate space until we know how big the compiled form will be,
1694  * but we can't compile it (and thus know how big it is) until we've got a
1695  * place to put the code.  So we cheat:  we compile it twice, once with code
1696  * generation turned off and size counting turned on, and once "for real".
1697  * This also means that we don't allocate space until we are sure that the
1698  * thing really will compile successfully, and we never have to move the
1699  * code and thus invalidate pointers into it.  (Note that it has to be in
1700  * one piece because free() must be able to free it all.) [NB: not true in perl]
1701  *
1702  * Beware that the optimization-preparation code in here knows about some
1703  * of the structure of the compiled regexp.  [I'll say.]
1704  */
1705 regexp *
1706 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1707 {
1708     register regexp *r;
1709     regnode *scan;
1710     regnode *first;
1711     I32 flags;
1712     I32 minlen = 0;
1713     I32 sawplus = 0;
1714     I32 sawopen = 0;
1715     scan_data_t data;
1716     RExC_state_t RExC_state;
1717     RExC_state_t *pRExC_state = &RExC_state;
1718
1719     if (exp == NULL)
1720         FAIL("NULL regexp argument");
1721
1722     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1723
1724     RExC_precomp = exp;
1725     DEBUG_r({
1726          if (!PL_colorset) reginitcolors();
1727          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1728                        PL_colors[4],PL_colors[5],PL_colors[0],
1729                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1730     });
1731     RExC_flags = pm->op_pmflags;
1732     RExC_sawback = 0;
1733
1734     RExC_seen = 0;
1735     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1736     RExC_seen_evals = 0;
1737     RExC_extralen = 0;
1738
1739     /* First pass: determine size, legality. */
1740     RExC_parse = exp;
1741     RExC_start = exp;
1742     RExC_end = xend;
1743     RExC_naughty = 0;
1744     RExC_npar = 1;
1745     RExC_size = 0L;
1746     RExC_emit = &PL_regdummy;
1747     RExC_whilem_seen = 0;
1748 #if 0 /* REGC() is (currently) a NOP at the first pass.
1749        * Clever compilers notice this and complain. --jhi */
1750     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1751 #endif
1752     if (reg(pRExC_state, 0, &flags) == NULL) {
1753         RExC_precomp = Nullch;
1754         return(NULL);
1755     }
1756     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1757
1758     /* Small enough for pointer-storage convention?
1759        If extralen==0, this means that we will not need long jumps. */
1760     if (RExC_size >= 0x10000L && RExC_extralen)
1761         RExC_size += RExC_extralen;
1762     else
1763         RExC_extralen = 0;
1764     if (RExC_whilem_seen > 15)
1765         RExC_whilem_seen = 15;
1766
1767     /* Allocate space and initialize. */
1768     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1769          char, regexp);
1770     if (r == NULL)
1771         FAIL("Regexp out of space");
1772
1773 #ifdef DEBUGGING
1774     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1775     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1776 #endif
1777     r->refcnt = 1;
1778     r->prelen = xend - exp;
1779     r->precomp = savepvn(RExC_precomp, r->prelen);
1780     r->subbeg = NULL;
1781     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1782     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1783
1784     r->substrs = 0;                     /* Useful during FAIL. */
1785     r->startp = 0;                      /* Useful during FAIL. */
1786     r->endp = 0;                        /* Useful during FAIL. */
1787
1788     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1789     if (r->offsets) {
1790       r->offsets[0] = RExC_size; 
1791     }
1792     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1793                           "%s %"UVuf" bytes for offset annotations.\n", 
1794                           r->offsets ? "Got" : "Couldn't get", 
1795                           (UV)((2*RExC_size+1) * sizeof(U32))));
1796
1797     RExC_rx = r;
1798
1799     /* Second pass: emit code. */
1800     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1801     RExC_parse = exp;
1802     RExC_end = xend;
1803     RExC_naughty = 0;
1804     RExC_npar = 1;
1805     RExC_emit_start = r->program;
1806     RExC_emit = r->program;
1807     /* Store the count of eval-groups for security checks: */
1808     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1809     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1810     r->data = 0;
1811     if (reg(pRExC_state, 0, &flags) == NULL)
1812         return(NULL);
1813
1814     /* Dig out information for optimizations. */
1815     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1816     pm->op_pmflags = RExC_flags;
1817     if (UTF)
1818         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1819     r->regstclass = NULL;
1820     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1821         r->reganch |= ROPT_NAUGHTY;
1822     scan = r->program + 1;              /* First BRANCH. */
1823
1824     /* XXXX To minimize changes to RE engine we always allocate
1825        3-units-long substrs field. */
1826     Newz(1004, r->substrs, 1, struct reg_substr_data);
1827
1828     StructCopy(&zero_scan_data, &data, scan_data_t);
1829     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1830     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1831         I32 fake;
1832         STRLEN longest_float_length, longest_fixed_length;
1833         struct regnode_charclass_class ch_class;
1834         int stclass_flag;
1835         I32 last_close = 0;
1836
1837         first = scan;
1838         /* Skip introductions and multiplicators >= 1. */
1839         while ((OP(first) == OPEN && (sawopen = 1)) ||
1840                /* An OR of *one* alternative - should not happen now. */
1841             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1842             (OP(first) == PLUS) ||
1843             (OP(first) == MINMOD) ||
1844                /* An {n,m} with n>0 */
1845             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1846                 if (OP(first) == PLUS)
1847                     sawplus = 1;
1848                 else
1849                     first += regarglen[(U8)OP(first)];
1850                 first = NEXTOPER(first);
1851         }
1852
1853         /* Starting-point info. */
1854       again:
1855         if (PL_regkind[(U8)OP(first)] == EXACT) {
1856             if (OP(first) == EXACT)
1857                 ;       /* Empty, get anchored substr later. */
1858             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1859                 r->regstclass = first;
1860         }
1861         else if (strchr((char*)PL_simple,OP(first)))
1862             r->regstclass = first;
1863         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1864                  PL_regkind[(U8)OP(first)] == NBOUND)
1865             r->regstclass = first;
1866         else if (PL_regkind[(U8)OP(first)] == BOL) {
1867             r->reganch |= (OP(first) == MBOL
1868                            ? ROPT_ANCH_MBOL
1869                            : (OP(first) == SBOL
1870                               ? ROPT_ANCH_SBOL
1871                               : ROPT_ANCH_BOL));
1872             first = NEXTOPER(first);
1873             goto again;
1874         }
1875         else if (OP(first) == GPOS) {
1876             r->reganch |= ROPT_ANCH_GPOS;
1877             first = NEXTOPER(first);
1878             goto again;
1879         }
1880         else if (!sawopen && (OP(first) == STAR &&
1881             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1882             !(r->reganch & ROPT_ANCH) )
1883         {
1884             /* turn .* into ^.* with an implied $*=1 */
1885             int type = OP(NEXTOPER(first));
1886
1887             if (type == REG_ANY)
1888                 type = ROPT_ANCH_MBOL;
1889             else
1890                 type = ROPT_ANCH_SBOL;
1891
1892             r->reganch |= type | ROPT_IMPLICIT;
1893             first = NEXTOPER(first);
1894             goto again;
1895         }
1896         if (sawplus && (!sawopen || !RExC_sawback)
1897             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1898             /* x+ must match at the 1st pos of run of x's */
1899             r->reganch |= ROPT_SKIP;
1900
1901         /* Scan is after the zeroth branch, first is atomic matcher. */
1902         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1903                               (IV)(first - scan + 1)));
1904         /*
1905         * If there's something expensive in the r.e., find the
1906         * longest literal string that must appear and make it the
1907         * regmust.  Resolve ties in favor of later strings, since
1908         * the regstart check works with the beginning of the r.e.
1909         * and avoiding duplication strengthens checking.  Not a
1910         * strong reason, but sufficient in the absence of others.
1911         * [Now we resolve ties in favor of the earlier string if
1912         * it happens that c_offset_min has been invalidated, since the
1913         * earlier string may buy us something the later one won't.]
1914         */
1915         minlen = 0;
1916
1917         data.longest_fixed = newSVpvn("",0);
1918         data.longest_float = newSVpvn("",0);
1919         data.last_found = newSVpvn("",0);
1920         data.longest = &(data.longest_fixed);
1921         first = scan;
1922         if (!r->regstclass) {
1923             cl_init(pRExC_state, &ch_class);
1924             data.start_class = &ch_class;
1925             stclass_flag = SCF_DO_STCLASS_AND;
1926         } else                          /* XXXX Check for BOUND? */
1927             stclass_flag = 0;
1928         data.last_closep = &last_close;
1929
1930         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1931                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1932         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1933              && data.last_start_min == 0 && data.last_end > 0
1934              && !RExC_seen_zerolen
1935              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1936             r->reganch |= ROPT_CHECK_ALL;
1937         scan_commit(pRExC_state, &data);
1938         SvREFCNT_dec(data.last_found);
1939
1940         longest_float_length = CHR_SVLEN(data.longest_float);
1941         if (longest_float_length
1942             || (data.flags & SF_FL_BEFORE_EOL
1943                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1944                     || (RExC_flags & PMf_MULTILINE)))) {
1945             int t;
1946
1947             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1948                 && data.offset_fixed == data.offset_float_min
1949                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1950                     goto remove_float;          /* As in (a)+. */
1951
1952             if (SvUTF8(data.longest_float)) {
1953                 r->float_utf8 = data.longest_float;
1954                 r->float_substr = Nullsv;
1955             } else {
1956                 r->float_substr = data.longest_float;
1957                 r->float_utf8 = Nullsv;
1958             }
1959             r->float_min_offset = data.offset_float_min;
1960             r->float_max_offset = data.offset_float_max;
1961             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1962                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1963                            || (RExC_flags & PMf_MULTILINE)));
1964             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1965         }
1966         else {
1967           remove_float:
1968             r->float_substr = r->float_utf8 = Nullsv;
1969             SvREFCNT_dec(data.longest_float);
1970             longest_float_length = 0;
1971         }
1972
1973         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1974         if (longest_fixed_length
1975             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1976                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1977                     || (RExC_flags & PMf_MULTILINE)))) {
1978             int t;
1979
1980             if (SvUTF8(data.longest_fixed)) {
1981                 r->anchored_utf8 = data.longest_fixed;
1982                 r->anchored_substr = Nullsv;
1983             } else {
1984                 r->anchored_substr = data.longest_fixed;
1985                 r->anchored_utf8 = Nullsv;
1986             }
1987             r->anchored_offset = data.offset_fixed;
1988             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1989                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1990                      || (RExC_flags & PMf_MULTILINE)));
1991             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
1992         }
1993         else {
1994             r->anchored_substr = r->anchored_utf8 = Nullsv;
1995             SvREFCNT_dec(data.longest_fixed);
1996             longest_fixed_length = 0;
1997         }
1998         if (r->regstclass
1999             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2000             r->regstclass = NULL;
2001         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2002             && stclass_flag
2003             && !(data.start_class->flags & ANYOF_EOS)
2004             && !cl_is_anything(data.start_class))
2005         {
2006             I32 n = add_data(pRExC_state, 1, "f");
2007
2008             New(1006, RExC_rx->data->data[n], 1,
2009                 struct regnode_charclass_class);
2010             StructCopy(data.start_class,
2011                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2012                        struct regnode_charclass_class);
2013             r->regstclass = (regnode*)RExC_rx->data->data[n];
2014             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2015             PL_regdata = r->data; /* for regprop() */
2016             DEBUG_r({ SV *sv = sv_newmortal();
2017                       regprop(sv, (regnode*)data.start_class);
2018                       PerlIO_printf(Perl_debug_log,
2019                                     "synthetic stclass `%s'.\n",
2020                                     SvPVX(sv));});
2021         }
2022
2023         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2024         if (longest_fixed_length > longest_float_length) {
2025             r->check_substr = r->anchored_substr;
2026             r->check_utf8 = r->anchored_utf8;
2027             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028             if (r->reganch & ROPT_ANCH_SINGLE)
2029                 r->reganch |= ROPT_NOSCAN;
2030         }
2031         else {
2032             r->check_substr = r->float_substr;
2033             r->check_utf8 = r->float_utf8;
2034             r->check_offset_min = data.offset_float_min;
2035             r->check_offset_max = data.offset_float_max;
2036         }
2037         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2038            This should be changed ASAP!  */
2039         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2040             r->reganch |= RE_USE_INTUIT;
2041             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2042                 r->reganch |= RE_INTUIT_TAIL;
2043         }
2044     }
2045     else {
2046         /* Several toplevels. Best we can is to set minlen. */
2047         I32 fake;
2048         struct regnode_charclass_class ch_class;
2049         I32 last_close = 0;
2050         
2051         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2052         scan = r->program + 1;
2053         cl_init(pRExC_state, &ch_class);
2054         data.start_class = &ch_class;
2055         data.last_closep = &last_close;
2056         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2057         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2058                 = r->float_substr = r->float_utf8 = Nullsv;
2059         if (!(data.start_class->flags & ANYOF_EOS)
2060             && !cl_is_anything(data.start_class))
2061         {
2062             I32 n = add_data(pRExC_state, 1, "f");
2063
2064             New(1006, RExC_rx->data->data[n], 1,
2065                 struct regnode_charclass_class);
2066             StructCopy(data.start_class,
2067                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2068                        struct regnode_charclass_class);
2069             r->regstclass = (regnode*)RExC_rx->data->data[n];
2070             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2071             DEBUG_r({ SV* sv = sv_newmortal();
2072                       regprop(sv, (regnode*)data.start_class);
2073                       PerlIO_printf(Perl_debug_log,
2074                                     "synthetic stclass `%s'.\n",
2075                                     SvPVX(sv));});
2076         }
2077     }
2078
2079     r->minlen = minlen;
2080     if (RExC_seen & REG_SEEN_GPOS)
2081         r->reganch |= ROPT_GPOS_SEEN;
2082     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2083         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2084     if (RExC_seen & REG_SEEN_EVAL)
2085         r->reganch |= ROPT_EVAL_SEEN;
2086     if (RExC_seen & REG_SEEN_CANY)
2087         r->reganch |= ROPT_CANY_SEEN;
2088     Newz(1002, r->startp, RExC_npar, I32);
2089     Newz(1002, r->endp, RExC_npar, I32);
2090     PL_regdata = r->data; /* for regprop() */
2091     DEBUG_r(regdump(r));
2092     return(r);
2093 }
2094
2095 /*
2096  - reg - regular expression, i.e. main body or parenthesized thing
2097  *
2098  * Caller must absorb opening parenthesis.
2099  *
2100  * Combining parenthesis handling with the base level of regular expression
2101  * is a trifle forced, but the need to tie the tails of the branches to what
2102  * follows makes it hard to avoid.
2103  */
2104 STATIC regnode *
2105 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2106     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2107 {
2108     register regnode *ret;              /* Will be the head of the group. */
2109     register regnode *br;
2110     register regnode *lastbr;
2111     register regnode *ender = 0;
2112     register I32 parno = 0;
2113     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2114
2115     /* for (?g), (?gc), and (?o) warnings; warning
2116        about (?c) will warn about (?g) -- japhy    */
2117
2118     I32 wastedflags = 0x00,
2119         wasted_o    = 0x01,
2120         wasted_g    = 0x02,
2121         wasted_gc   = 0x02 | 0x04,
2122         wasted_c    = 0x04;
2123
2124     char * parse_start = RExC_parse; /* MJD */
2125     char *oregcomp_parse = RExC_parse;
2126     char c;
2127
2128     *flagp = 0;                         /* Tentatively. */
2129
2130
2131     /* Make an OPEN node, if parenthesized. */
2132     if (paren) {
2133         if (*RExC_parse == '?') { /* (?...) */
2134             U32 posflags = 0, negflags = 0;
2135             U32 *flagsp = &posflags;
2136             int logical = 0;
2137             char *seqstart = RExC_parse;
2138
2139             RExC_parse++;
2140             paren = *RExC_parse++;
2141             ret = NULL;                 /* For look-ahead/behind. */
2142             switch (paren) {
2143             case '<':           /* (?<...) */
2144                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2145                 if (*RExC_parse == '!')
2146                     paren = ',';
2147                 if (*RExC_parse != '=' && *RExC_parse != '!')
2148                     goto unknown;
2149                 RExC_parse++;
2150             case '=':           /* (?=...) */
2151             case '!':           /* (?!...) */
2152                 RExC_seen_zerolen++;
2153             case ':':           /* (?:...) */
2154             case '>':           /* (?>...) */
2155                 break;
2156             case '$':           /* (?$...) */
2157             case '@':           /* (?@...) */
2158                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2159                 break;
2160             case '#':           /* (?#...) */
2161                 while (*RExC_parse && *RExC_parse != ')')
2162                     RExC_parse++;
2163                 if (*RExC_parse != ')')
2164                     FAIL("Sequence (?#... not terminated");
2165                 nextchar(pRExC_state);
2166                 *flagp = TRYAGAIN;
2167                 return NULL;
2168             case 'p':           /* (?p...) */
2169                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2170                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2171                 /* FALL THROUGH*/
2172             case '?':           /* (??...) */
2173                 logical = 1;
2174                 if (*RExC_parse != '{')
2175                     goto unknown;
2176                 paren = *RExC_parse++;
2177                 /* FALL THROUGH */
2178             case '{':           /* (?{...}) */
2179             {
2180                 I32 count = 1, n = 0;
2181                 char c;
2182                 char *s = RExC_parse;
2183                 SV *sv;
2184                 OP_4tree *sop, *rop;
2185
2186                 RExC_seen_zerolen++;
2187                 RExC_seen |= REG_SEEN_EVAL;
2188                 while (count && (c = *RExC_parse)) {
2189                     if (c == '\\' && RExC_parse[1])
2190                         RExC_parse++;
2191                     else if (c == '{')
2192                         count++;
2193                     else if (c == '}')
2194                         count--;
2195                     RExC_parse++;
2196                 }
2197                 if (*RExC_parse != ')')
2198                 {
2199                     RExC_parse = s;             
2200                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2201                 }
2202                 if (!SIZE_ONLY) {
2203                     PAD *pad;
2204                 
2205                     if (RExC_parse - 1 - s)
2206                         sv = newSVpvn(s, RExC_parse - 1 - s);
2207                     else
2208                         sv = newSVpvn("", 0);
2209
2210                     ENTER;
2211                     Perl_save_re_context(aTHX);
2212                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2213                     sop->op_private |= OPpREFCOUNTED;
2214                     /* re_dup will OpREFCNT_inc */
2215                     OpREFCNT_set(sop, 1);
2216                     LEAVE;
2217
2218                     n = add_data(pRExC_state, 3, "nop");
2219                     RExC_rx->data->data[n] = (void*)rop;
2220                     RExC_rx->data->data[n+1] = (void*)sop;
2221                     RExC_rx->data->data[n+2] = (void*)pad;
2222                     SvREFCNT_dec(sv);
2223                 }
2224                 else {                                          /* First pass */
2225                     if (PL_reginterp_cnt < ++RExC_seen_evals
2226                         && PL_curcop != &PL_compiling)
2227                         /* No compiled RE interpolated, has runtime
2228                            components ===> unsafe.  */
2229                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2230                     if (PL_tainting && PL_tainted)
2231                         FAIL("Eval-group in insecure regular expression");
2232                 }
2233                 
2234                 nextchar(pRExC_state);
2235                 if (logical) {
2236                     ret = reg_node(pRExC_state, LOGICAL);
2237                     if (!SIZE_ONLY)
2238                         ret->flags = 2;
2239                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2240                     /* deal with the length of this later - MJD */
2241                     return ret;
2242                 }
2243                 ret = reganode(pRExC_state, EVAL, n);
2244                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2245                 Set_Node_Offset(ret, parse_start);
2246                 return ret;
2247             }
2248             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2249             {
2250                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2251                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2252                         || RExC_parse[1] == '<'
2253                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2254                         I32 flag;
2255                         
2256                         ret = reg_node(pRExC_state, LOGICAL);
2257                         if (!SIZE_ONLY)
2258                             ret->flags = 1;
2259                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2260                         goto insert_if;
2261                     }
2262                 }
2263                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2264                     /* (?(1)...) */
2265                     parno = atoi(RExC_parse++);
2266
2267                     while (isDIGIT(*RExC_parse))
2268                         RExC_parse++;
2269                     ret = reganode(pRExC_state, GROUPP, parno);
2270                     
2271                     if ((c = *nextchar(pRExC_state)) != ')')
2272                         vFAIL("Switch condition not recognized");
2273                   insert_if:
2274                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2275                     br = regbranch(pRExC_state, &flags, 1);
2276                     if (br == NULL)
2277                         br = reganode(pRExC_state, LONGJMP, 0);
2278                     else
2279                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2280                     c = *nextchar(pRExC_state);
2281                     if (flags&HASWIDTH)
2282                         *flagp |= HASWIDTH;
2283                     if (c == '|') {
2284                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2285                         regbranch(pRExC_state, &flags, 1);
2286                         regtail(pRExC_state, ret, lastbr);
2287                         if (flags&HASWIDTH)
2288                             *flagp |= HASWIDTH;
2289                         c = *nextchar(pRExC_state);
2290                     }
2291                     else
2292                         lastbr = NULL;
2293                     if (c != ')')
2294                         vFAIL("Switch (?(condition)... contains too many branches");
2295                     ender = reg_node(pRExC_state, TAIL);
2296                     regtail(pRExC_state, br, ender);
2297                     if (lastbr) {
2298                         regtail(pRExC_state, lastbr, ender);
2299                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2300                     }
2301                     else
2302                         regtail(pRExC_state, ret, ender);
2303                     return ret;
2304                 }
2305                 else {
2306                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2307                 }
2308             }
2309             case 0:
2310                 RExC_parse--; /* for vFAIL to print correctly */
2311                 vFAIL("Sequence (? incomplete");
2312                 break;
2313             default:
2314                 --RExC_parse;
2315               parse_flags:      /* (?i) */
2316                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2317                     /* (?g), (?gc) and (?o) are useless here
2318                        and must be globally applied -- japhy */
2319
2320                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2321                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2322                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2323                             if (! (wastedflags & wflagbit) ) {
2324                                 wastedflags |= wflagbit;
2325                                 vWARN5(
2326                                     RExC_parse + 1,
2327                                     "Useless (%s%c) - %suse /%c modifier",
2328                                     flagsp == &negflags ? "?-" : "?",
2329                                     *RExC_parse,
2330                                     flagsp == &negflags ? "don't " : "",
2331                                     *RExC_parse
2332                                 );
2333                             }
2334                         }
2335                     }
2336                     else if (*RExC_parse == 'c') {
2337                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2338                             if (! (wastedflags & wasted_c) ) {
2339                                 wastedflags |= wasted_gc;
2340                                 vWARN3(
2341                                     RExC_parse + 1,
2342                                     "Useless (%sc) - %suse /gc modifier",
2343                                     flagsp == &negflags ? "?-" : "?",
2344                                     flagsp == &negflags ? "don't " : ""
2345                                 );
2346                             }
2347                         }
2348                     }
2349                     else { pmflag(flagsp, *RExC_parse); }
2350
2351                     ++RExC_parse;
2352                 }
2353                 if (*RExC_parse == '-') {
2354                     flagsp = &negflags;
2355                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2356                     ++RExC_parse;
2357                     goto parse_flags;
2358                 }
2359                 RExC_flags |= posflags;
2360                 RExC_flags &= ~negflags;
2361                 if (*RExC_parse == ':') {
2362                     RExC_parse++;
2363                     paren = ':';
2364                     break;
2365                 }               
2366               unknown:
2367                 if (*RExC_parse != ')') {
2368                     RExC_parse++;
2369                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2370                 }
2371                 nextchar(pRExC_state);
2372                 *flagp = TRYAGAIN;
2373                 return NULL;
2374             }
2375         }
2376         else {                  /* (...) */
2377             parno = RExC_npar;
2378             RExC_npar++;
2379             ret = reganode(pRExC_state, OPEN, parno);
2380             Set_Node_Length(ret, 1); /* MJD */
2381             Set_Node_Offset(ret, RExC_parse); /* MJD */
2382             open = 1;
2383         }
2384     }
2385     else                        /* ! paren */
2386         ret = NULL;
2387
2388     /* Pick up the branches, linking them together. */
2389     parse_start = RExC_parse;   /* MJD */
2390     br = regbranch(pRExC_state, &flags, 1);
2391     /*     branch_len = (paren != 0); */
2392     
2393     if (br == NULL)
2394         return(NULL);
2395     if (*RExC_parse == '|') {
2396         if (!SIZE_ONLY && RExC_extralen) {
2397             reginsert(pRExC_state, BRANCHJ, br);
2398         }
2399         else {                  /* MJD */
2400             reginsert(pRExC_state, BRANCH, br);
2401             Set_Node_Length(br, paren != 0);
2402             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2403         }
2404         have_branch = 1;
2405         if (SIZE_ONLY)
2406             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2407     }
2408     else if (paren == ':') {
2409         *flagp |= flags&SIMPLE;
2410     }
2411     if (open) {                         /* Starts with OPEN. */
2412         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2413     }
2414     else if (paren != '?')              /* Not Conditional */
2415         ret = br;
2416     *flagp |= flags & (SPSTART | HASWIDTH);
2417     lastbr = br;
2418     while (*RExC_parse == '|') {
2419         if (!SIZE_ONLY && RExC_extralen) {
2420             ender = reganode(pRExC_state, LONGJMP,0);
2421             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2422         }
2423         if (SIZE_ONLY)
2424             RExC_extralen += 2;         /* Account for LONGJMP. */
2425         nextchar(pRExC_state);
2426         br = regbranch(pRExC_state, &flags, 0);
2427         
2428         if (br == NULL)
2429             return(NULL);
2430         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2431         lastbr = br;
2432         if (flags&HASWIDTH)
2433             *flagp |= HASWIDTH;
2434         *flagp |= flags&SPSTART;
2435     }
2436
2437     if (have_branch || paren != ':') {
2438         /* Make a closing node, and hook it on the end. */
2439         switch (paren) {
2440         case ':':
2441             ender = reg_node(pRExC_state, TAIL);
2442             break;
2443         case 1:
2444             ender = reganode(pRExC_state, CLOSE, parno);
2445             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2446             Set_Node_Length(ender,1); /* MJD */
2447             break;
2448         case '<':
2449         case ',':
2450         case '=':
2451         case '!':
2452             *flagp &= ~HASWIDTH;
2453             /* FALL THROUGH */
2454         case '>':
2455             ender = reg_node(pRExC_state, SUCCEED);
2456             break;
2457         case 0:
2458             ender = reg_node(pRExC_state, END);
2459             break;
2460         }
2461         regtail(pRExC_state, lastbr, ender);
2462
2463         if (have_branch) {
2464             /* Hook the tails of the branches to the closing node. */
2465             for (br = ret; br != NULL; br = regnext(br)) {
2466                 regoptail(pRExC_state, br, ender);
2467             }
2468         }
2469     }
2470
2471     {
2472         char *p;
2473         static char parens[] = "=!<,>";
2474
2475         if (paren && (p = strchr(parens, paren))) {
2476             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2477             int flag = (p - parens) > 1;
2478
2479             if (paren == '>')
2480                 node = SUSPEND, flag = 0;
2481             reginsert(pRExC_state, node,ret);
2482             Set_Node_Offset(ret, oregcomp_parse);
2483             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2484             ret->flags = flag;
2485             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2486         }
2487     }
2488
2489     /* Check for proper termination. */
2490     if (paren) {
2491         RExC_flags = oregflags;
2492         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2493             RExC_parse = oregcomp_parse;
2494             vFAIL("Unmatched (");
2495         }
2496     }
2497     else if (!paren && RExC_parse < RExC_end) {
2498         if (*RExC_parse == ')') {
2499             RExC_parse++;
2500             vFAIL("Unmatched )");
2501         }
2502         else
2503             FAIL("Junk on end of regexp");      /* "Can't happen". */
2504         /* NOTREACHED */
2505     }
2506
2507     return(ret);
2508 }
2509
2510 /*
2511  - regbranch - one alternative of an | operator
2512  *
2513  * Implements the concatenation operator.
2514  */
2515 STATIC regnode *
2516 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2517 {
2518     register regnode *ret;
2519     register regnode *chain = NULL;
2520     register regnode *latest;
2521     I32 flags = 0, c = 0;
2522
2523     if (first)
2524         ret = NULL;
2525     else {
2526         if (!SIZE_ONLY && RExC_extralen)
2527             ret = reganode(pRExC_state, BRANCHJ,0);
2528         else {
2529             ret = reg_node(pRExC_state, BRANCH);
2530             Set_Node_Length(ret, 1);
2531         }
2532     }
2533         
2534     if (!first && SIZE_ONLY)
2535         RExC_extralen += 1;                     /* BRANCHJ */
2536
2537     *flagp = WORST;                     /* Tentatively. */
2538
2539     RExC_parse--;
2540     nextchar(pRExC_state);
2541     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2542         flags &= ~TRYAGAIN;
2543         latest = regpiece(pRExC_state, &flags);
2544         if (latest == NULL) {
2545             if (flags & TRYAGAIN)
2546                 continue;
2547             return(NULL);
2548         }
2549         else if (ret == NULL)
2550             ret = latest;
2551         *flagp |= flags&HASWIDTH;
2552         if (chain == NULL)      /* First piece. */
2553             *flagp |= flags&SPSTART;
2554         else {
2555             RExC_naughty++;
2556             regtail(pRExC_state, chain, latest);
2557         }
2558         chain = latest;
2559         c++;
2560     }
2561     if (chain == NULL) {        /* Loop ran zero times. */
2562         chain = reg_node(pRExC_state, NOTHING);
2563         if (ret == NULL)
2564             ret = chain;
2565     }
2566     if (c == 1) {
2567         *flagp |= flags&SIMPLE;
2568     }
2569
2570     return(ret);
2571 }
2572
2573 /*
2574  - regpiece - something followed by possible [*+?]
2575  *
2576  * Note that the branching code sequences used for ? and the general cases
2577  * of * and + are somewhat optimized:  they use the same NOTHING node as
2578  * both the endmarker for their branch list and the body of the last branch.
2579  * It might seem that this node could be dispensed with entirely, but the
2580  * endmarker role is not redundant.
2581  */
2582 STATIC regnode *
2583 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2584 {
2585     register regnode *ret;
2586     register char op;
2587     register char *next;
2588     I32 flags;
2589     char *origparse = RExC_parse;
2590     char *maxpos;
2591     I32 min;
2592     I32 max = REG_INFTY;
2593     char *parse_start;
2594
2595     ret = regatom(pRExC_state, &flags);
2596     if (ret == NULL) {
2597         if (flags & TRYAGAIN)
2598             *flagp |= TRYAGAIN;
2599         return(NULL);
2600     }
2601
2602     op = *RExC_parse;
2603
2604     if (op == '{' && regcurly(RExC_parse)) {
2605         parse_start = RExC_parse; /* MJD */
2606         next = RExC_parse + 1;
2607         maxpos = Nullch;
2608         while (isDIGIT(*next) || *next == ',') {
2609             if (*next == ',') {
2610                 if (maxpos)
2611                     break;
2612                 else
2613                     maxpos = next;
2614             }
2615             next++;
2616         }
2617         if (*next == '}') {             /* got one */
2618             if (!maxpos)
2619                 maxpos = next;
2620             RExC_parse++;
2621             min = atoi(RExC_parse);
2622             if (*maxpos == ',')
2623                 maxpos++;
2624             else
2625                 maxpos = RExC_parse;
2626             max = atoi(maxpos);
2627             if (!max && *maxpos != '0')
2628                 max = REG_INFTY;                /* meaning "infinity" */
2629             else if (max >= REG_INFTY)
2630                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2631             RExC_parse = next;
2632             nextchar(pRExC_state);
2633
2634         do_curly:
2635             if ((flags&SIMPLE)) {
2636                 RExC_naughty += 2 + RExC_naughty / 2;
2637                 reginsert(pRExC_state, CURLY, ret);
2638                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2639                 Set_Node_Cur_Length(ret);
2640             }
2641             else {
2642                 regnode *w = reg_node(pRExC_state, WHILEM);
2643
2644                 w->flags = 0;
2645                 regtail(pRExC_state, ret, w);
2646                 if (!SIZE_ONLY && RExC_extralen) {
2647                     reginsert(pRExC_state, LONGJMP,ret);
2648                     reginsert(pRExC_state, NOTHING,ret);
2649                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2650                 }
2651                 reginsert(pRExC_state, CURLYX,ret);
2652                                 /* MJD hk */
2653                 Set_Node_Offset(ret, parse_start+1);
2654                 Set_Node_Length(ret, 
2655                                 op == '{' ? (RExC_parse - parse_start) : 1);
2656                 
2657                 if (!SIZE_ONLY && RExC_extralen)
2658                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2659                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2660                 if (SIZE_ONLY)
2661                     RExC_whilem_seen++, RExC_extralen += 3;
2662                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2663             }
2664             ret->flags = 0;
2665
2666             if (min > 0)
2667                 *flagp = WORST;
2668             if (max > 0)
2669                 *flagp |= HASWIDTH;
2670             if (max && max < min)
2671                 vFAIL("Can't do {n,m} with n > m");
2672             if (!SIZE_ONLY) {
2673                 ARG1_SET(ret, (U16)min);
2674                 ARG2_SET(ret, (U16)max);
2675             }
2676
2677             goto nest_check;
2678         }
2679     }
2680
2681     if (!ISMULT1(op)) {
2682         *flagp = flags;
2683         return(ret);
2684     }
2685
2686 #if 0                           /* Now runtime fix should be reliable. */
2687
2688     /* if this is reinstated, don't forget to put this back into perldiag:
2689
2690             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2691
2692            (F) The part of the regexp subject to either the * or + quantifier
2693            could match an empty string. The {#} shows in the regular
2694            expression about where the problem was discovered.
2695
2696     */
2697
2698     if (!(flags&HASWIDTH) && op != '?')
2699       vFAIL("Regexp *+ operand could be empty");
2700 #endif
2701
2702     parse_start = RExC_parse;
2703     nextchar(pRExC_state);
2704
2705     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2706
2707     if (op == '*' && (flags&SIMPLE)) {
2708         reginsert(pRExC_state, STAR, ret);
2709         ret->flags = 0;
2710         RExC_naughty += 4;
2711     }
2712     else if (op == '*') {
2713         min = 0;
2714         goto do_curly;
2715     }
2716     else if (op == '+' && (flags&SIMPLE)) {
2717         reginsert(pRExC_state, PLUS, ret);
2718         ret->flags = 0;
2719         RExC_naughty += 3;
2720     }
2721     else if (op == '+') {
2722         min = 1;
2723         goto do_curly;
2724     }
2725     else if (op == '?') {
2726         min = 0; max = 1;
2727         goto do_curly;
2728     }
2729   nest_check:
2730     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2731         vWARN3(RExC_parse,
2732                "%.*s matches null string many times",
2733                RExC_parse - origparse,
2734                origparse);
2735     }
2736
2737     if (*RExC_parse == '?') {
2738         nextchar(pRExC_state);
2739         reginsert(pRExC_state, MINMOD, ret);
2740         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2741     }
2742     if (ISMULT2(RExC_parse)) {
2743         RExC_parse++;
2744         vFAIL("Nested quantifiers");
2745     }
2746
2747     return(ret);
2748 }
2749
2750 /*
2751  - regatom - the lowest level
2752  *
2753  * Optimization:  gobbles an entire sequence of ordinary characters so that
2754  * it can turn them into a single node, which is smaller to store and
2755  * faster to run.  Backslashed characters are exceptions, each becoming a
2756  * separate node; the code is simpler that way and it's not worth fixing.
2757  *
2758  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2759 STATIC regnode *
2760 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2761 {
2762     register regnode *ret = 0;
2763     I32 flags;
2764     char *parse_start = 0;
2765
2766     *flagp = WORST;             /* Tentatively. */
2767
2768 tryagain:
2769     switch (*RExC_parse) {
2770     case '^':
2771         RExC_seen_zerolen++;
2772         nextchar(pRExC_state);
2773         if (RExC_flags & PMf_MULTILINE)
2774             ret = reg_node(pRExC_state, MBOL);
2775         else if (RExC_flags & PMf_SINGLELINE)
2776             ret = reg_node(pRExC_state, SBOL);
2777         else
2778             ret = reg_node(pRExC_state, BOL);
2779         Set_Node_Length(ret, 1); /* MJD */
2780         break;
2781     case '$':
2782         nextchar(pRExC_state);
2783         if (*RExC_parse)
2784             RExC_seen_zerolen++;
2785         if (RExC_flags & PMf_MULTILINE)
2786             ret = reg_node(pRExC_state, MEOL);
2787         else if (RExC_flags & PMf_SINGLELINE)
2788             ret = reg_node(pRExC_state, SEOL);
2789         else
2790             ret = reg_node(pRExC_state, EOL);
2791         Set_Node_Length(ret, 1); /* MJD */
2792         break;
2793     case '.':
2794         nextchar(pRExC_state);
2795         if (RExC_flags & PMf_SINGLELINE)
2796             ret = reg_node(pRExC_state, SANY);
2797         else
2798             ret = reg_node(pRExC_state, REG_ANY);
2799         *flagp |= HASWIDTH|SIMPLE;
2800         RExC_naughty++;
2801         Set_Node_Length(ret, 1); /* MJD */
2802         break;
2803     case '[':
2804     {
2805         char *oregcomp_parse = ++RExC_parse;
2806         ret = regclass(pRExC_state);
2807         if (*RExC_parse != ']') {
2808             RExC_parse = oregcomp_parse;
2809             vFAIL("Unmatched [");
2810         }
2811         nextchar(pRExC_state);
2812         *flagp |= HASWIDTH|SIMPLE;
2813         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2814         break;
2815     }
2816     case '(':
2817         nextchar(pRExC_state);
2818         ret = reg(pRExC_state, 1, &flags);
2819         if (ret == NULL) {
2820                 if (flags & TRYAGAIN) {
2821                     if (RExC_parse == RExC_end) {
2822                          /* Make parent create an empty node if needed. */
2823                         *flagp |= TRYAGAIN;
2824                         return(NULL);
2825                     }
2826                     goto tryagain;
2827                 }
2828                 return(NULL);
2829         }
2830         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2831         break;
2832     case '|':
2833     case ')':
2834         if (flags & TRYAGAIN) {
2835             *flagp |= TRYAGAIN;
2836             return NULL;
2837         }
2838         vFAIL("Internal urp");
2839                                 /* Supposed to be caught earlier. */
2840         break;
2841     case '{':
2842         if (!regcurly(RExC_parse)) {
2843             RExC_parse++;
2844             goto defchar;
2845         }
2846         /* FALL THROUGH */
2847     case '?':
2848     case '+':
2849     case '*':
2850         RExC_parse++;
2851         vFAIL("Quantifier follows nothing");
2852         break;
2853     case '\\':
2854         switch (*++RExC_parse) {
2855         case 'A':
2856             RExC_seen_zerolen++;
2857             ret = reg_node(pRExC_state, SBOL);
2858             *flagp |= SIMPLE;
2859             nextchar(pRExC_state);
2860             Set_Node_Length(ret, 2); /* MJD */
2861             break;
2862         case 'G':
2863             ret = reg_node(pRExC_state, GPOS);
2864             RExC_seen |= REG_SEEN_GPOS;
2865             *flagp |= SIMPLE;
2866             nextchar(pRExC_state);
2867             Set_Node_Length(ret, 2); /* MJD */
2868             break;
2869         case 'Z':
2870             ret = reg_node(pRExC_state, SEOL);
2871             *flagp |= SIMPLE;
2872             RExC_seen_zerolen++;                /* Do not optimize RE away */
2873             nextchar(pRExC_state);
2874             break;
2875         case 'z':
2876             ret = reg_node(pRExC_state, EOS);
2877             *flagp |= SIMPLE;
2878             RExC_seen_zerolen++;                /* Do not optimize RE away */
2879             nextchar(pRExC_state);
2880             Set_Node_Length(ret, 2); /* MJD */
2881             break;
2882         case 'C':
2883             ret = reg_node(pRExC_state, CANY);
2884             RExC_seen |= REG_SEEN_CANY;
2885             *flagp |= HASWIDTH|SIMPLE;
2886             nextchar(pRExC_state);
2887             Set_Node_Length(ret, 2); /* MJD */
2888             break;
2889         case 'X':
2890             ret = reg_node(pRExC_state, CLUMP);
2891             *flagp |= HASWIDTH;
2892             nextchar(pRExC_state);
2893             Set_Node_Length(ret, 2); /* MJD */
2894             break;
2895         case 'w':
2896             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2897             *flagp |= HASWIDTH|SIMPLE;
2898             nextchar(pRExC_state);
2899             Set_Node_Length(ret, 2); /* MJD */
2900             break;
2901         case 'W':
2902             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2903             *flagp |= HASWIDTH|SIMPLE;
2904             nextchar(pRExC_state);
2905             Set_Node_Length(ret, 2); /* MJD */
2906             break;
2907         case 'b':
2908             RExC_seen_zerolen++;
2909             RExC_seen |= REG_SEEN_LOOKBEHIND;
2910             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2911             *flagp |= SIMPLE;
2912             nextchar(pRExC_state);
2913             Set_Node_Length(ret, 2); /* MJD */
2914             break;
2915         case 'B':
2916             RExC_seen_zerolen++;
2917             RExC_seen |= REG_SEEN_LOOKBEHIND;
2918             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2919             *flagp |= SIMPLE;
2920             nextchar(pRExC_state);
2921             Set_Node_Length(ret, 2); /* MJD */
2922             break;
2923         case 's':
2924             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2925             *flagp |= HASWIDTH|SIMPLE;
2926             nextchar(pRExC_state);
2927             Set_Node_Length(ret, 2); /* MJD */
2928             break;
2929         case 'S':
2930             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2931             *flagp |= HASWIDTH|SIMPLE;
2932             nextchar(pRExC_state);
2933             Set_Node_Length(ret, 2); /* MJD */
2934             break;
2935         case 'd':
2936             ret = reg_node(pRExC_state, DIGIT);
2937             *flagp |= HASWIDTH|SIMPLE;
2938             nextchar(pRExC_state);
2939             Set_Node_Length(ret, 2); /* MJD */
2940             break;
2941         case 'D':
2942             ret = reg_node(pRExC_state, NDIGIT);
2943             *flagp |= HASWIDTH|SIMPLE;
2944             nextchar(pRExC_state);
2945             Set_Node_Length(ret, 2); /* MJD */
2946             break;
2947         case 'p':
2948         case 'P':
2949             {   
2950                 char* oldregxend = RExC_end;
2951                 char* parse_start = RExC_parse - 2;
2952
2953                 if (RExC_parse[1] == '{') {
2954                   /* a lovely hack--pretend we saw [\pX] instead */
2955                     RExC_end = strchr(RExC_parse, '}');
2956                     if (!RExC_end) {
2957                         U8 c = (U8)*RExC_parse;
2958                         RExC_parse += 2;
2959                         RExC_end = oldregxend;
2960                         vFAIL2("Missing right brace on \\%c{}", c);
2961                     }
2962                     RExC_end++;
2963                 }
2964                 else {
2965                     RExC_end = RExC_parse + 2;
2966                     if (RExC_end > oldregxend)
2967                         RExC_end = oldregxend;
2968                 }
2969                 RExC_parse--;
2970
2971                 ret = regclass(pRExC_state);
2972
2973                 RExC_end = oldregxend;
2974                 RExC_parse--;
2975
2976                 Set_Node_Offset(ret, parse_start + 2);
2977                 Set_Node_Cur_Length(ret);
2978                 nextchar(pRExC_state);
2979                 *flagp |= HASWIDTH|SIMPLE;
2980             }
2981             break;
2982         case 'n':
2983         case 'r':
2984         case 't':
2985         case 'f':
2986         case 'e':
2987         case 'a':
2988         case 'x':
2989         case 'c':
2990         case '0':
2991             goto defchar;
2992         case '1': case '2': case '3': case '4':
2993         case '5': case '6': case '7': case '8': case '9':
2994             {
2995                 I32 num = atoi(RExC_parse);
2996
2997                 if (num > 9 && num >= RExC_npar)
2998                     goto defchar;
2999                 else {
3000                     char * parse_start = RExC_parse - 1; /* MJD */
3001                     while (isDIGIT(*RExC_parse))
3002                         RExC_parse++;
3003
3004                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3005                         vFAIL("Reference to nonexistent group");
3006                     RExC_sawback = 1;
3007                     ret = reganode(pRExC_state,
3008                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3009                                    num);
3010                     *flagp |= HASWIDTH;
3011                     
3012                     /* override incorrect value set in reganode MJD */
3013                     Set_Node_Offset(ret, parse_start+1); 
3014                     Set_Node_Cur_Length(ret); /* MJD */
3015                     RExC_parse--;
3016                     nextchar(pRExC_state);
3017                 }
3018             }
3019             break;
3020         case '\0':
3021             if (RExC_parse >= RExC_end)
3022                 FAIL("Trailing \\");
3023             /* FALL THROUGH */
3024         default:
3025             /* Do not generate `unrecognized' warnings here, we fall
3026                back into the quick-grab loop below */
3027             goto defchar;
3028         }
3029         break;
3030
3031     case '#':
3032         if (RExC_flags & PMf_EXTENDED) {
3033             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3034             if (RExC_parse < RExC_end)
3035                 goto tryagain;
3036         }
3037         /* FALL THROUGH */
3038
3039     default: {
3040             register STRLEN len;
3041             register UV ender;
3042             register char *p;
3043             char *oldp, *s;
3044             STRLEN numlen;
3045             STRLEN foldlen;
3046             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3047
3048             parse_start = RExC_parse - 1;
3049
3050             RExC_parse++;
3051
3052         defchar:
3053             ender = 0;
3054             ret = reg_node(pRExC_state,
3055                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3056             s = STRING(ret);
3057             for (len = 0, p = RExC_parse - 1;
3058               len < 127 && p < RExC_end;
3059               len++)
3060             {
3061                 oldp = p;
3062
3063                 if (RExC_flags & PMf_EXTENDED)
3064                     p = regwhite(p, RExC_end);
3065                 switch (*p) {
3066                 case '^':
3067                 case '$':
3068                 case '.':
3069                 case '[':
3070                 case '(':
3071                 case ')':
3072                 case '|':
3073                     goto loopdone;
3074                 case '\\':
3075                     switch (*++p) {
3076                     case 'A':
3077                     case 'C':
3078                     case 'X':
3079                     case 'G':
3080                     case 'Z':
3081                     case 'z':
3082                     case 'w':
3083                     case 'W':
3084                     case 'b':
3085                     case 'B':
3086                     case 's':
3087                     case 'S':
3088                     case 'd':
3089                     case 'D':
3090                     case 'p':
3091                     case 'P':
3092                         --p;
3093                         goto loopdone;
3094                     case 'n':
3095                         ender = '\n';
3096                         p++;
3097                         break;
3098                     case 'r':
3099                         ender = '\r';
3100                         p++;
3101                         break;
3102                     case 't':
3103                         ender = '\t';
3104                         p++;
3105                         break;
3106                     case 'f':
3107                         ender = '\f';
3108                         p++;
3109                         break;
3110                     case 'e':
3111                           ender = ASCII_TO_NATIVE('\033');
3112                         p++;
3113                         break;
3114                     case 'a':
3115                           ender = ASCII_TO_NATIVE('\007');
3116                         p++;
3117                         break;
3118                     case 'x':
3119                         if (*++p == '{') {
3120                             char* e = strchr(p, '}');
3121         
3122                             if (!e) {
3123                                 RExC_parse = p + 1;
3124                                 vFAIL("Missing right brace on \\x{}");
3125                             }
3126                             else {
3127                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3128                                     | PERL_SCAN_DISALLOW_PREFIX;
3129                                 numlen = e - p - 1;
3130                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3131                                 if (ender > 0xff)
3132                                     RExC_utf8 = 1;
3133                                 /* numlen is generous */
3134                                 if (numlen + len >= 127) {
3135                                     p--;
3136                                     goto loopdone;
3137                                 }
3138                                 p = e + 1;
3139                             }
3140                         }
3141                         else {
3142                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3143                             numlen = 2;
3144                             ender = grok_hex(p, &numlen, &flags, NULL);
3145                             p += numlen;
3146                         }
3147                         break;
3148                     case 'c':
3149                         p++;
3150                         ender = UCHARAT(p++);
3151                         ender = toCTRL(ender);
3152                         break;
3153                     case '0': case '1': case '2': case '3':case '4':
3154                     case '5': case '6': case '7': case '8':case '9':
3155                         if (*p == '0' ||
3156                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3157                             I32 flags = 0;
3158                             numlen = 3;
3159                             ender = grok_oct(p, &numlen, &flags, NULL);
3160                             p += numlen;
3161                         }
3162                         else {
3163                             --p;
3164                             goto loopdone;
3165                         }
3166                         break;
3167                     case '\0':
3168                         if (p >= RExC_end)
3169                             FAIL("Trailing \\");
3170                         /* FALL THROUGH */
3171                     default:
3172                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3173                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3174                         goto normal_default;
3175                     }
3176                     break;
3177                 default:
3178                   normal_default:
3179                     if (UTF8_IS_START(*p) && UTF) {
3180                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3181                                                &numlen, 0);
3182                         p += numlen;
3183                     }
3184                     else
3185                         ender = *p++;
3186                     break;
3187                 }
3188                 if (RExC_flags & PMf_EXTENDED)
3189                     p = regwhite(p, RExC_end);
3190                 if (UTF && FOLD) {
3191                     /* Prime the casefolded buffer. */
3192                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3193                 }
3194                 if (ISMULT2(p)) { /* Back off on ?+*. */
3195                     if (len)
3196                         p = oldp;
3197                     else if (UTF) {
3198                          STRLEN unilen;
3199
3200                          if (FOLD) {
3201                               /* Emit all the Unicode characters. */
3202                               for (foldbuf = tmpbuf;
3203                                    foldlen;
3204                                    foldlen -= numlen) {
3205                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3206                                    if (numlen > 0) {
3207                                         reguni(pRExC_state, ender, s, &unilen);
3208                                         s       += unilen;
3209                                         len     += unilen;
3210                                         /* In EBCDIC the numlen
3211                                          * and unilen can differ. */
3212                                         foldbuf += numlen;
3213                                         if (numlen >= foldlen)
3214                                              break;
3215                                    }
3216                                    else
3217                                         break; /* "Can't happen." */
3218                               }
3219                          }
3220                          else {
3221                               reguni(pRExC_state, ender, s, &unilen);
3222                               if (unilen > 0) {
3223                                    s   += unilen;
3224                                    len += unilen;
3225                               }
3226                          }
3227                     }
3228                     else {
3229                         len++;
3230                         REGC((char)ender, s++);
3231                     }
3232                     break;
3233                 }
3234                 if (UTF) {
3235                      STRLEN unilen;
3236
3237                      if (FOLD) {
3238                           /* Emit all the Unicode characters. */
3239                           for (foldbuf = tmpbuf;
3240                                foldlen;
3241                                foldlen -= numlen) {
3242                                ender = utf8_to_uvchr(foldbuf, &numlen);
3243                                if (numlen > 0) {
3244                                     reguni(pRExC_state, ender, s, &unilen);
3245                                     len     += unilen;
3246                                     s       += unilen;
3247                                     /* In EBCDIC the numlen
3248                                      * and unilen can differ. */
3249                                     foldbuf += numlen;
3250                                     if (numlen >= foldlen)
3251                                          break;
3252                                }
3253                                else
3254                                     break;
3255                           }
3256                      }
3257                      else {
3258                           reguni(pRExC_state, ender, s, &unilen);
3259                           if (unilen > 0) {
3260                                s   += unilen;
3261                                len += unilen;
3262                           }
3263                      }
3264                      len--;
3265                 }
3266                 else
3267                     REGC((char)ender, s++);
3268             }
3269         loopdone:
3270             RExC_parse = p - 1;
3271             Set_Node_Cur_Length(ret); /* MJD */
3272             nextchar(pRExC_state);
3273             {
3274                 /* len is STRLEN which is unsigned, need to copy to signed */
3275                 IV iv = len;
3276                 if (iv < 0)
3277                     vFAIL("Internal disaster");
3278             }
3279             if (len > 0)
3280                 *flagp |= HASWIDTH;
3281             if (len == 1)
3282                 *flagp |= SIMPLE;
3283             if (!SIZE_ONLY)
3284                 STR_LEN(ret) = len;
3285             if (SIZE_ONLY)
3286                 RExC_size += STR_SZ(len);
3287             else
3288                 RExC_emit += STR_SZ(len);
3289         }
3290         break;
3291     }
3292
3293     /* If the encoding pragma is in effect recode the text of
3294      * any EXACT-kind nodes. */
3295     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3296         STRLEN oldlen = STR_LEN(ret);
3297         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3298
3299         if (RExC_utf8)
3300             SvUTF8_on(sv);
3301         if (sv_utf8_downgrade(sv, TRUE)) {
3302             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3303             STRLEN newlen = SvCUR(sv);
3304
3305             if (SvUTF8(sv))
3306                 RExC_utf8 = 1;
3307             if (!SIZE_ONLY) {
3308                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3309                                       (int)oldlen, STRING(ret),
3310                                       (int)newlen, s));
3311                 Copy(s, STRING(ret), newlen, char);
3312                 STR_LEN(ret) += newlen - oldlen;
3313                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3314             } else
3315                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3316         }
3317     }
3318
3319     return(ret);
3320 }
3321
3322 STATIC char *
3323 S_regwhite(pTHX_ char *p, char *e)
3324 {
3325     while (p < e) {
3326         if (isSPACE(*p))
3327             ++p;
3328         else if (*p == '#') {
3329             do {
3330                 p++;
3331             } while (p < e && *p != '\n');
3332         }
3333         else
3334             break;
3335     }
3336     return p;
3337 }
3338
3339 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3340    Character classes ([:foo:]) can also be negated ([:^foo:]).
3341    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3342    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3343    but trigger failures because they are currently unimplemented. */
3344
3345 #define POSIXCC_DONE(c)   ((c) == ':')
3346 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3347 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3348
3349 STATIC I32
3350 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3351 {
3352     char *posixcc = 0;
3353     I32 namedclass = OOB_NAMEDCLASS;
3354
3355     if (value == '[' && RExC_parse + 1 < RExC_end &&
3356         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3357         POSIXCC(UCHARAT(RExC_parse))) {
3358         char  c = UCHARAT(RExC_parse);
3359         char* s = RExC_parse++;
3360         
3361         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3362             RExC_parse++;
3363         if (RExC_parse == RExC_end)
3364             /* Grandfather lone [:, [=, [. */
3365             RExC_parse = s;
3366         else {
3367             char* t = RExC_parse++; /* skip over the c */
3368
3369             if (UCHARAT(RExC_parse) == ']') {
3370                 RExC_parse++; /* skip over the ending ] */
3371                 posixcc = s + 1;
3372                 if (*s == ':') {
3373                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3374                     I32 skip = 5; /* the most common skip */
3375
3376                     switch (*posixcc) {
3377                     case 'a':
3378                         if (strnEQ(posixcc, "alnum", 5))
3379                             namedclass =
3380                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3381                         else if (strnEQ(posixcc, "alpha", 5))
3382                             namedclass =
3383                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3384                         else if (strnEQ(posixcc, "ascii", 5))
3385                             namedclass =
3386                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3387                         break;
3388                     case 'b':
3389                         if (strnEQ(posixcc, "blank", 5))
3390                             namedclass =
3391                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3392                         break;
3393                     case 'c':
3394                         if (strnEQ(posixcc, "cntrl", 5))
3395                             namedclass =
3396                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3397                         break;
3398                     case 'd':
3399                         if (strnEQ(posixcc, "digit", 5))
3400                             namedclass =
3401                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3402                         break;
3403                     case 'g':
3404                         if (strnEQ(posixcc, "graph", 5))
3405                             namedclass =
3406                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3407                         break;
3408                     case 'l':
3409                         if (strnEQ(posixcc, "lower", 5))
3410                             namedclass =
3411                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3412                         break;
3413                     case 'p':
3414                         if (strnEQ(posixcc, "print", 5))
3415                             namedclass =
3416                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3417                         else if (strnEQ(posixcc, "punct", 5))
3418                             namedclass =
3419                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3420                         break;
3421                     case 's':
3422                         if (strnEQ(posixcc, "space", 5))
3423                             namedclass =
3424                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3425                         break;
3426                     case 'u':
3427                         if (strnEQ(posixcc, "upper", 5))
3428                             namedclass =
3429                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3430                         break;
3431                     case 'w': /* this is not POSIX, this is the Perl \w */
3432                         if (strnEQ(posixcc, "word", 4)) {
3433                             namedclass =
3434                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3435                             skip = 4;
3436                         }
3437                         break;
3438                     case 'x':
3439                         if (strnEQ(posixcc, "xdigit", 6)) {
3440                             namedclass =
3441                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3442                             skip = 6;
3443                         }
3444                         break;
3445                     }
3446                     if (namedclass == OOB_NAMEDCLASS ||
3447                         posixcc[skip] != ':' ||
3448                         posixcc[skip+1] != ']')
3449                     {
3450                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3451                                       t - s - 1, s + 1);
3452                     }
3453                 } else if (!SIZE_ONLY) {
3454                     /* [[=foo=]] and [[.foo.]] are still future. */
3455
3456                     /* adjust RExC_parse so the warning shows after
3457                        the class closes */
3458                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3459                         RExC_parse++;
3460                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3461                 }
3462             } else {
3463                 /* Maternal grandfather:
3464                  * "[:" ending in ":" but not in ":]" */
3465                 RExC_parse = s;
3466             }
3467         }
3468     }
3469
3470     return namedclass;
3471 }
3472
3473 STATIC void
3474 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3475 {
3476     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3477         char *s = RExC_parse;
3478         char  c = *s++;
3479
3480         while(*s && isALNUM(*s))
3481             s++;
3482         if (*s && c == *s && s[1] == ']') {
3483             if (ckWARN(WARN_REGEXP))
3484                 vWARN3(s+2,
3485                         "POSIX syntax [%c %c] belongs inside character classes",
3486                         c, c);
3487
3488             /* [[=foo=]] and [[.foo.]] are still future. */
3489             if (POSIXCC_NOTYET(c)) {
3490                 /* adjust RExC_parse so the error shows after
3491                    the class closes */
3492                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3493                     ;
3494                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3495             }
3496         }
3497     }
3498 }
3499
3500 STATIC regnode *
3501 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3502 {
3503     register UV value;
3504     register UV nextvalue;
3505     register IV prevvalue = OOB_UNICODE;
3506     register IV range = 0;
3507     register regnode *ret;
3508     STRLEN numlen;
3509     IV namedclass;
3510     char *rangebegin = 0;
3511     bool need_class = 0;
3512     SV *listsv = Nullsv;
3513     register char *e;
3514     UV n;
3515     bool optimize_invert   = TRUE;
3516     AV* unicode_alternate  = 0;
3517 #ifdef EBCDIC
3518     UV literal_endpoint = 0;
3519 #endif
3520
3521     ret = reganode(pRExC_state, ANYOF, 0);
3522
3523     if (!SIZE_ONLY)
3524         ANYOF_FLAGS(ret) = 0;
3525
3526     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3527         RExC_naughty++;
3528         RExC_parse++;
3529         if (!SIZE_ONLY)
3530             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3531     }
3532
3533     if (SIZE_ONLY)
3534         RExC_size += ANYOF_SKIP;
3535     else {
3536         RExC_emit += ANYOF_SKIP;
3537         if (FOLD)
3538             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3539         if (LOC)
3540             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3541         ANYOF_BITMAP_ZERO(ret);
3542         listsv = newSVpvn("# comment\n", 10);
3543     }
3544
3545     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3546
3547     if (!SIZE_ONLY && POSIXCC(nextvalue))
3548         checkposixcc(pRExC_state);
3549
3550     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3551     if (UCHARAT(RExC_parse) == ']')
3552         goto charclassloop;
3553
3554     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3555
3556     charclassloop:
3557
3558         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3559
3560         if (!range)
3561             rangebegin = RExC_parse;
3562         if (UTF) {
3563             value = utf8n_to_uvchr((U8*)RExC_parse,
3564                                    RExC_end - RExC_parse,
3565                                    &numlen, 0);
3566             RExC_parse += numlen;
3567         }
3568         else
3569             value = UCHARAT(RExC_parse++);
3570         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3571         if (value == '[' && POSIXCC(nextvalue))
3572             namedclass = regpposixcc(pRExC_state, value);
3573         else if (value == '\\') {
3574             if (UTF) {
3575                 value = utf8n_to_uvchr((U8*)RExC_parse,
3576                                    RExC_end - RExC_parse,
3577                                    &numlen, 0);
3578                 RExC_parse += numlen;
3579             }
3580             else
3581                 value = UCHARAT(RExC_parse++);
3582             /* Some compilers cannot handle switching on 64-bit integer
3583              * values, therefore value cannot be an UV.  Yes, this will
3584              * be a problem later if we want switch on Unicode.
3585              * A similar issue a little bit later when switching on
3586              * namedclass. --jhi */
3587             switch ((I32)value) {
3588             case 'w':   namedclass = ANYOF_ALNUM;       break;
3589             case 'W':   namedclass = ANYOF_NALNUM;      break;
3590             case 's':   namedclass = ANYOF_SPACE;       break;
3591             case 'S':   namedclass = ANYOF_NSPACE;      break;
3592             case 'd':   namedclass = ANYOF_DIGIT;       break;
3593             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3594             case 'p':
3595             case 'P':
3596                 if (RExC_parse >= RExC_end)
3597                     vFAIL2("Empty \\%c{}", (U8)value);
3598                 if (*RExC_parse == '{') {
3599                     U8 c = (U8)value;
3600                     e = strchr(RExC_parse++, '}');
3601                     if (!e)
3602                         vFAIL2("Missing right brace on \\%c{}", c);
3603                     while (isSPACE(UCHARAT(RExC_parse)))
3604                         RExC_parse++;
3605                     if (e == RExC_parse)
3606                         vFAIL2("Empty \\%c{}", c);
3607                     n = e - RExC_parse;
3608                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3609                         n--;
3610                 }
3611                 else {
3612                     e = RExC_parse;
3613                     n = 1;
3614                 }
3615                 if (!SIZE_ONLY) {
3616                     if (UCHARAT(RExC_parse) == '^') {
3617                          RExC_parse++;
3618                          n--;
3619                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3620                          while (isSPACE(UCHARAT(RExC_parse))) {
3621                               RExC_parse++;
3622                               n--;
3623                          }
3624                     }
3625                     if (value == 'p')
3626                          Perl_sv_catpvf(aTHX_ listsv,
3627                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3628                     else
3629                          Perl_sv_catpvf(aTHX_ listsv,
3630                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3631                 }
3632                 RExC_parse = e + 1;
3633                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3634                 continue;
3635             case 'n':   value = '\n';                   break;
3636             case 'r':   value = '\r';                   break;
3637             case 't':   value = '\t';                   break;
3638             case 'f':   value = '\f';                   break;
3639             case 'b':   value = '\b';                   break;
3640             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3641             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3642             case 'x':
3643                 if (*RExC_parse == '{') {
3644                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3645                         | PERL_SCAN_DISALLOW_PREFIX;
3646                     e = strchr(RExC_parse++, '}');
3647                     if (!e)
3648                         vFAIL("Missing right brace on \\x{}");
3649
3650                     numlen = e - RExC_parse;
3651                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3652                     RExC_parse = e + 1;
3653                 }
3654                 else {
3655                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3656                     numlen = 2;
3657                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3658                     RExC_parse += numlen;
3659                 }
3660                 break;
3661             case 'c':
3662                 value = UCHARAT(RExC_parse++);
3663                 value = toCTRL(value);
3664                 break;
3665             case '0': case '1': case '2': case '3': case '4':
3666             case '5': case '6': case '7': case '8': case '9':
3667             {
3668                 I32 flags = 0;
3669                 numlen = 3;
3670                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3671                 RExC_parse += numlen;
3672                 break;
3673             }
3674             default:
3675                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3676                     vWARN2(RExC_parse,
3677                            "Unrecognized escape \\%c in character class passed through",
3678                            (int)value);
3679                 break;
3680             }
3681         } /* end of \blah */
3682 #ifdef EBCDIC
3683         else
3684             literal_endpoint++;
3685 #endif
3686
3687         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3688
3689             if (!SIZE_ONLY && !need_class)
3690                 ANYOF_CLASS_ZERO(ret);
3691
3692             need_class = 1;
3693
3694             /* a bad range like a-\d, a-[:digit:] ? */
3695             if (range) {
3696                 if (!SIZE_ONLY) {
3697                     if (ckWARN(WARN_REGEXP))
3698                         vWARN4(RExC_parse,
3699                                "False [] range \"%*.*s\"",
3700                                RExC_parse - rangebegin,
3701                                RExC_parse - rangebegin,
3702                                rangebegin);
3703                     if (prevvalue < 256) {
3704                         ANYOF_BITMAP_SET(ret, prevvalue);
3705                         ANYOF_BITMAP_SET(ret, '-');
3706                     }
3707                     else {
3708                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3709                         Perl_sv_catpvf(aTHX_ listsv,
3710                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3711                     }
3712                 }
3713
3714                 range = 0; /* this was not a true range */
3715             }
3716
3717             if (!SIZE_ONLY) {
3718                 if (namedclass > OOB_NAMEDCLASS)
3719                     optimize_invert = FALSE;
3720                 /* Possible truncation here but in some 64-bit environments
3721                  * the compiler gets heartburn about switch on 64-bit values.
3722                  * A similar issue a little earlier when switching on value.
3723                  * --jhi */
3724                 switch ((I32)namedclass) {
3725                 case ANYOF_ALNUM:
3726                     if (LOC)
3727                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3728                     else {
3729                         for (value = 0; value < 256; value++)
3730                             if (isALNUM(value))
3731                                 ANYOF_BITMAP_SET(ret, value);
3732                     }
3733                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3734                     break;
3735                 case ANYOF_NALNUM:
3736                     if (LOC)
3737                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3738                     else {
3739                         for (value = 0; value < 256; value++)
3740                             if (!isALNUM(value))
3741                                 ANYOF_BITMAP_SET(ret, value);
3742                     }
3743                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3744                     break;
3745                 case ANYOF_ALNUMC:
3746                     if (LOC)
3747                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3748                     else {
3749                         for (value = 0; value < 256; value++)
3750                             if (isALNUMC(value))
3751                                 ANYOF_BITMAP_SET(ret, value);
3752                     }
3753                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3754                     break;
3755                 case ANYOF_NALNUMC:
3756                     if (LOC)
3757                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3758                     else {
3759                         for (value = 0; value < 256; value++)
3760                             if (!isALNUMC(value))
3761                                 ANYOF_BITMAP_SET(ret, value);
3762                     }
3763                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3764                     break;
3765                 case ANYOF_ALPHA:
3766                     if (LOC)
3767                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3768                     else {
3769                         for (value = 0; value < 256; value++)
3770                             if (isALPHA(value))
3771                                 ANYOF_BITMAP_SET(ret, value);
3772                     }
3773                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3774                     break;
3775                 case ANYOF_NALPHA:
3776                     if (LOC)
3777                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3778                     else {
3779                         for (value = 0; value < 256; value++)
3780                             if (!isALPHA(value))
3781                                 ANYOF_BITMAP_SET(ret, value);
3782                     }
3783                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3784                     break;
3785                 case ANYOF_ASCII:
3786                     if (LOC)
3787                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3788                     else {
3789 #ifndef EBCDIC
3790                         for (value = 0; value < 128; value++)
3791                             ANYOF_BITMAP_SET(ret, value);
3792 #else  /* EBCDIC */
3793                         for (value = 0; value < 256; value++) {
3794                             if (isASCII(value))
3795                                 ANYOF_BITMAP_SET(ret, value);
3796                         }
3797 #endif /* EBCDIC */
3798                     }
3799                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3800                     break;
3801                 case ANYOF_NASCII:
3802                     if (LOC)
3803                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3804                     else {
3805 #ifndef EBCDIC
3806                         for (value = 128; value < 256; value++)
3807                             ANYOF_BITMAP_SET(ret, value);
3808 #else  /* EBCDIC */
3809                         for (value = 0; value < 256; value++) {
3810                             if (!isASCII(value))
3811                                 ANYOF_BITMAP_SET(ret, value);
3812                         }
3813 #endif /* EBCDIC */
3814                     }
3815                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3816                     break;
3817                 case ANYOF_BLANK:
3818                     if (LOC)
3819                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3820                     else {
3821                         for (value = 0; value < 256; value++)
3822                             if (isBLANK(value))
3823                                 ANYOF_BITMAP_SET(ret, value);
3824                     }
3825                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3826                     break;
3827                 case ANYOF_NBLANK:
3828                     if (LOC)
3829                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3830                     else {
3831                         for (value = 0; value < 256; value++)
3832                             if (!isBLANK(value))
3833                                 ANYOF_BITMAP_SET(ret, value);
3834                     }
3835                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3836                     break;
3837                 case ANYOF_CNTRL:
3838                     if (LOC)
3839                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3840                     else {
3841                         for (value = 0; value < 256; value++)
3842                             if (isCNTRL(value))
3843                                 ANYOF_BITMAP_SET(ret, value);
3844                     }
3845                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3846                     break;
3847                 case ANYOF_NCNTRL:
3848                     if (LOC)
3849                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3850                     else {
3851                         for (value = 0; value < 256; value++)
3852                             if (!isCNTRL(value))
3853                                 ANYOF_BITMAP_SET(ret, value);
3854                     }
3855                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3856                     break;
3857                 case ANYOF_DIGIT:
3858                     if (LOC)
3859                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3860                     else {
3861                         /* consecutive digits assumed */
3862                         for (value = '0'; value <= '9'; value++)
3863                             ANYOF_BITMAP_SET(ret, value);
3864                     }
3865                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3866                     break;
3867                 case ANYOF_NDIGIT:
3868                     if (LOC)
3869                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3870                     else {
3871                         /* consecutive digits assumed */
3872                         for (value = 0; value < '0'; value++)
3873                             ANYOF_BITMAP_SET(ret, value);
3874                         for (value = '9' + 1; value < 256; value++)
3875                             ANYOF_BITMAP_SET(ret, value);
3876                     }
3877                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3878                     break;
3879                 case ANYOF_GRAPH:
3880                     if (LOC)
3881                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3882                     else {
3883                         for (value = 0; value < 256; value++)
3884                             if (isGRAPH(value))
3885                                 ANYOF_BITMAP_SET(ret, value);
3886                     }
3887                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3888                     break;
3889                 case ANYOF_NGRAPH:
3890                     if (LOC)
3891                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3892                     else {
3893                         for (value = 0; value < 256; value++)
3894                             if (!isGRAPH(value))
3895                                 ANYOF_BITMAP_SET(ret, value);
3896                     }
3897                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3898                     break;
3899                 case ANYOF_LOWER:
3900                     if (LOC)
3901                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3902                     else {
3903                         for (value = 0; value < 256; value++)
3904                             if (isLOWER(value))
3905                                 ANYOF_BITMAP_SET(ret, value);
3906                     }
3907                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3908                     break;
3909                 case ANYOF_NLOWER:
3910                     if (LOC)
3911                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3912                     else {