This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
bf1b42f06e6bf7422d1637636b0c25639b848d13
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2001, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U16         flags16;                /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags16    (pRExC_state->flags16)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF RExC_utf8
231 #define LOC (RExC_flags16 & PMf_LOCALE)
232 #define FOLD (RExC_flags16 & PMf_FOLD)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED;  \
396         Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
398     } STMT_END                                                               \
399
400
401 #define vWARN2(loc, m, a1)                                                   \
402     STMT_START {                                                             \
403         IV offset = loc - RExC_precomp;          \
404         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405                  a1,                                                         \
406                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
407     } STMT_END
408
409 #define vWARN3(loc, m, a1, a2)                                               \
410     STMT_START {                                                             \
411       IV offset = loc - RExC_precomp;        \
412         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,                    \
413                  a1, a2,                                                     \
414                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
415     } STMT_END
416
417 #define vWARN4(loc, m, a1, a2, a3)                                           \
418     STMT_START {                                                             \
419       IV offset = loc - RExC_precomp;            \
420         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421                  a1, a2, a3,                                                 \
422                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
423     } STMT_END
424
425 /* used for the parse_flags section for (?c) -- japhy */
426 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
427   STMT_START {                                                   \
428       IV offset = loc - RExC_precomp;   \
429         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,      \
430                  a1, a2, a3, a4,                                 \
431                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
432     } STMT_END
433
434
435 /* Allow for side effects in s */
436 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
437
438 /* Macros for recording node offsets.   20001227 mjd@plover.com 
439  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
440  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
441  * Element 0 holds the number n.
442  */
443
444 #define MJD_OFFSET_DEBUG(x)
445 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448 #  define Set_Node_Offset_To_R(node,byte)                           \
449    STMT_START {                                        \
450      if (! SIZE_ONLY) {                                  \
451        if((node) < 0) {                   \
452          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453        } else {                                                        \
454          RExC_offsets[2*(node)-1] = (byte);                               \
455        }                                                               \
456      }                                                                 \
457    } STMT_END
458
459 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462 #  define Set_Node_Length_To_R(node,len)                            \
463    STMT_START {                                        \
464      if (! SIZE_ONLY) {                                  \
465        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466        if((node) < 0) {                   \
467          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468        } else {                                                        \
469          RExC_offsets[2*(node)] = (len);                               \
470        }                                                               \
471      }                                                                 \
472    } STMT_END
473
474 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
476 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
477
478 /* Get offsets and lengths */
479 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
482 static void clear_re(pTHX_ void *r);
483
484 /* Mark that we cannot extend a found fixed substring at this point.
485    Updata the longest found anchored substring and the longest found
486    floating substrings if needed. */
487
488 STATIC void
489 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
490 {
491     STRLEN l = CHR_SVLEN(data->last_found);
492     STRLEN old_l = CHR_SVLEN(*data->longest);
493
494     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495         sv_setsv(*data->longest, data->last_found);
496         if (*data->longest == data->longest_fixed) {
497             data->offset_fixed = l ? data->last_start_min : data->pos_min;
498             if (data->flags & SF_BEFORE_EOL)
499                 data->flags
500                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501             else
502                 data->flags &= ~SF_FIX_BEFORE_EOL;
503         }
504         else {
505             data->offset_float_min = l ? data->last_start_min : data->pos_min;
506             data->offset_float_max = (l
507                                       ? data->last_start_max
508                                       : data->pos_min + data->pos_delta);
509             if (data->flags & SF_BEFORE_EOL)
510                 data->flags
511                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512             else
513                 data->flags &= ~SF_FL_BEFORE_EOL;
514         }
515     }
516     SvCUR_set(data->last_found, 0);
517     data->last_end = -1;
518     data->flags &= ~SF_BEFORE_EOL;
519 }
520
521 /* Can match anything (initialization) */
522 STATIC void
523 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 {
525     ANYOF_CLASS_ZERO(cl);
526     ANYOF_BITMAP_SETALL(cl);
527     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528     if (LOC)
529         cl->flags |= ANYOF_LOCALE;
530 }
531
532 /* Can match anything (initialization) */
533 STATIC int
534 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535 {
536     int value;
537
538     for (value = 0; value <= ANYOF_MAX; value += 2)
539         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540             return 1;
541     if (!(cl->flags & ANYOF_UNICODE_ALL))
542         return 0;
543     if (!ANYOF_BITMAP_TESTALLSET(cl))
544         return 0;
545     return 1;
546 }
547
548 /* Can match anything (initialization) */
549 STATIC void
550 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 {
552     Zero(cl, 1, struct regnode_charclass_class);
553     cl->type = ANYOF;
554     cl_anything(pRExC_state, cl);
555 }
556
557 STATIC void
558 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 {
560     Zero(cl, 1, struct regnode_charclass_class);
561     cl->type = ANYOF;
562     cl_anything(pRExC_state, cl);
563     if (LOC)
564         cl->flags |= ANYOF_LOCALE;
565 }
566
567 /* 'And' a given class with another one.  Can create false positives */
568 /* We assume that cl is not inverted */
569 STATIC void
570 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571          struct regnode_charclass_class *and_with)
572 {
573     if (!(and_with->flags & ANYOF_CLASS)
574         && !(cl->flags & ANYOF_CLASS)
575         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576         && !(and_with->flags & ANYOF_FOLD)
577         && !(cl->flags & ANYOF_FOLD)) {
578         int i;
579
580         if (and_with->flags & ANYOF_INVERT)
581             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582                 cl->bitmap[i] &= ~and_with->bitmap[i];
583         else
584             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585                 cl->bitmap[i] &= and_with->bitmap[i];
586     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587     if (!(and_with->flags & ANYOF_EOS))
588         cl->flags &= ~ANYOF_EOS;
589
590     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591         cl->flags &= ~ANYOF_UNICODE_ALL;
592         cl->flags |= ANYOF_UNICODE;
593         ARG_SET(cl, ARG(and_with));
594     }
595     if (!(and_with->flags & ANYOF_UNICODE_ALL))
596         cl->flags &= ~ANYOF_UNICODE_ALL;
597     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598         cl->flags &= ~ANYOF_UNICODE;
599 }
600
601 /* 'OR' a given class with another one.  Can create false positives */
602 /* We assume that cl is not inverted */
603 STATIC void
604 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
605 {
606     if (or_with->flags & ANYOF_INVERT) {
607         /* We do not use
608          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609          *   <= (B1 | !B2) | (CL1 | !CL2)
610          * which is wasteful if CL2 is small, but we ignore CL2:
611          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612          * XXXX Can we handle case-fold?  Unclear:
613          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615          */
616         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617              && !(or_with->flags & ANYOF_FOLD)
618              && !(cl->flags & ANYOF_FOLD) ) {
619             int i;
620
621             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622                 cl->bitmap[i] |= ~or_with->bitmap[i];
623         } /* XXXX: logic is complicated otherwise */
624         else {
625             cl_anything(pRExC_state, cl);
626         }
627     } else {
628         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
630              && (!(or_with->flags & ANYOF_FOLD)
631                  || (cl->flags & ANYOF_FOLD)) ) {
632             int i;
633
634             /* OR char bitmap and class bitmap separately */
635             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636                 cl->bitmap[i] |= or_with->bitmap[i];
637             if (or_with->flags & ANYOF_CLASS) {
638                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639                     cl->classflags[i] |= or_with->classflags[i];
640                 cl->flags |= ANYOF_CLASS;
641             }
642         }
643         else { /* XXXX: logic is complicated, leave it along for a moment. */
644             cl_anything(pRExC_state, cl);
645         }
646     }
647     if (or_with->flags & ANYOF_EOS)
648         cl->flags |= ANYOF_EOS;
649
650     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651         ARG(cl) != ARG(or_with)) {
652         cl->flags |= ANYOF_UNICODE_ALL;
653         cl->flags &= ~ANYOF_UNICODE;
654     }
655     if (or_with->flags & ANYOF_UNICODE_ALL) {
656         cl->flags |= ANYOF_UNICODE_ALL;
657         cl->flags &= ~ANYOF_UNICODE;
658     }
659 }
660
661 /*
662  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663  * These need to be revisited when a newer toolchain becomes available.
664  */
665 #if defined(__sparc64__) && defined(__GNUC__)
666 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667 #       undef  SPARC64_GCC_WORKAROUND
668 #       define SPARC64_GCC_WORKAROUND 1
669 #   endif
670 #endif
671
672 /* REx optimizer.  Converts nodes into quickier variants "in place".
673    Finds fixed substrings.  */
674
675 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676    to the position after last scanned or to NULL. */
677
678 STATIC I32
679 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
680                         /* scanp: Start here (read-write). */
681                         /* deltap: Write maxlen-minlen here. */
682                         /* last: Stop before this one. */
683 {
684     I32 min = 0, pars = 0, code;
685     regnode *scan = *scanp, *next;
686     I32 delta = 0;
687     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
688     int is_inf_internal = 0;            /* The studied chunk is infinite */
689     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690     scan_data_t data_fake;
691     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
692
693     while (scan && OP(scan) != END && scan < last) {
694         /* Peephole optimizer: */
695
696         if (PL_regkind[(U8)OP(scan)] == EXACT) {
697             /* Merge several consecutive EXACTish nodes into one. */
698             regnode *n = regnext(scan);
699             U32 stringok = 1;
700 #ifdef DEBUGGING
701             regnode *stop = scan;
702 #endif
703
704             next = scan + NODE_SZ_STR(scan);
705             /* Skip NOTHING, merge EXACT*. */
706             while (n &&
707                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
708                      (stringok && (OP(n) == OP(scan))))
709                    && NEXT_OFF(n)
710                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711                 if (OP(n) == TAIL || n > next)
712                     stringok = 0;
713                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
714                     NEXT_OFF(scan) += NEXT_OFF(n);
715                     next = n + NODE_STEP_REGNODE;
716 #ifdef DEBUGGING
717                     if (stringok)
718                         stop = n;
719 #endif
720                     n = regnext(n);
721                 }
722                 else if (stringok) {
723                     int oldl = STR_LEN(scan);
724                     regnode *nnext = regnext(n);
725
726                     if (oldl + STR_LEN(n) > U8_MAX)
727                         break;
728                     NEXT_OFF(scan) += NEXT_OFF(n);
729                     STR_LEN(scan) += STR_LEN(n);
730                     next = n + NODE_SZ_STR(n);
731                     /* Now we can overwrite *n : */
732                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
733 #ifdef DEBUGGING
734                     stop = next - 1;
735 #endif
736                     n = nnext;
737                 }
738             }
739
740             if (UTF && OP(scan) == EXACTF) {
741 /*
742   Two problematic code points in Unicode casefolding of EXACT nodes:
743
744    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
746
747    which casefold to
748
749    Unicode                      UTF-8
750
751    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
752    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
753
754    This means that in case-insensitive matching (or "loose matching",
755    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756    length of the above casefolded versions) can match a target string
757    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758    This would rather mess up the minimum length computation.
759
760    What we'll do is to look for the tail four bytes, and then peek
761    at the preceding two bytes to see whether we need to decrease
762    the minimum length by four (six minus two).
763
764    Thanks to the design of UTF-8, there cannot be false matches:
765    A sequence of valid UTF-8 bytes cannot be a subsequence of
766    another valid sequence of UTF-8 bytes.
767
768 */
769                  char *s0 = STRING(scan), *s, *t;
770                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771                  char *t0 = "\xcc\x88\xcc\x81";
772                  char *t1 = t0 + 3;
773                  
774                  for (s = s0 + 2;
775                       s < s2 && (t = ninstr(s, s1, t0, t1));
776                       s = t + 4) {
777                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
779                            min -= 4;
780                  }
781             }
782
783 #ifdef DEBUGGING
784             /* Allow dumping */
785             n = scan + NODE_SZ_STR(scan);
786             while (n <= stop) {
787                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
788                     OP(n) = OPTIMIZED;
789                     NEXT_OFF(n) = 0;
790                 }
791                 n++;
792             }
793 #endif
794         }
795         /* Follow the next-chain of the current node and optimize
796            away all the NOTHINGs from it.  */
797         if (OP(scan) != CURLYX) {
798             int max = (reg_off_by_arg[OP(scan)]
799                        ? I32_MAX
800                        /* I32 may be smaller than U16 on CRAYs! */
801                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
802             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803             int noff;
804             regnode *n = scan;
805         
806             /* Skip NOTHING and LONGJMP. */
807             while ((n = regnext(n))
808                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
809                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
810                    && off + noff < max)
811                 off += noff;
812             if (reg_off_by_arg[OP(scan)])
813                 ARG(scan) = off;
814             else
815                 NEXT_OFF(scan) = off;
816         }
817         /* The principal pseudo-switch.  Cannot be a switch, since we
818            look into several different things.  */
819         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
820                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821             next = regnext(scan);
822             code = OP(scan);
823         
824             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
825                 I32 max1 = 0, min1 = I32_MAX, num = 0;
826                 struct regnode_charclass_class accum;
827                 
828                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
829                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
830                 if (flags & SCF_DO_STCLASS)
831                     cl_init_zero(pRExC_state, &accum);
832                 while (OP(scan) == code) {
833                     I32 deltanext, minnext, f = 0, fake;
834                     struct regnode_charclass_class this_class;
835
836                     num++;
837                     data_fake.flags = 0;
838                     if (data) {         
839                         data_fake.whilem_c = data->whilem_c;
840                         data_fake.last_closep = data->last_closep;
841                     }
842                     else
843                         data_fake.last_closep = &fake;
844                     next = regnext(scan);
845                     scan = NEXTOPER(scan);
846                     if (code != BRANCH)
847                         scan = NEXTOPER(scan);
848                     if (flags & SCF_DO_STCLASS) {
849                         cl_init(pRExC_state, &this_class);
850                         data_fake.start_class = &this_class;
851                         f = SCF_DO_STCLASS_AND;
852                     }           
853                     if (flags & SCF_WHILEM_VISITED_POS)
854                         f |= SCF_WHILEM_VISITED_POS;
855                     /* we suppose the run is continuous, last=next...*/
856                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
857                                           next, &data_fake, f);
858                     if (min1 > minnext)
859                         min1 = minnext;
860                     if (max1 < minnext + deltanext)
861                         max1 = minnext + deltanext;
862                     if (deltanext == I32_MAX)
863                         is_inf = is_inf_internal = 1;
864                     scan = next;
865                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866                         pars++;
867                     if (data && (data_fake.flags & SF_HAS_EVAL))
868                         data->flags |= SF_HAS_EVAL;
869                     if (data)
870                         data->whilem_c = data_fake.whilem_c;
871                     if (flags & SCF_DO_STCLASS)
872                         cl_or(pRExC_state, &accum, &this_class);
873                     if (code == SUSPEND)
874                         break;
875                 }
876                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877                     min1 = 0;
878                 if (flags & SCF_DO_SUBSTR) {
879                     data->pos_min += min1;
880                     data->pos_delta += max1 - min1;
881                     if (max1 != min1 || is_inf)
882                         data->longest = &(data->longest_float);
883                 }
884                 min += min1;
885                 delta += max1 - min1;
886                 if (flags & SCF_DO_STCLASS_OR) {
887                     cl_or(pRExC_state, data->start_class, &accum);
888                     if (min1) {
889                         cl_and(data->start_class, &and_with);
890                         flags &= ~SCF_DO_STCLASS;
891                     }
892                 }
893                 else if (flags & SCF_DO_STCLASS_AND) {
894                     if (min1) {
895                         cl_and(data->start_class, &accum);
896                         flags &= ~SCF_DO_STCLASS;
897                     }
898                     else {
899                         /* Switch to OR mode: cache the old value of
900                          * data->start_class */
901                         StructCopy(data->start_class, &and_with,
902                                    struct regnode_charclass_class);
903                         flags &= ~SCF_DO_STCLASS_AND;
904                         StructCopy(&accum, data->start_class,
905                                    struct regnode_charclass_class);
906                         flags |= SCF_DO_STCLASS_OR;
907                         data->start_class->flags |= ANYOF_EOS;
908                     }
909                 }
910             }
911             else if (code == BRANCHJ)   /* single branch is optimized. */
912                 scan = NEXTOPER(NEXTOPER(scan));
913             else                        /* single branch is optimized. */
914                 scan = NEXTOPER(scan);
915             continue;
916         }
917         else if (OP(scan) == EXACT) {
918             I32 l = STR_LEN(scan);
919             UV uc = *((U8*)STRING(scan));
920             if (UTF) {
921                 U8 *s = (U8*)STRING(scan);
922                 l = utf8_length(s, s + l);
923                 uc = utf8_to_uvchr(s, NULL);
924             }
925             min += l;
926             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
927                 /* The code below prefers earlier match for fixed
928                    offset, later match for variable offset.  */
929                 if (data->last_end == -1) { /* Update the start info. */
930                     data->last_start_min = data->pos_min;
931                     data->last_start_max = is_inf
932                         ? I32_MAX : data->pos_min + data->pos_delta;
933                 }
934                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
935                 data->last_end = data->pos_min + l;
936                 data->pos_min += l; /* As in the first entry. */
937                 data->flags &= ~SF_BEFORE_EOL;
938             }
939             if (flags & SCF_DO_STCLASS_AND) {
940                 /* Check whether it is compatible with what we know already! */
941                 int compat = 1;
942
943                 if (uc >= 0x100 ||
944                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
945                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
946                     && (!(data->start_class->flags & ANYOF_FOLD)
947                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
948                     )
949                     compat = 0;
950                 ANYOF_CLASS_ZERO(data->start_class);
951                 ANYOF_BITMAP_ZERO(data->start_class);
952                 if (compat)
953                     ANYOF_BITMAP_SET(data->start_class, uc);
954                 data->start_class->flags &= ~ANYOF_EOS;
955                 if (uc < 0x100)
956                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
957             }
958             else if (flags & SCF_DO_STCLASS_OR) {
959                 /* false positive possible if the class is case-folded */
960                 if (uc < 0x100)
961                     ANYOF_BITMAP_SET(data->start_class, uc);
962                 else
963                     data->start_class->flags |= ANYOF_UNICODE_ALL;
964                 data->start_class->flags &= ~ANYOF_EOS;
965                 cl_and(data->start_class, &and_with);
966             }
967             flags &= ~SCF_DO_STCLASS;
968         }
969         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
970             I32 l = STR_LEN(scan);
971             UV uc = *((U8*)STRING(scan));
972
973             /* Search for fixed substrings supports EXACT only. */
974             if (flags & SCF_DO_SUBSTR)
975                 scan_commit(pRExC_state, data);
976             if (UTF) {
977                 U8 *s = (U8 *)STRING(scan);
978                 l = utf8_length(s, s + l);
979                 uc = utf8_to_uvchr(s, NULL);
980             }
981             min += l;
982             if (data && (flags & SCF_DO_SUBSTR))
983                 data->pos_min += l;
984             if (flags & SCF_DO_STCLASS_AND) {
985                 /* Check whether it is compatible with what we know already! */
986                 int compat = 1;
987
988                 if (uc >= 0x100 ||
989                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
990                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
991                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
992                     compat = 0;
993                 ANYOF_CLASS_ZERO(data->start_class);
994                 ANYOF_BITMAP_ZERO(data->start_class);
995                 if (compat) {
996                     ANYOF_BITMAP_SET(data->start_class, uc);
997                     data->start_class->flags &= ~ANYOF_EOS;
998                     data->start_class->flags |= ANYOF_FOLD;
999                     if (OP(scan) == EXACTFL)
1000                         data->start_class->flags |= ANYOF_LOCALE;
1001                 }
1002             }
1003             else if (flags & SCF_DO_STCLASS_OR) {
1004                 if (data->start_class->flags & ANYOF_FOLD) {
1005                     /* false positive possible if the class is case-folded.
1006                        Assume that the locale settings are the same... */
1007                     if (uc < 0x100)
1008                         ANYOF_BITMAP_SET(data->start_class, uc);
1009                     data->start_class->flags &= ~ANYOF_EOS;
1010                 }
1011                 cl_and(data->start_class, &and_with);
1012             }
1013             flags &= ~SCF_DO_STCLASS;
1014         }
1015         else if (strchr((char*)PL_varies,OP(scan))) {
1016             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1017             I32 f = flags, pos_before = 0;
1018             regnode *oscan = scan;
1019             struct regnode_charclass_class this_class;
1020             struct regnode_charclass_class *oclass = NULL;
1021             I32 next_is_eval = 0;
1022
1023             switch (PL_regkind[(U8)OP(scan)]) {
1024             case WHILEM:                /* End of (?:...)* . */
1025                 scan = NEXTOPER(scan);
1026                 goto finish;
1027             case PLUS:
1028                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1029                     next = NEXTOPER(scan);
1030                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1031                         mincount = 1;
1032                         maxcount = REG_INFTY;
1033                         next = regnext(scan);
1034                         scan = NEXTOPER(scan);
1035                         goto do_curly;
1036                     }
1037                 }
1038                 if (flags & SCF_DO_SUBSTR)
1039                     data->pos_min++;
1040                 min++;
1041                 /* Fall through. */
1042             case STAR:
1043                 if (flags & SCF_DO_STCLASS) {
1044                     mincount = 0;
1045                     maxcount = REG_INFTY;
1046                     next = regnext(scan);
1047                     scan = NEXTOPER(scan);
1048                     goto do_curly;
1049                 }
1050                 is_inf = is_inf_internal = 1;
1051                 scan = regnext(scan);
1052                 if (flags & SCF_DO_SUBSTR) {
1053                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1054                     data->longest = &(data->longest_float);
1055                 }
1056                 goto optimize_curly_tail;
1057             case CURLY:
1058                 mincount = ARG1(scan);
1059                 maxcount = ARG2(scan);
1060                 next = regnext(scan);
1061                 if (OP(scan) == CURLYX) {
1062                     I32 lp = (data ? *(data->last_closep) : 0);
1063
1064                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065                 }
1066                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1067                 next_is_eval = (OP(scan) == EVAL);
1068               do_curly:
1069                 if (flags & SCF_DO_SUBSTR) {
1070                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1071                     pos_before = data->pos_min;
1072                 }
1073                 if (data) {
1074                     fl = data->flags;
1075                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076                     if (is_inf)
1077                         data->flags |= SF_IS_INF;
1078                 }
1079                 if (flags & SCF_DO_STCLASS) {
1080                     cl_init(pRExC_state, &this_class);
1081                     oclass = data->start_class;
1082                     data->start_class = &this_class;
1083                     f |= SCF_DO_STCLASS_AND;
1084                     f &= ~SCF_DO_STCLASS_OR;
1085                 }
1086                 /* These are the cases when once a subexpression
1087                    fails at a particular position, it cannot succeed
1088                    even after backtracking at the enclosing scope.
1089                 
1090                    XXXX what if minimal match and we are at the
1091                         initial run of {n,m}? */
1092                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093                     f &= ~SCF_WHILEM_VISITED_POS;
1094
1095                 /* This will finish on WHILEM, setting scan, or on NULL: */
1096                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097                                       mincount == 0
1098                                         ? (f & ~SCF_DO_SUBSTR) : f);
1099
1100                 if (flags & SCF_DO_STCLASS)
1101                     data->start_class = oclass;
1102                 if (mincount == 0 || minnext == 0) {
1103                     if (flags & SCF_DO_STCLASS_OR) {
1104                         cl_or(pRExC_state, data->start_class, &this_class);
1105                     }
1106                     else if (flags & SCF_DO_STCLASS_AND) {
1107                         /* Switch to OR mode: cache the old value of
1108                          * data->start_class */
1109                         StructCopy(data->start_class, &and_with,
1110                                    struct regnode_charclass_class);
1111                         flags &= ~SCF_DO_STCLASS_AND;
1112                         StructCopy(&this_class, data->start_class,
1113                                    struct regnode_charclass_class);
1114                         flags |= SCF_DO_STCLASS_OR;
1115                         data->start_class->flags |= ANYOF_EOS;
1116                     }
1117                 } else {                /* Non-zero len */
1118                     if (flags & SCF_DO_STCLASS_OR) {
1119                         cl_or(pRExC_state, data->start_class, &this_class);
1120                         cl_and(data->start_class, &and_with);
1121                     }
1122                     else if (flags & SCF_DO_STCLASS_AND)
1123                         cl_and(data->start_class, &this_class);
1124                     flags &= ~SCF_DO_STCLASS;
1125                 }
1126                 if (!scan)              /* It was not CURLYX, but CURLY. */
1127                     scan = next;
1128                 if (ckWARN(WARN_REGEXP)
1129                        /* ? quantifier ok, except for (?{ ... }) */
1130                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1131                     && (minnext == 0) && (deltanext == 0)
1132                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1133                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1134                 {
1135                     vWARN(RExC_parse,
1136                           "Quantifier unexpected on zero-length expression");
1137                 }
1138
1139                 min += minnext * mincount;
1140                 is_inf_internal |= ((maxcount == REG_INFTY
1141                                      && (minnext + deltanext) > 0)
1142                                     || deltanext == I32_MAX);
1143                 is_inf |= is_inf_internal;
1144                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145
1146                 /* Try powerful optimization CURLYX => CURLYN. */
1147                 if (  OP(oscan) == CURLYX && data
1148                       && data->flags & SF_IN_PAR
1149                       && !(data->flags & SF_HAS_EVAL)
1150                       && !deltanext && minnext == 1 ) {
1151                     /* Try to optimize to CURLYN.  */
1152                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1153                     regnode *nxt1 = nxt;
1154 #ifdef DEBUGGING
1155                     regnode *nxt2;
1156 #endif
1157
1158                     /* Skip open. */
1159                     nxt = regnext(nxt);
1160                     if (!strchr((char*)PL_simple,OP(nxt))
1161                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1162                              && STR_LEN(nxt) == 1))
1163                         goto nogo;
1164 #ifdef DEBUGGING
1165                     nxt2 = nxt;
1166 #endif
1167                     nxt = regnext(nxt);
1168                     if (OP(nxt) != CLOSE)
1169                         goto nogo;
1170                     /* Now we know that nxt2 is the only contents: */
1171                     oscan->flags = ARG(nxt);
1172                     OP(oscan) = CURLYN;
1173                     OP(nxt1) = NOTHING; /* was OPEN. */
1174 #ifdef DEBUGGING
1175                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1179                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1180                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1181 #endif
1182                 }
1183               nogo:
1184
1185                 /* Try optimization CURLYX => CURLYM. */
1186                 if (  OP(oscan) == CURLYX && data
1187                       && !(data->flags & SF_HAS_PAR)
1188                       && !(data->flags & SF_HAS_EVAL)
1189                       && !deltanext  ) {
1190                     /* XXXX How to optimize if data == 0? */
1191                     /* Optimize to a simpler form.  */
1192                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1193                     regnode *nxt2;
1194
1195                     OP(oscan) = CURLYM;
1196                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1197                             && (OP(nxt2) != WHILEM))
1198                         nxt = nxt2;
1199                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1200                     /* Need to optimize away parenths. */
1201                     if (data->flags & SF_IN_PAR) {
1202                         /* Set the parenth number.  */
1203                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204
1205                         if (OP(nxt) != CLOSE)
1206                             FAIL("Panic opt close");
1207                         oscan->flags = ARG(nxt);
1208                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1209                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1210 #ifdef DEBUGGING
1211                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1213                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1215 #endif
1216 #if 0
1217                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218                             regnode *nnxt = regnext(nxt1);
1219                         
1220                             if (nnxt == nxt) {
1221                                 if (reg_off_by_arg[OP(nxt1)])
1222                                     ARG_SET(nxt1, nxt2 - nxt1);
1223                                 else if (nxt2 - nxt1 < U16_MAX)
1224                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1225                                 else
1226                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1227                             }
1228                             nxt1 = nnxt;
1229                         }
1230 #endif
1231                         /* Optimize again: */
1232                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1233                                     NULL, 0);
1234                     }
1235                     else
1236                         oscan->flags = 0;
1237                 }
1238                 else if ((OP(oscan) == CURLYX)
1239                          && (flags & SCF_WHILEM_VISITED_POS)
1240                          /* See the comment on a similar expression above.
1241                             However, this time it not a subexpression
1242                             we care about, but the expression itself. */
1243                          && (maxcount == REG_INFTY)
1244                          && data && ++data->whilem_c < 16) {
1245                     /* This stays as CURLYX, we can put the count/of pair. */
1246                     /* Find WHILEM (as in regexec.c) */
1247                     regnode *nxt = oscan + NEXT_OFF(oscan);
1248
1249                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250                         nxt += ARG(nxt);
1251                     PREVOPER(nxt)->flags = data->whilem_c
1252                         | (RExC_whilem_seen << 4); /* On WHILEM */
1253                 }
1254                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1255                     pars++;
1256                 if (flags & SCF_DO_SUBSTR) {
1257                     SV *last_str = Nullsv;
1258                     int counted = mincount != 0;
1259
1260                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1261 #if defined(SPARC64_GCC_WORKAROUND)
1262                         I32 b = 0;
1263                         STRLEN l = 0;
1264                         char *s = NULL;
1265                         I32 old = 0;
1266
1267                         if (pos_before >= data->last_start_min)
1268                             b = pos_before;
1269                         else
1270                             b = data->last_start_min;
1271
1272                         l = 0;
1273                         s = SvPV(data->last_found, l);
1274                         old = b - data->last_start_min;
1275
1276 #else
1277                         I32 b = pos_before >= data->last_start_min
1278                             ? pos_before : data->last_start_min;
1279                         STRLEN l;
1280                         char *s = SvPV(data->last_found, l);
1281                         I32 old = b - data->last_start_min;
1282 #endif
1283
1284                         if (UTF)
1285                             old = utf8_hop((U8*)s, old) - (U8*)s;
1286                         
1287                         l -= old;
1288                         /* Get the added string: */
1289                         last_str = newSVpvn(s  + old, l);
1290                         if (deltanext == 0 && pos_before == b) {
1291                             /* What was added is a constant string */
1292                             if (mincount > 1) {
1293                                 SvGROW(last_str, (mincount * l) + 1);
1294                                 repeatcpy(SvPVX(last_str) + l,
1295                                           SvPVX(last_str), l, mincount - 1);
1296                                 SvCUR(last_str) *= mincount;
1297                                 /* Add additional parts. */
1298                                 SvCUR_set(data->last_found,
1299                                           SvCUR(data->last_found) - l);
1300                                 sv_catsv(data->last_found, last_str);
1301                                 data->last_end += l * (mincount - 1);
1302                             }
1303                         } else {
1304                             /* start offset must point into the last copy */
1305                             data->last_start_min += minnext * (mincount - 1);
1306                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307                                 * (minnext + data->pos_delta);
1308                         }
1309                     }
1310                     /* It is counted once already... */
1311                     data->pos_min += minnext * (mincount - counted);
1312                     data->pos_delta += - counted * deltanext +
1313                         (minnext + deltanext) * maxcount - minnext * mincount;
1314                     if (mincount != maxcount) {
1315                          /* Cannot extend fixed substrings found inside
1316                             the group.  */
1317                         scan_commit(pRExC_state,data);
1318                         if (mincount && last_str) {
1319                             sv_setsv(data->last_found, last_str);
1320                             data->last_end = data->pos_min;
1321                             data->last_start_min =
1322                                 data->pos_min - CHR_SVLEN(last_str);
1323                             data->last_start_max = is_inf
1324                                 ? I32_MAX
1325                                 : data->pos_min + data->pos_delta
1326                                 - CHR_SVLEN(last_str);
1327                         }
1328                         data->longest = &(data->longest_float);
1329                     }
1330                     SvREFCNT_dec(last_str);
1331                 }
1332                 if (data && (fl & SF_HAS_EVAL))
1333                     data->flags |= SF_HAS_EVAL;
1334               optimize_curly_tail:
1335                 if (OP(oscan) != CURLYX) {
1336                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1337                            && NEXT_OFF(next))
1338                         NEXT_OFF(oscan) += NEXT_OFF(next);
1339                 }
1340                 continue;
1341             default:                    /* REF and CLUMP only? */
1342                 if (flags & SCF_DO_SUBSTR) {
1343                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1344                     data->longest = &(data->longest_float);
1345                 }
1346                 is_inf = is_inf_internal = 1;
1347                 if (flags & SCF_DO_STCLASS_OR)
1348                     cl_anything(pRExC_state, data->start_class);
1349                 flags &= ~SCF_DO_STCLASS;
1350                 break;
1351             }
1352         }
1353         else if (strchr((char*)PL_simple,OP(scan))) {
1354             int value = 0;
1355
1356             if (flags & SCF_DO_SUBSTR) {
1357                 scan_commit(pRExC_state,data);
1358                 data->pos_min++;
1359             }
1360             min++;
1361             if (flags & SCF_DO_STCLASS) {
1362                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363
1364                 /* Some of the logic below assumes that switching
1365                    locale on will only add false positives. */
1366                 switch (PL_regkind[(U8)OP(scan)]) {
1367                 case SANY:
1368                 default:
1369                   do_default:
1370                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1372                         cl_anything(pRExC_state, data->start_class);
1373                     break;
1374                 case REG_ANY:
1375                     if (OP(scan) == SANY)
1376                         goto do_default;
1377                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379                                  || (data->start_class->flags & ANYOF_CLASS));
1380                         cl_anything(pRExC_state, data->start_class);
1381                     }
1382                     if (flags & SCF_DO_STCLASS_AND || !value)
1383                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1384                     break;
1385                 case ANYOF:
1386                     if (flags & SCF_DO_STCLASS_AND)
1387                         cl_and(data->start_class,
1388                                (struct regnode_charclass_class*)scan);
1389                     else
1390                         cl_or(pRExC_state, data->start_class,
1391                               (struct regnode_charclass_class*)scan);
1392                     break;
1393                 case ALNUM:
1394                     if (flags & SCF_DO_STCLASS_AND) {
1395                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397                             for (value = 0; value < 256; value++)
1398                                 if (!isALNUM(value))
1399                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1400                         }
1401                     }
1402                     else {
1403                         if (data->start_class->flags & ANYOF_LOCALE)
1404                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405                         else {
1406                             for (value = 0; value < 256; value++)
1407                                 if (isALNUM(value))
1408                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1409                         }
1410                     }
1411                     break;
1412                 case ALNUML:
1413                     if (flags & SCF_DO_STCLASS_AND) {
1414                         if (data->start_class->flags & ANYOF_LOCALE)
1415                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416                     }
1417                     else {
1418                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419                         data->start_class->flags |= ANYOF_LOCALE;
1420                     }
1421                     break;
1422                 case NALNUM:
1423                     if (flags & SCF_DO_STCLASS_AND) {
1424                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426                             for (value = 0; value < 256; value++)
1427                                 if (isALNUM(value))
1428                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1429                         }
1430                     }
1431                     else {
1432                         if (data->start_class->flags & ANYOF_LOCALE)
1433                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434                         else {
1435                             for (value = 0; value < 256; value++)
1436                                 if (!isALNUM(value))
1437                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1438                         }
1439                     }
1440                     break;
1441                 case NALNUML:
1442                     if (flags & SCF_DO_STCLASS_AND) {
1443                         if (data->start_class->flags & ANYOF_LOCALE)
1444                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445                     }
1446                     else {
1447                         data->start_class->flags |= ANYOF_LOCALE;
1448                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449                     }
1450                     break;
1451                 case SPACE:
1452                     if (flags & SCF_DO_STCLASS_AND) {
1453                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455                             for (value = 0; value < 256; value++)
1456                                 if (!isSPACE(value))
1457                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1458                         }
1459                     }
1460                     else {
1461                         if (data->start_class->flags & ANYOF_LOCALE)
1462                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463                         else {
1464                             for (value = 0; value < 256; value++)
1465                                 if (isSPACE(value))
1466                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1467                         }
1468                     }
1469                     break;
1470                 case SPACEL:
1471                     if (flags & SCF_DO_STCLASS_AND) {
1472                         if (data->start_class->flags & ANYOF_LOCALE)
1473                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474                     }
1475                     else {
1476                         data->start_class->flags |= ANYOF_LOCALE;
1477                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478                     }
1479                     break;
1480                 case NSPACE:
1481                     if (flags & SCF_DO_STCLASS_AND) {
1482                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484                             for (value = 0; value < 256; value++)
1485                                 if (isSPACE(value))
1486                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1487                         }
1488                     }
1489                     else {
1490                         if (data->start_class->flags & ANYOF_LOCALE)
1491                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492                         else {
1493                             for (value = 0; value < 256; value++)
1494                                 if (!isSPACE(value))
1495                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1496                         }
1497                     }
1498                     break;
1499                 case NSPACEL:
1500                     if (flags & SCF_DO_STCLASS_AND) {
1501                         if (data->start_class->flags & ANYOF_LOCALE) {
1502                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503                             for (value = 0; value < 256; value++)
1504                                 if (!isSPACE(value))
1505                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1506                         }
1507                     }
1508                     else {
1509                         data->start_class->flags |= ANYOF_LOCALE;
1510                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511                     }
1512                     break;
1513                 case DIGIT:
1514                     if (flags & SCF_DO_STCLASS_AND) {
1515                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516                         for (value = 0; value < 256; value++)
1517                             if (!isDIGIT(value))
1518                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519                     }
1520                     else {
1521                         if (data->start_class->flags & ANYOF_LOCALE)
1522                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523                         else {
1524                             for (value = 0; value < 256; value++)
1525                                 if (isDIGIT(value))
1526                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1527                         }
1528                     }
1529                     break;
1530                 case NDIGIT:
1531                     if (flags & SCF_DO_STCLASS_AND) {
1532                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533                         for (value = 0; value < 256; value++)
1534                             if (isDIGIT(value))
1535                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1536                     }
1537                     else {
1538                         if (data->start_class->flags & ANYOF_LOCALE)
1539                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540                         else {
1541                             for (value = 0; value < 256; value++)
1542                                 if (!isDIGIT(value))
1543                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1544                         }
1545                     }
1546                     break;
1547                 }
1548                 if (flags & SCF_DO_STCLASS_OR)
1549                     cl_and(data->start_class, &and_with);
1550                 flags &= ~SCF_DO_STCLASS;
1551             }
1552         }
1553         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1554             data->flags |= (OP(scan) == MEOL
1555                             ? SF_BEFORE_MEOL
1556                             : SF_BEFORE_SEOL);
1557         }
1558         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1559                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1560                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1561                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1562             /* Lookahead/lookbehind */
1563             I32 deltanext, minnext, fake = 0;
1564             regnode *nscan;
1565             struct regnode_charclass_class intrnl;
1566             int f = 0;
1567
1568             data_fake.flags = 0;
1569             if (data) {         
1570                 data_fake.whilem_c = data->whilem_c;
1571                 data_fake.last_closep = data->last_closep;
1572             }
1573             else
1574                 data_fake.last_closep = &fake;
1575             if ( flags & SCF_DO_STCLASS && !scan->flags
1576                  && OP(scan) == IFMATCH ) { /* Lookahead */
1577                 cl_init(pRExC_state, &intrnl);
1578                 data_fake.start_class = &intrnl;
1579                 f |= SCF_DO_STCLASS_AND;
1580             }
1581             if (flags & SCF_WHILEM_VISITED_POS)
1582                 f |= SCF_WHILEM_VISITED_POS;
1583             next = regnext(scan);
1584             nscan = NEXTOPER(NEXTOPER(scan));
1585             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1586             if (scan->flags) {
1587                 if (deltanext) {
1588                     vFAIL("Variable length lookbehind not implemented");
1589                 }
1590                 else if (minnext > U8_MAX) {
1591                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1592                 }
1593                 scan->flags = minnext;
1594             }
1595             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596                 pars++;
1597             if (data && (data_fake.flags & SF_HAS_EVAL))
1598                 data->flags |= SF_HAS_EVAL;
1599             if (data)
1600                 data->whilem_c = data_fake.whilem_c;
1601             if (f & SCF_DO_STCLASS_AND) {
1602                 int was = (data->start_class->flags & ANYOF_EOS);
1603
1604                 cl_and(data->start_class, &intrnl);
1605                 if (was)
1606                     data->start_class->flags |= ANYOF_EOS;
1607             }
1608         }
1609         else if (OP(scan) == OPEN) {
1610             pars++;
1611         }
1612         else if (OP(scan) == CLOSE) {
1613             if (ARG(scan) == is_par) {
1614                 next = regnext(scan);
1615
1616                 if ( next && (OP(next) != WHILEM) && next < last)
1617                     is_par = 0;         /* Disable optimization */
1618             }
1619             if (data)
1620                 *(data->last_closep) = ARG(scan);
1621         }
1622         else if (OP(scan) == EVAL) {
1623                 if (data)
1624                     data->flags |= SF_HAS_EVAL;
1625         }
1626         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1627                 if (flags & SCF_DO_SUBSTR) {
1628                     scan_commit(pRExC_state,data);
1629                     data->longest = &(data->longest_float);
1630                 }
1631                 is_inf = is_inf_internal = 1;
1632                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1633                     cl_anything(pRExC_state, data->start_class);
1634                 flags &= ~SCF_DO_STCLASS;
1635         }
1636         /* Else: zero-length, ignore. */
1637         scan = regnext(scan);
1638     }
1639
1640   finish:
1641     *scanp = scan;
1642     *deltap = is_inf_internal ? I32_MAX : delta;
1643     if (flags & SCF_DO_SUBSTR && is_inf)
1644         data->pos_delta = I32_MAX - data->pos_min;
1645     if (is_par > U8_MAX)
1646         is_par = 0;
1647     if (is_par && pars==1 && data) {
1648         data->flags |= SF_IN_PAR;
1649         data->flags &= ~SF_HAS_PAR;
1650     }
1651     else if (pars && data) {
1652         data->flags |= SF_HAS_PAR;
1653         data->flags &= ~SF_IN_PAR;
1654     }
1655     if (flags & SCF_DO_STCLASS_OR)
1656         cl_and(data->start_class, &and_with);
1657     return min;
1658 }
1659
1660 STATIC I32
1661 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1662 {
1663     if (RExC_rx->data) {
1664         Renewc(RExC_rx->data,
1665                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1666                char, struct reg_data);
1667         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668         RExC_rx->data->count += n;
1669     }
1670     else {
1671         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1672              char, struct reg_data);
1673         New(1208, RExC_rx->data->what, n, U8);
1674         RExC_rx->data->count = n;
1675     }
1676     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677     return RExC_rx->data->count - n;
1678 }
1679
1680 void
1681 Perl_reginitcolors(pTHX)
1682 {
1683     int i = 0;
1684     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1685         
1686     if (s) {
1687         PL_colors[0] = s = savepv(s);
1688         while (++i < 6) {
1689             s = strchr(s, '\t');
1690             if (s) {
1691                 *s = '\0';
1692                 PL_colors[i] = ++s;
1693             }
1694             else
1695                 PL_colors[i] = s = "";
1696         }
1697     } else {
1698         while (i < 6)
1699             PL_colors[i++] = "";
1700     }
1701     PL_colorset = 1;
1702 }
1703
1704
1705 /*
1706  - pregcomp - compile a regular expression into internal code
1707  *
1708  * We can't allocate space until we know how big the compiled form will be,
1709  * but we can't compile it (and thus know how big it is) until we've got a
1710  * place to put the code.  So we cheat:  we compile it twice, once with code
1711  * generation turned off and size counting turned on, and once "for real".
1712  * This also means that we don't allocate space until we are sure that the
1713  * thing really will compile successfully, and we never have to move the
1714  * code and thus invalidate pointers into it.  (Note that it has to be in
1715  * one piece because free() must be able to free it all.) [NB: not true in perl]
1716  *
1717  * Beware that the optimization-preparation code in here knows about some
1718  * of the structure of the compiled regexp.  [I'll say.]
1719  */
1720 regexp *
1721 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1722 {
1723     register regexp *r;
1724     regnode *scan;
1725     regnode *first;
1726     I32 flags;
1727     I32 minlen = 0;
1728     I32 sawplus = 0;
1729     I32 sawopen = 0;
1730     scan_data_t data;
1731     RExC_state_t RExC_state;
1732     RExC_state_t *pRExC_state = &RExC_state;
1733
1734     if (exp == NULL)
1735         FAIL("NULL regexp argument");
1736
1737     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1738
1739     RExC_precomp = exp;
1740     DEBUG_r({
1741          if (!PL_colorset) reginitcolors();
1742          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743                        PL_colors[4],PL_colors[5],PL_colors[0],
1744                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745     });
1746     RExC_flags16 = pm->op_pmflags;
1747     RExC_sawback = 0;
1748
1749     RExC_seen = 0;
1750     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751     RExC_seen_evals = 0;
1752     RExC_extralen = 0;
1753
1754     /* First pass: determine size, legality. */
1755     RExC_parse = exp;
1756     RExC_start = exp;
1757     RExC_end = xend;
1758     RExC_naughty = 0;
1759     RExC_npar = 1;
1760     RExC_size = 0L;
1761     RExC_emit = &PL_regdummy;
1762     RExC_whilem_seen = 0;
1763 #if 0 /* REGC() is (currently) a NOP at the first pass.
1764        * Clever compilers notice this and complain. --jhi */
1765     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1766 #endif
1767     if (reg(pRExC_state, 0, &flags) == NULL) {
1768         RExC_precomp = Nullch;
1769         return(NULL);
1770     }
1771     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1772
1773     /* Small enough for pointer-storage convention?
1774        If extralen==0, this means that we will not need long jumps. */
1775     if (RExC_size >= 0x10000L && RExC_extralen)
1776         RExC_size += RExC_extralen;
1777     else
1778         RExC_extralen = 0;
1779     if (RExC_whilem_seen > 15)
1780         RExC_whilem_seen = 15;
1781
1782     /* Allocate space and initialize. */
1783     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1784          char, regexp);
1785     if (r == NULL)
1786         FAIL("Regexp out of space");
1787
1788 #ifdef DEBUGGING
1789     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1790     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1791 #endif
1792     r->refcnt = 1;
1793     r->prelen = xend - exp;
1794     r->precomp = savepvn(RExC_precomp, r->prelen);
1795     r->subbeg = NULL;
1796     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1797     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1798
1799     r->substrs = 0;                     /* Useful during FAIL. */
1800     r->startp = 0;                      /* Useful during FAIL. */
1801     r->endp = 0;                        /* Useful during FAIL. */
1802
1803     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804     if (r->offsets) {
1805       r->offsets[0] = RExC_size; 
1806     }
1807     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1808                           "%s %"UVuf" bytes for offset annotations.\n", 
1809                           r->offsets ? "Got" : "Couldn't get", 
1810                           (UV)((2*RExC_size+1) * sizeof(U32))));
1811
1812     RExC_rx = r;
1813
1814     /* Second pass: emit code. */
1815     RExC_flags16 = pm->op_pmflags;      /* don't let top level (?i) bleed */
1816     RExC_parse = exp;
1817     RExC_end = xend;
1818     RExC_naughty = 0;
1819     RExC_npar = 1;
1820     RExC_emit_start = r->program;
1821     RExC_emit = r->program;
1822     /* Store the count of eval-groups for security checks: */
1823     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1825     r->data = 0;
1826     if (reg(pRExC_state, 0, &flags) == NULL)
1827         return(NULL);
1828
1829     /* Dig out information for optimizations. */
1830     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1831     pm->op_pmflags = RExC_flags16;
1832     if (UTF)
1833         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1834     r->regstclass = NULL;
1835     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1836         r->reganch |= ROPT_NAUGHTY;
1837     scan = r->program + 1;              /* First BRANCH. */
1838
1839     /* XXXX To minimize changes to RE engine we always allocate
1840        3-units-long substrs field. */
1841     Newz(1004, r->substrs, 1, struct reg_substr_data);
1842
1843     StructCopy(&zero_scan_data, &data, scan_data_t);
1844     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1845     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1846         I32 fake;
1847         STRLEN longest_float_length, longest_fixed_length;
1848         struct regnode_charclass_class ch_class;
1849         int stclass_flag;
1850         I32 last_close = 0;
1851
1852         first = scan;
1853         /* Skip introductions and multiplicators >= 1. */
1854         while ((OP(first) == OPEN && (sawopen = 1)) ||
1855                /* An OR of *one* alternative - should not happen now. */
1856             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857             (OP(first) == PLUS) ||
1858             (OP(first) == MINMOD) ||
1859                /* An {n,m} with n>0 */
1860             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1861                 if (OP(first) == PLUS)
1862                     sawplus = 1;
1863                 else
1864                     first += regarglen[(U8)OP(first)];
1865                 first = NEXTOPER(first);
1866         }
1867
1868         /* Starting-point info. */
1869       again:
1870         if (PL_regkind[(U8)OP(first)] == EXACT) {
1871             if (OP(first) == EXACT)
1872                 ;       /* Empty, get anchored substr later. */
1873             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1874                 r->regstclass = first;
1875         }
1876         else if (strchr((char*)PL_simple,OP(first)))
1877             r->regstclass = first;
1878         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879                  PL_regkind[(U8)OP(first)] == NBOUND)
1880             r->regstclass = first;
1881         else if (PL_regkind[(U8)OP(first)] == BOL) {
1882             r->reganch |= (OP(first) == MBOL
1883                            ? ROPT_ANCH_MBOL
1884                            : (OP(first) == SBOL
1885                               ? ROPT_ANCH_SBOL
1886                               : ROPT_ANCH_BOL));
1887             first = NEXTOPER(first);
1888             goto again;
1889         }
1890         else if (OP(first) == GPOS) {
1891             r->reganch |= ROPT_ANCH_GPOS;
1892             first = NEXTOPER(first);
1893             goto again;
1894         }
1895         else if (!sawopen && (OP(first) == STAR &&
1896             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1897             !(r->reganch & ROPT_ANCH) )
1898         {
1899             /* turn .* into ^.* with an implied $*=1 */
1900             int type = OP(NEXTOPER(first));
1901
1902             if (type == REG_ANY)
1903                 type = ROPT_ANCH_MBOL;
1904             else
1905                 type = ROPT_ANCH_SBOL;
1906
1907             r->reganch |= type | ROPT_IMPLICIT;
1908             first = NEXTOPER(first);
1909             goto again;
1910         }
1911         if (sawplus && (!sawopen || !RExC_sawback)
1912             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1913             /* x+ must match at the 1st pos of run of x's */
1914             r->reganch |= ROPT_SKIP;
1915
1916         /* Scan is after the zeroth branch, first is atomic matcher. */
1917         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1918                               (IV)(first - scan + 1)));
1919         /*
1920         * If there's something expensive in the r.e., find the
1921         * longest literal string that must appear and make it the
1922         * regmust.  Resolve ties in favor of later strings, since
1923         * the regstart check works with the beginning of the r.e.
1924         * and avoiding duplication strengthens checking.  Not a
1925         * strong reason, but sufficient in the absence of others.
1926         * [Now we resolve ties in favor of the earlier string if
1927         * it happens that c_offset_min has been invalidated, since the
1928         * earlier string may buy us something the later one won't.]
1929         */
1930         minlen = 0;
1931
1932         data.longest_fixed = newSVpvn("",0);
1933         data.longest_float = newSVpvn("",0);
1934         data.last_found = newSVpvn("",0);
1935         data.longest = &(data.longest_fixed);
1936         first = scan;
1937         if (!r->regstclass) {
1938             cl_init(pRExC_state, &ch_class);
1939             data.start_class = &ch_class;
1940             stclass_flag = SCF_DO_STCLASS_AND;
1941         } else                          /* XXXX Check for BOUND? */
1942             stclass_flag = 0;
1943         data.last_closep = &last_close;
1944
1945         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1946                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1947         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1948              && data.last_start_min == 0 && data.last_end > 0
1949              && !RExC_seen_zerolen
1950              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1951             r->reganch |= ROPT_CHECK_ALL;
1952         scan_commit(pRExC_state, &data);
1953         SvREFCNT_dec(data.last_found);
1954
1955         longest_float_length = CHR_SVLEN(data.longest_float);
1956         if (longest_float_length
1957             || (data.flags & SF_FL_BEFORE_EOL
1958                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1959                     || (RExC_flags16 & PMf_MULTILINE)))) {
1960             int t;
1961
1962             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1963                 && data.offset_fixed == data.offset_float_min
1964                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965                     goto remove_float;          /* As in (a)+. */
1966
1967             r->float_substr = data.longest_float;
1968             r->float_min_offset = data.offset_float_min;
1969             r->float_max_offset = data.offset_float_max;
1970             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1972                            || (RExC_flags16 & PMf_MULTILINE)));
1973             fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1974         }
1975         else {
1976           remove_float:
1977             r->float_substr = Nullsv;
1978             SvREFCNT_dec(data.longest_float);
1979             longest_float_length = 0;
1980         }
1981
1982         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1983         if (longest_fixed_length
1984             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1986                     || (RExC_flags16 & PMf_MULTILINE)))) {
1987             int t;
1988
1989             r->anchored_substr = data.longest_fixed;
1990             r->anchored_offset = data.offset_fixed;
1991             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1993                      || (RExC_flags16 & PMf_MULTILINE)));
1994             fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1995         }
1996         else {
1997             r->anchored_substr = Nullsv;
1998             SvREFCNT_dec(data.longest_fixed);
1999             longest_fixed_length = 0;
2000         }
2001         if (r->regstclass
2002             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2003             r->regstclass = NULL;
2004         if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005             && !(data.start_class->flags & ANYOF_EOS)
2006             && !cl_is_anything(data.start_class)) {
2007             I32 n = add_data(pRExC_state, 1, "f");
2008
2009             New(1006, RExC_rx->data->data[n], 1,
2010                 struct regnode_charclass_class);
2011             StructCopy(data.start_class,
2012                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2013                        struct regnode_charclass_class);
2014             r->regstclass = (regnode*)RExC_rx->data->data[n];
2015             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2016             PL_regdata = r->data; /* for regprop() */
2017             DEBUG_r({ SV *sv = sv_newmortal();
2018                       regprop(sv, (regnode*)data.start_class);
2019                       PerlIO_printf(Perl_debug_log,
2020                                     "synthetic stclass `%s'.\n",
2021                                     SvPVX(sv));});
2022         }
2023
2024         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2025         if (longest_fixed_length > longest_float_length) {
2026             r->check_substr = r->anchored_substr;
2027             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028             if (r->reganch & ROPT_ANCH_SINGLE)
2029                 r->reganch |= ROPT_NOSCAN;
2030         }
2031         else {
2032             r->check_substr = r->float_substr;
2033             r->check_offset_min = data.offset_float_min;
2034             r->check_offset_max = data.offset_float_max;
2035         }
2036         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037            This should be changed ASAP!  */
2038         if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
2039             r->reganch |= RE_USE_INTUIT;
2040             if (SvTAIL(r->check_substr))
2041                 r->reganch |= RE_INTUIT_TAIL;
2042         }
2043     }
2044     else {
2045         /* Several toplevels. Best we can is to set minlen. */
2046         I32 fake;
2047         struct regnode_charclass_class ch_class;
2048         I32 last_close = 0;
2049         
2050         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051         scan = r->program + 1;
2052         cl_init(pRExC_state, &ch_class);
2053         data.start_class = &ch_class;
2054         data.last_closep = &last_close;
2055         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2056         r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
2057         if (!(data.start_class->flags & ANYOF_EOS)
2058             && !cl_is_anything(data.start_class)) {
2059             I32 n = add_data(pRExC_state, 1, "f");
2060
2061             New(1006, RExC_rx->data->data[n], 1,
2062                 struct regnode_charclass_class);
2063             StructCopy(data.start_class,
2064                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2065                        struct regnode_charclass_class);
2066             r->regstclass = (regnode*)RExC_rx->data->data[n];
2067             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2068             DEBUG_r({ SV* sv = sv_newmortal();
2069                       regprop(sv, (regnode*)data.start_class);
2070                       PerlIO_printf(Perl_debug_log,
2071                                     "synthetic stclass `%s'.\n",
2072                                     SvPVX(sv));});
2073         }
2074     }
2075
2076     r->minlen = minlen;
2077     if (RExC_seen & REG_SEEN_GPOS)
2078         r->reganch |= ROPT_GPOS_SEEN;
2079     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2080         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2081     if (RExC_seen & REG_SEEN_EVAL)
2082         r->reganch |= ROPT_EVAL_SEEN;
2083     if (RExC_seen & REG_SEEN_CANY)
2084         r->reganch |= ROPT_CANY_SEEN;
2085     Newz(1002, r->startp, RExC_npar, I32);
2086     Newz(1002, r->endp, RExC_npar, I32);
2087     PL_regdata = r->data; /* for regprop() */
2088     DEBUG_r(regdump(r));
2089     return(r);
2090 }
2091
2092 /*
2093  - reg - regular expression, i.e. main body or parenthesized thing
2094  *
2095  * Caller must absorb opening parenthesis.
2096  *
2097  * Combining parenthesis handling with the base level of regular expression
2098  * is a trifle forced, but the need to tie the tails of the branches to what
2099  * follows makes it hard to avoid.
2100  */
2101 STATIC regnode *
2102 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2103     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2104 {
2105     register regnode *ret;              /* Will be the head of the group. */
2106     register regnode *br;
2107     register regnode *lastbr;
2108     register regnode *ender = 0;
2109     register I32 parno = 0;
2110     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
2111
2112     /* for (?g), (?gc), and (?o) warnings; warning
2113        about (?c) will warn about (?g) -- japhy    */
2114
2115     I32 wastedflags = 0x00,
2116         wasted_o    = 0x01,
2117         wasted_g    = 0x02,
2118         wasted_gc   = 0x02 | 0x04,
2119         wasted_c    = 0x04;
2120
2121     char * parse_start = RExC_parse; /* MJD */
2122     char *oregcomp_parse = RExC_parse;
2123     char c;
2124
2125     *flagp = 0;                         /* Tentatively. */
2126
2127
2128     /* Make an OPEN node, if parenthesized. */
2129     if (paren) {
2130         if (*RExC_parse == '?') { /* (?...) */
2131             U16 posflags = 0, negflags = 0;
2132             U16 *flagsp = &posflags;
2133             int logical = 0;
2134             char *seqstart = RExC_parse;
2135
2136             RExC_parse++;
2137             paren = *RExC_parse++;
2138             ret = NULL;                 /* For look-ahead/behind. */
2139             switch (paren) {
2140             case '<':           /* (?<...) */
2141                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2142                 if (*RExC_parse == '!')
2143                     paren = ',';
2144                 if (*RExC_parse != '=' && *RExC_parse != '!')
2145                     goto unknown;
2146                 RExC_parse++;
2147             case '=':           /* (?=...) */
2148             case '!':           /* (?!...) */
2149                 RExC_seen_zerolen++;
2150             case ':':           /* (?:...) */
2151             case '>':           /* (?>...) */
2152                 break;
2153             case '$':           /* (?$...) */
2154             case '@':           /* (?@...) */
2155                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2156                 break;
2157             case '#':           /* (?#...) */
2158                 while (*RExC_parse && *RExC_parse != ')')
2159                     RExC_parse++;
2160                 if (*RExC_parse != ')')
2161                     FAIL("Sequence (?#... not terminated");
2162                 nextchar(pRExC_state);
2163                 *flagp = TRYAGAIN;
2164                 return NULL;
2165             case 'p':           /* (?p...) */
2166                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2168                 /* FALL THROUGH*/
2169             case '?':           /* (??...) */
2170                 logical = 1;
2171                 paren = *RExC_parse++;
2172                 /* FALL THROUGH */
2173             case '{':           /* (?{...}) */
2174             {
2175                 I32 count = 1, n = 0;
2176                 char c;
2177                 char *s = RExC_parse;
2178                 SV *sv;
2179                 OP_4tree *sop, *rop;
2180
2181                 RExC_seen_zerolen++;
2182                 RExC_seen |= REG_SEEN_EVAL;
2183                 while (count && (c = *RExC_parse)) {
2184                     if (c == '\\' && RExC_parse[1])
2185                         RExC_parse++;
2186                     else if (c == '{')
2187                         count++;
2188                     else if (c == '}')
2189                         count--;
2190                     RExC_parse++;
2191                 }
2192                 if (*RExC_parse != ')')
2193                 {
2194                     RExC_parse = s;             
2195                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2196                 }
2197                 if (!SIZE_ONLY) {
2198                     AV *av;
2199                 
2200                     if (RExC_parse - 1 - s)
2201                         sv = newSVpvn(s, RExC_parse - 1 - s);
2202                     else
2203                         sv = newSVpvn("", 0);
2204
2205                     ENTER;
2206                     Perl_save_re_context(aTHX);
2207                     rop = sv_compile_2op(sv, &sop, "re", &av);
2208                     sop->op_private |= OPpREFCOUNTED;
2209                     /* re_dup will OpREFCNT_inc */
2210                     OpREFCNT_set(sop, 1);
2211                     LEAVE;
2212
2213                     n = add_data(pRExC_state, 3, "nop");
2214                     RExC_rx->data->data[n] = (void*)rop;
2215                     RExC_rx->data->data[n+1] = (void*)sop;
2216                     RExC_rx->data->data[n+2] = (void*)av;
2217                     SvREFCNT_dec(sv);
2218                 }
2219                 else {                                          /* First pass */
2220                     if (PL_reginterp_cnt < ++RExC_seen_evals
2221                         && PL_curcop != &PL_compiling)
2222                         /* No compiled RE interpolated, has runtime
2223                            components ===> unsafe.  */
2224                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2225                     if (PL_tainting && PL_tainted)
2226                         FAIL("Eval-group in insecure regular expression");
2227                 }
2228                 
2229                 nextchar(pRExC_state);
2230                 if (logical) {
2231                     ret = reg_node(pRExC_state, LOGICAL);
2232                     if (!SIZE_ONLY)
2233                         ret->flags = 2;
2234                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2235                     /* deal with the length of this later - MJD */
2236                     return ret;
2237                 }
2238                 return reganode(pRExC_state, EVAL, n);
2239             }
2240             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2241             {
2242                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2243                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2244                         || RExC_parse[1] == '<'
2245                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2246                         I32 flag;
2247                         
2248                         ret = reg_node(pRExC_state, LOGICAL);
2249                         if (!SIZE_ONLY)
2250                             ret->flags = 1;
2251                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2252                         goto insert_if;
2253                     }
2254                 }
2255                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2256                     /* (?(1)...) */
2257                     parno = atoi(RExC_parse++);
2258
2259                     while (isDIGIT(*RExC_parse))
2260                         RExC_parse++;
2261                     ret = reganode(pRExC_state, GROUPP, parno);
2262                     
2263                     if ((c = *nextchar(pRExC_state)) != ')')
2264                         vFAIL("Switch condition not recognized");
2265                   insert_if:
2266                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2267                     br = regbranch(pRExC_state, &flags, 1);
2268                     if (br == NULL)
2269                         br = reganode(pRExC_state, LONGJMP, 0);
2270                     else
2271                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2272                     c = *nextchar(pRExC_state);
2273                     if (flags&HASWIDTH)
2274                         *flagp |= HASWIDTH;
2275                     if (c == '|') {
2276                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2277                         regbranch(pRExC_state, &flags, 1);
2278                         regtail(pRExC_state, ret, lastbr);
2279                         if (flags&HASWIDTH)
2280                             *flagp |= HASWIDTH;
2281                         c = *nextchar(pRExC_state);
2282                     }
2283                     else
2284                         lastbr = NULL;
2285                     if (c != ')')
2286                         vFAIL("Switch (?(condition)... contains too many branches");
2287                     ender = reg_node(pRExC_state, TAIL);
2288                     regtail(pRExC_state, br, ender);
2289                     if (lastbr) {
2290                         regtail(pRExC_state, lastbr, ender);
2291                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2292                     }
2293                     else
2294                         regtail(pRExC_state, ret, ender);
2295                     return ret;
2296                 }
2297                 else {
2298                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2299                 }
2300             }
2301             case 0:
2302                 RExC_parse--; /* for vFAIL to print correctly */
2303                 vFAIL("Sequence (? incomplete");
2304                 break;
2305             default:
2306                 --RExC_parse;
2307               parse_flags:      /* (?i) */
2308                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2309                     /* (?g), (?gc) and (?o) are useless here
2310                        and must be globally applied -- japhy */
2311
2312                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2313                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2314                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2315                             if (! (wastedflags & wflagbit) ) {
2316                                 wastedflags |= wflagbit;
2317                                 vWARN5(
2318                                     RExC_parse + 1,
2319                                     "Useless (%s%c) - %suse /%c modifier",
2320                                     flagsp == &negflags ? "?-" : "?",
2321                                     *RExC_parse,
2322                                     flagsp == &negflags ? "don't " : "",
2323                                     *RExC_parse
2324                                 );
2325                             }
2326                         }
2327                     }
2328                     else if (*RExC_parse == 'c') {
2329                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2330                             if (! (wastedflags & wasted_c) ) {
2331                                 wastedflags |= wasted_gc;
2332                                 vWARN3(
2333                                     RExC_parse + 1,
2334                                     "Useless (%sc) - %suse /gc modifier",
2335                                     flagsp == &negflags ? "?-" : "?",
2336                                     flagsp == &negflags ? "don't " : ""
2337                                 );
2338                             }
2339                         }
2340                     }
2341                     else { pmflag(flagsp, *RExC_parse); }
2342
2343                     ++RExC_parse;
2344                 }
2345                 if (*RExC_parse == '-') {
2346                     flagsp = &negflags;
2347                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2348                     ++RExC_parse;
2349                     goto parse_flags;
2350                 }
2351                 RExC_flags16 |= posflags;
2352                 RExC_flags16 &= ~negflags;
2353                 if (*RExC_parse == ':') {
2354                     RExC_parse++;
2355                     paren = ':';
2356                     break;
2357                 }               
2358               unknown:
2359                 if (*RExC_parse != ')') {
2360                     RExC_parse++;
2361                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2362                 }
2363                 nextchar(pRExC_state);
2364                 *flagp = TRYAGAIN;
2365                 return NULL;
2366             }
2367         }
2368         else {                  /* (...) */
2369             parno = RExC_npar;
2370             RExC_npar++;
2371             ret = reganode(pRExC_state, OPEN, parno);
2372             Set_Node_Length(ret, 1); /* MJD */
2373             Set_Node_Offset(ret, RExC_parse); /* MJD */
2374             open = 1;
2375         }
2376     }
2377     else                        /* ! paren */
2378         ret = NULL;
2379
2380     /* Pick up the branches, linking them together. */
2381     parse_start = RExC_parse;   /* MJD */
2382     br = regbranch(pRExC_state, &flags, 1);
2383     /*     branch_len = (paren != 0); */
2384     
2385     if (br == NULL)
2386         return(NULL);
2387     if (*RExC_parse == '|') {
2388         if (!SIZE_ONLY && RExC_extralen) {
2389             reginsert(pRExC_state, BRANCHJ, br);
2390         }
2391         else {                  /* MJD */
2392             reginsert(pRExC_state, BRANCH, br);
2393             Set_Node_Length(br, paren != 0);
2394             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2395         }
2396         have_branch = 1;
2397         if (SIZE_ONLY)
2398             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2399     }
2400     else if (paren == ':') {
2401         *flagp |= flags&SIMPLE;
2402     }
2403     if (open) {                         /* Starts with OPEN. */
2404         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2405     }
2406     else if (paren != '?')              /* Not Conditional */
2407         ret = br;
2408     *flagp |= flags & (SPSTART | HASWIDTH);
2409     lastbr = br;
2410     while (*RExC_parse == '|') {
2411         if (!SIZE_ONLY && RExC_extralen) {
2412             ender = reganode(pRExC_state, LONGJMP,0);
2413             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2414         }
2415         if (SIZE_ONLY)
2416             RExC_extralen += 2;         /* Account for LONGJMP. */
2417         nextchar(pRExC_state);
2418         br = regbranch(pRExC_state, &flags, 0);
2419         
2420         if (br == NULL)
2421             return(NULL);
2422         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2423         lastbr = br;
2424         if (flags&HASWIDTH)
2425             *flagp |= HASWIDTH;
2426         *flagp |= flags&SPSTART;
2427     }
2428
2429     if (have_branch || paren != ':') {
2430         /* Make a closing node, and hook it on the end. */
2431         switch (paren) {
2432         case ':':
2433             ender = reg_node(pRExC_state, TAIL);
2434             break;
2435         case 1:
2436             ender = reganode(pRExC_state, CLOSE, parno);
2437             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2438             Set_Node_Length(ender,1); /* MJD */
2439             break;
2440         case '<':
2441         case ',':
2442         case '=':
2443         case '!':
2444             *flagp &= ~HASWIDTH;
2445             /* FALL THROUGH */
2446         case '>':
2447             ender = reg_node(pRExC_state, SUCCEED);
2448             break;
2449         case 0:
2450             ender = reg_node(pRExC_state, END);
2451             break;
2452         }
2453         regtail(pRExC_state, lastbr, ender);
2454
2455         if (have_branch) {
2456             /* Hook the tails of the branches to the closing node. */
2457             for (br = ret; br != NULL; br = regnext(br)) {
2458                 regoptail(pRExC_state, br, ender);
2459             }
2460         }
2461     }
2462
2463     {
2464         char *p;
2465         static char parens[] = "=!<,>";
2466
2467         if (paren && (p = strchr(parens, paren))) {
2468             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2469             int flag = (p - parens) > 1;
2470
2471             if (paren == '>')
2472                 node = SUSPEND, flag = 0;
2473             reginsert(pRExC_state, node,ret);
2474             ret->flags = flag;
2475             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2476         }
2477     }
2478
2479     /* Check for proper termination. */
2480     if (paren) {
2481         RExC_flags16 = oregflags;
2482         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2483             RExC_parse = oregcomp_parse;
2484             vFAIL("Unmatched (");
2485         }
2486     }
2487     else if (!paren && RExC_parse < RExC_end) {
2488         if (*RExC_parse == ')') {
2489             RExC_parse++;
2490             vFAIL("Unmatched )");
2491         }
2492         else
2493             FAIL("Junk on end of regexp");      /* "Can't happen". */
2494         /* NOTREACHED */
2495     }
2496
2497     return(ret);
2498 }
2499
2500 /*
2501  - regbranch - one alternative of an | operator
2502  *
2503  * Implements the concatenation operator.
2504  */
2505 STATIC regnode *
2506 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2507 {
2508     register regnode *ret;
2509     register regnode *chain = NULL;
2510     register regnode *latest;
2511     I32 flags = 0, c = 0;
2512
2513     if (first)
2514         ret = NULL;
2515     else {
2516         if (!SIZE_ONLY && RExC_extralen)
2517             ret = reganode(pRExC_state, BRANCHJ,0);
2518         else {
2519             ret = reg_node(pRExC_state, BRANCH);
2520             Set_Node_Length(ret, 1);
2521         }
2522     }
2523         
2524     if (!first && SIZE_ONLY)
2525         RExC_extralen += 1;                     /* BRANCHJ */
2526
2527     *flagp = WORST;                     /* Tentatively. */
2528
2529     RExC_parse--;
2530     nextchar(pRExC_state);
2531     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2532         flags &= ~TRYAGAIN;
2533         latest = regpiece(pRExC_state, &flags);
2534         if (latest == NULL) {
2535             if (flags & TRYAGAIN)
2536                 continue;
2537             return(NULL);
2538         }
2539         else if (ret == NULL)
2540             ret = latest;
2541         *flagp |= flags&HASWIDTH;
2542         if (chain == NULL)      /* First piece. */
2543             *flagp |= flags&SPSTART;
2544         else {
2545             RExC_naughty++;
2546             regtail(pRExC_state, chain, latest);
2547         }
2548         chain = latest;
2549         c++;
2550     }
2551     if (chain == NULL) {        /* Loop ran zero times. */
2552         chain = reg_node(pRExC_state, NOTHING);
2553         if (ret == NULL)
2554             ret = chain;
2555     }
2556     if (c == 1) {
2557         *flagp |= flags&SIMPLE;
2558     }
2559
2560     return(ret);
2561 }
2562
2563 /*
2564  - regpiece - something followed by possible [*+?]
2565  *
2566  * Note that the branching code sequences used for ? and the general cases
2567  * of * and + are somewhat optimized:  they use the same NOTHING node as
2568  * both the endmarker for their branch list and the body of the last branch.
2569  * It might seem that this node could be dispensed with entirely, but the
2570  * endmarker role is not redundant.
2571  */
2572 STATIC regnode *
2573 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2574 {
2575     register regnode *ret;
2576     register char op;
2577     register char *next;
2578     I32 flags;
2579     char *origparse = RExC_parse;
2580     char *maxpos;
2581     I32 min;
2582     I32 max = REG_INFTY;
2583     char *parse_start;
2584
2585     ret = regatom(pRExC_state, &flags);
2586     if (ret == NULL) {
2587         if (flags & TRYAGAIN)
2588             *flagp |= TRYAGAIN;
2589         return(NULL);
2590     }
2591
2592     op = *RExC_parse;
2593
2594     if (op == '{' && regcurly(RExC_parse)) {
2595         parse_start = RExC_parse; /* MJD */
2596         next = RExC_parse + 1;
2597         maxpos = Nullch;
2598         while (isDIGIT(*next) || *next == ',') {
2599             if (*next == ',') {
2600                 if (maxpos)
2601                     break;
2602                 else
2603                     maxpos = next;
2604             }
2605             next++;
2606         }
2607         if (*next == '}') {             /* got one */
2608             if (!maxpos)
2609                 maxpos = next;
2610             RExC_parse++;
2611             min = atoi(RExC_parse);
2612             if (*maxpos == ',')
2613                 maxpos++;
2614             else
2615                 maxpos = RExC_parse;
2616             max = atoi(maxpos);
2617             if (!max && *maxpos != '0')
2618                 max = REG_INFTY;                /* meaning "infinity" */
2619             else if (max >= REG_INFTY)
2620                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2621             RExC_parse = next;
2622             nextchar(pRExC_state);
2623
2624         do_curly:
2625             if ((flags&SIMPLE)) {
2626                 RExC_naughty += 2 + RExC_naughty / 2;
2627                 reginsert(pRExC_state, CURLY, ret);
2628                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2629                 Set_Node_Cur_Length(ret);
2630             }
2631             else {
2632                 regnode *w = reg_node(pRExC_state, WHILEM);
2633
2634                 w->flags = 0;
2635                 regtail(pRExC_state, ret, w);
2636                 if (!SIZE_ONLY && RExC_extralen) {
2637                     reginsert(pRExC_state, LONGJMP,ret);
2638                     reginsert(pRExC_state, NOTHING,ret);
2639                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2640                 }
2641                 reginsert(pRExC_state, CURLYX,ret);
2642                                 /* MJD hk */
2643                 Set_Node_Offset(ret, parse_start+1);
2644                 Set_Node_Length(ret, 
2645                                 op == '{' ? (RExC_parse - parse_start) : 1);
2646                 
2647                 if (!SIZE_ONLY && RExC_extralen)
2648                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2649                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2650                 if (SIZE_ONLY)
2651                     RExC_whilem_seen++, RExC_extralen += 3;
2652                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2653             }
2654             ret->flags = 0;
2655
2656             if (min > 0)
2657                 *flagp = WORST;
2658             if (max > 0)
2659                 *flagp |= HASWIDTH;
2660             if (max && max < min)
2661                 vFAIL("Can't do {n,m} with n > m");
2662             if (!SIZE_ONLY) {
2663                 ARG1_SET(ret, min);
2664                 ARG2_SET(ret, max);
2665             }
2666
2667             goto nest_check;
2668         }
2669     }
2670
2671     if (!ISMULT1(op)) {
2672         *flagp = flags;
2673         return(ret);
2674     }
2675
2676 #if 0                           /* Now runtime fix should be reliable. */
2677
2678     /* if this is reinstated, don't forget to put this back into perldiag:
2679
2680             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2681
2682            (F) The part of the regexp subject to either the * or + quantifier
2683            could match an empty string. The {#} shows in the regular
2684            expression about where the problem was discovered.
2685
2686     */
2687
2688     if (!(flags&HASWIDTH) && op != '?')
2689       vFAIL("Regexp *+ operand could be empty");
2690 #endif
2691
2692     parse_start = RExC_parse;
2693     nextchar(pRExC_state);
2694
2695     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2696
2697     if (op == '*' && (flags&SIMPLE)) {
2698         reginsert(pRExC_state, STAR, ret);
2699         ret->flags = 0;
2700         RExC_naughty += 4;
2701     }
2702     else if (op == '*') {
2703         min = 0;
2704         goto do_curly;
2705     }
2706     else if (op == '+' && (flags&SIMPLE)) {
2707         reginsert(pRExC_state, PLUS, ret);
2708         ret->flags = 0;
2709         RExC_naughty += 3;
2710     }
2711     else if (op == '+') {
2712         min = 1;
2713         goto do_curly;
2714     }
2715     else if (op == '?') {
2716         min = 0; max = 1;
2717         goto do_curly;
2718     }
2719   nest_check:
2720     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2721         vWARN3(RExC_parse,
2722                "%.*s matches null string many times",
2723                RExC_parse - origparse,
2724                origparse);
2725     }
2726
2727     if (*RExC_parse == '?') {
2728         nextchar(pRExC_state);
2729         reginsert(pRExC_state, MINMOD, ret);
2730         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2731     }
2732     if (ISMULT2(RExC_parse)) {
2733         RExC_parse++;
2734         vFAIL("Nested quantifiers");
2735     }
2736
2737     return(ret);
2738 }
2739
2740 /*
2741  - regatom - the lowest level
2742  *
2743  * Optimization:  gobbles an entire sequence of ordinary characters so that
2744  * it can turn them into a single node, which is smaller to store and
2745  * faster to run.  Backslashed characters are exceptions, each becoming a
2746  * separate node; the code is simpler that way and it's not worth fixing.
2747  *
2748  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2749 STATIC regnode *
2750 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2751 {
2752     register regnode *ret = 0;
2753     I32 flags;
2754     char *parse_start = 0;
2755
2756     *flagp = WORST;             /* Tentatively. */
2757
2758 tryagain:
2759     switch (*RExC_parse) {
2760     case '^':
2761         RExC_seen_zerolen++;
2762         nextchar(pRExC_state);
2763         if (RExC_flags16 & PMf_MULTILINE)
2764             ret = reg_node(pRExC_state, MBOL);
2765         else if (RExC_flags16 & PMf_SINGLELINE)
2766             ret = reg_node(pRExC_state, SBOL);
2767         else
2768             ret = reg_node(pRExC_state, BOL);
2769         Set_Node_Length(ret, 1); /* MJD */
2770         break;
2771     case '$':
2772         nextchar(pRExC_state);
2773         if (*RExC_parse)
2774             RExC_seen_zerolen++;
2775         if (RExC_flags16 & PMf_MULTILINE)
2776             ret = reg_node(pRExC_state, MEOL);
2777         else if (RExC_flags16 & PMf_SINGLELINE)
2778             ret = reg_node(pRExC_state, SEOL);
2779         else
2780             ret = reg_node(pRExC_state, EOL);
2781         Set_Node_Length(ret, 1); /* MJD */
2782         break;
2783     case '.':
2784         nextchar(pRExC_state);
2785         if (RExC_flags16 & PMf_SINGLELINE)
2786             ret = reg_node(pRExC_state, SANY);
2787         else
2788             ret = reg_node(pRExC_state, REG_ANY);
2789         *flagp |= HASWIDTH|SIMPLE;
2790         RExC_naughty++;
2791         Set_Node_Length(ret, 1); /* MJD */
2792         break;
2793     case '[':
2794     {
2795         char *oregcomp_parse = ++RExC_parse;
2796         ret = regclass(pRExC_state);
2797         if (*RExC_parse != ']') {
2798             RExC_parse = oregcomp_parse;
2799             vFAIL("Unmatched [");
2800         }
2801         nextchar(pRExC_state);
2802         *flagp |= HASWIDTH|SIMPLE;
2803         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2804         break;
2805     }
2806     case '(':
2807         nextchar(pRExC_state);
2808         ret = reg(pRExC_state, 1, &flags);
2809         if (ret == NULL) {
2810                 if (flags & TRYAGAIN) {
2811                     if (RExC_parse == RExC_end) {
2812                          /* Make parent create an empty node if needed. */
2813                         *flagp |= TRYAGAIN;
2814                         return(NULL);
2815                     }
2816                     goto tryagain;
2817                 }
2818                 return(NULL);
2819         }
2820         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2821         break;
2822     case '|':
2823     case ')':
2824         if (flags & TRYAGAIN) {
2825             *flagp |= TRYAGAIN;
2826             return NULL;
2827         }
2828         vFAIL("Internal urp");
2829                                 /* Supposed to be caught earlier. */
2830         break;
2831     case '{':
2832         if (!regcurly(RExC_parse)) {
2833             RExC_parse++;
2834             goto defchar;
2835         }
2836         /* FALL THROUGH */
2837     case '?':
2838     case '+':
2839     case '*':
2840         RExC_parse++;
2841         vFAIL("Quantifier follows nothing");
2842         break;
2843     case '\\':
2844         switch (*++RExC_parse) {
2845         case 'A':
2846             RExC_seen_zerolen++;
2847             ret = reg_node(pRExC_state, SBOL);
2848             *flagp |= SIMPLE;
2849             nextchar(pRExC_state);
2850             Set_Node_Length(ret, 2); /* MJD */
2851             break;
2852         case 'G':
2853             ret = reg_node(pRExC_state, GPOS);
2854             RExC_seen |= REG_SEEN_GPOS;
2855             *flagp |= SIMPLE;
2856             nextchar(pRExC_state);
2857             Set_Node_Length(ret, 2); /* MJD */
2858             break;
2859         case 'Z':
2860             ret = reg_node(pRExC_state, SEOL);
2861             *flagp |= SIMPLE;
2862             RExC_seen_zerolen++;                /* Do not optimize RE away */
2863             nextchar(pRExC_state);
2864             break;
2865         case 'z':
2866             ret = reg_node(pRExC_state, EOS);
2867             *flagp |= SIMPLE;
2868             RExC_seen_zerolen++;                /* Do not optimize RE away */
2869             nextchar(pRExC_state);
2870             Set_Node_Length(ret, 2); /* MJD */
2871             break;
2872         case 'C':
2873             ret = reg_node(pRExC_state, CANY);
2874             RExC_seen |= REG_SEEN_CANY;
2875             *flagp |= HASWIDTH|SIMPLE;
2876             nextchar(pRExC_state);
2877             Set_Node_Length(ret, 2); /* MJD */
2878             break;
2879         case 'X':
2880             ret = reg_node(pRExC_state, CLUMP);
2881             *flagp |= HASWIDTH;
2882             nextchar(pRExC_state);
2883             Set_Node_Length(ret, 2); /* MJD */
2884             break;
2885         case 'w':
2886             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2887             *flagp |= HASWIDTH|SIMPLE;
2888             nextchar(pRExC_state);
2889             Set_Node_Length(ret, 2); /* MJD */
2890             break;
2891         case 'W':
2892             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2893             *flagp |= HASWIDTH|SIMPLE;
2894             nextchar(pRExC_state);
2895             Set_Node_Length(ret, 2); /* MJD */
2896             break;
2897         case 'b':
2898             RExC_seen_zerolen++;
2899             RExC_seen |= REG_SEEN_LOOKBEHIND;
2900             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2901             *flagp |= SIMPLE;
2902             nextchar(pRExC_state);
2903             Set_Node_Length(ret, 2); /* MJD */
2904             break;
2905         case 'B':
2906             RExC_seen_zerolen++;
2907             RExC_seen |= REG_SEEN_LOOKBEHIND;
2908             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2909             *flagp |= SIMPLE;
2910             nextchar(pRExC_state);
2911             Set_Node_Length(ret, 2); /* MJD */
2912             break;
2913         case 's':
2914             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2915             *flagp |= HASWIDTH|SIMPLE;
2916             nextchar(pRExC_state);
2917             Set_Node_Length(ret, 2); /* MJD */
2918             break;
2919         case 'S':
2920             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2921             *flagp |= HASWIDTH|SIMPLE;
2922             nextchar(pRExC_state);
2923             Set_Node_Length(ret, 2); /* MJD */
2924             break;
2925         case 'd':
2926             ret = reg_node(pRExC_state, DIGIT);
2927             *flagp |= HASWIDTH|SIMPLE;
2928             nextchar(pRExC_state);
2929             Set_Node_Length(ret, 2); /* MJD */
2930             break;
2931         case 'D':
2932             ret = reg_node(pRExC_state, NDIGIT);
2933             *flagp |= HASWIDTH|SIMPLE;
2934             nextchar(pRExC_state);
2935             Set_Node_Length(ret, 2); /* MJD */
2936             break;
2937         case 'p':
2938         case 'P':
2939             {   
2940                 char* oldregxend = RExC_end;
2941                 char* parse_start = RExC_parse;
2942
2943                 if (RExC_parse[1] == '{') {
2944                   /* a lovely hack--pretend we saw [\pX] instead */
2945                     RExC_end = strchr(RExC_parse, '}');
2946                     if (!RExC_end) {
2947                         U8 c = (U8)*RExC_parse;
2948                         RExC_parse += 2;
2949                         RExC_end = oldregxend;
2950                         vFAIL2("Missing right brace on \\%c{}", c);
2951                     }
2952                     RExC_end++;
2953                 }
2954                 else
2955                     RExC_end = RExC_parse + 2;
2956                 RExC_parse--;
2957
2958                 ret = regclass(pRExC_state);
2959
2960                 RExC_end = oldregxend;
2961                 RExC_parse--;
2962                 Set_Node_Cur_Length(ret); /* MJD */
2963                 nextchar(pRExC_state);
2964                 *flagp |= HASWIDTH|SIMPLE;
2965             }
2966             break;
2967         case 'n':
2968         case 'r':
2969         case 't':
2970         case 'f':
2971         case 'e':
2972         case 'a':
2973         case 'x':
2974         case 'c':
2975         case '0':
2976             goto defchar;
2977         case '1': case '2': case '3': case '4':
2978         case '5': case '6': case '7': case '8': case '9':
2979             {
2980                 I32 num = atoi(RExC_parse);
2981
2982                 if (num > 9 && num >= RExC_npar)
2983                     goto defchar;
2984                 else {
2985                     char * parse_start = RExC_parse - 1; /* MJD */
2986                     while (isDIGIT(*RExC_parse))
2987                         RExC_parse++;
2988
2989                     if (!SIZE_ONLY && num > RExC_rx->nparens)
2990                         vFAIL("Reference to nonexistent group");
2991                     RExC_sawback = 1;
2992                     ret = reganode(pRExC_state, FOLD
2993                                    ? (LOC ? REFFL : REFF)
2994                                    : REF, num);
2995                     *flagp |= HASWIDTH;
2996                     
2997                     /* override incorrect value set in reganode MJD */
2998                     Set_Node_Offset(ret, parse_start+1); 
2999                     Set_Node_Cur_Length(ret); /* MJD */
3000                     RExC_parse--;
3001                     nextchar(pRExC_state);
3002                 }
3003             }
3004             break;
3005         case '\0':
3006             if (RExC_parse >= RExC_end)
3007                 FAIL("Trailing \\");
3008             /* FALL THROUGH */
3009         default:
3010             /* Do not generate `unrecognized' warnings here, we fall
3011                back into the quick-grab loop below */
3012             goto defchar;
3013         }
3014         break;
3015
3016     case '#':
3017         if (RExC_flags16 & PMf_EXTENDED) {
3018             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3019             if (RExC_parse < RExC_end)
3020                 goto tryagain;
3021         }
3022         /* FALL THROUGH */
3023
3024     default: {
3025             register STRLEN len;
3026             register UV ender;
3027             register char *p;
3028             char *oldp, *s;
3029             STRLEN numlen;
3030             STRLEN foldlen;
3031             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3032
3033             parse_start = RExC_parse - 1;
3034
3035             RExC_parse++;
3036
3037         defchar:
3038             ret = reg_node(pRExC_state, FOLD
3039                           ? (LOC ? EXACTFL : EXACTF)
3040                           : EXACT);
3041             s = STRING(ret);
3042             for (len = 0, p = RExC_parse - 1;
3043               len < 127 && p < RExC_end;
3044               len++)
3045             {
3046                 oldp = p;
3047
3048                 if (RExC_flags16 & PMf_EXTENDED)
3049                     p = regwhite(p, RExC_end);
3050                 switch (*p) {
3051                 case '^':
3052                 case '$':
3053                 case '.':
3054                 case '[':
3055                 case '(':
3056                 case ')':
3057                 case '|':
3058                     goto loopdone;
3059                 case '\\':
3060                     switch (*++p) {
3061                     case 'A':
3062                     case 'C':
3063                     case 'X':
3064                     case 'G':
3065                     case 'Z':
3066                     case 'z':
3067                     case 'w':
3068                     case 'W':
3069                     case 'b':
3070                     case 'B':
3071                     case 's':
3072                     case 'S':
3073                     case 'd':
3074                     case 'D':
3075                     case 'p':
3076                     case 'P':
3077                         --p;
3078                         goto loopdone;
3079                     case 'n':
3080                         ender = '\n';
3081                         p++;
3082                         break;
3083                     case 'r':
3084                         ender = '\r';
3085                         p++;
3086                         break;
3087                     case 't':
3088                         ender = '\t';
3089                         p++;
3090                         break;
3091                     case 'f':
3092                         ender = '\f';
3093                         p++;
3094                         break;
3095                     case 'e':
3096                           ender = ASCII_TO_NATIVE('\033');
3097                         p++;
3098                         break;
3099                     case 'a':
3100                           ender = ASCII_TO_NATIVE('\007');
3101                         p++;
3102                         break;
3103                     case 'x':
3104                         if (*++p == '{') {
3105                             char* e = strchr(p, '}');
3106         
3107                             if (!e) {
3108                                 RExC_parse = p + 1;
3109                                 vFAIL("Missing right brace on \\x{}");
3110                             }
3111                             else {
3112                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3113                                     | PERL_SCAN_DISALLOW_PREFIX;
3114                                 numlen = e - p - 1;
3115                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3116                                 if (ender > 0xff)
3117                                     RExC_utf8 = 1;
3118                                 /* numlen is generous */
3119                                 if (numlen + len >= 127) {
3120                                     p--;
3121                                     goto loopdone;
3122                                 }
3123                                 p = e + 1;
3124                             }
3125                         }
3126                         else {
3127                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3128                             numlen = 2;
3129                             ender = grok_hex(p, &numlen, &flags, NULL);
3130                             p += numlen;
3131                         }
3132                         break;
3133                     case 'c':
3134                         p++;
3135                         ender = UCHARAT(p++);
3136                         ender = toCTRL(ender);
3137                         break;
3138                     case '0': case '1': case '2': case '3':case '4':
3139                     case '5': case '6': case '7': case '8':case '9':
3140                         if (*p == '0' ||
3141                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3142                             I32 flags = 0;
3143                             numlen = 3;
3144                             ender = grok_oct(p, &numlen, &flags, NULL);
3145                             p += numlen;
3146                         }
3147                         else {
3148                             --p;
3149                             goto loopdone;
3150                         }
3151                         break;
3152                     case '\0':
3153                         if (p >= RExC_end)
3154                             FAIL("Trailing \\");
3155                         /* FALL THROUGH */
3156                     default:
3157                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3158                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3159                         goto normal_default;
3160                     }
3161                     break;
3162                 default:
3163                   normal_default:
3164                     if (UTF8_IS_START(*p) && UTF) {
3165                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3166                                                &numlen, 0);
3167                         p += numlen;
3168                     }
3169                     else
3170                         ender = *p++;
3171                     break;
3172                 }
3173                 if (RExC_flags16 & PMf_EXTENDED)
3174                     p = regwhite(p, RExC_end);
3175                 if (UTF && FOLD) {
3176                     /* Prime the casefolded buffer. */
3177                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3178                 }
3179                 if (ISMULT2(p)) { /* Back off on ?+*. */
3180                     if (len)
3181                         p = oldp;
3182                     else if (UTF) {
3183                          if (FOLD) {
3184                               /* Emit all the Unicode characters. */
3185                               for (foldbuf = tmpbuf;
3186                                    foldlen;
3187                                    foldlen -= numlen) {
3188                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3189                                    reguni(pRExC_state, ender, s, &numlen);
3190                                    s       += numlen;
3191                                    len     += numlen;
3192                                    foldbuf += numlen;
3193                               }
3194                          }
3195                          else {
3196                               reguni(pRExC_state, ender, s, &numlen);
3197                               s   += numlen;
3198                               len += numlen;
3199                          }
3200                     }
3201                     else {
3202                         len++;
3203                         REGC(ender, s++);
3204                     }
3205                     break;
3206                 }
3207                 if (UTF) {
3208                      if (FOLD) {
3209                           /* Emit all the Unicode characters. */
3210                           for (foldbuf = tmpbuf;
3211                                foldlen;
3212                                foldlen -= numlen) {
3213                                ender = utf8_to_uvchr(foldbuf, &numlen);
3214                                reguni(pRExC_state, ender, s, &numlen);
3215                                s       += numlen;
3216                                len     += numlen;
3217                                foldbuf += numlen;
3218                           }
3219                      }
3220                      else {
3221                           reguni(pRExC_state, ender, s, &numlen);
3222                           s   += numlen;
3223                           len += numlen;
3224                      }
3225                      len--;
3226                 }
3227                 else
3228                     REGC(ender, s++);
3229             }
3230         loopdone:
3231             RExC_parse = p - 1;
3232             Set_Node_Cur_Length(ret); /* MJD */
3233             nextchar(pRExC_state);
3234             {
3235                 /* len is STRLEN which is unsigned, need to copy to signed */
3236                 IV iv = len;
3237                 if (iv < 0)
3238                     vFAIL("Internal disaster");
3239             }
3240             if (len > 0)
3241                 *flagp |= HASWIDTH;
3242             if (len == 1)
3243                 *flagp |= SIMPLE;
3244             if (!SIZE_ONLY)
3245                 STR_LEN(ret) = len;
3246             if (SIZE_ONLY)
3247                 RExC_size += STR_SZ(len);
3248             else
3249                 RExC_emit += STR_SZ(len);
3250         }
3251         break;
3252     }
3253
3254     /* If the encoding pragma is in effect recode the text of
3255      * any EXACT-kind nodes. */
3256     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3257          STRLEN oldlen = STR_LEN(ret);
3258          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3259
3260          if (RExC_utf8)
3261               SvUTF8_on(sv);
3262          if (sv_utf8_downgrade(sv, TRUE)) {
3263               char *s       = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3264               STRLEN newlen = SvCUR(sv);
3265          
3266               if (!SIZE_ONLY) {
3267                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3268                                          (int)oldlen, STRING(ret),
3269                                          (int)newlen, s));
3270                    Copy(s, STRING(ret), newlen, char);
3271                    STR_LEN(ret) += newlen - oldlen;
3272                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3273               } else
3274                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3275          }
3276     }
3277
3278     return(ret);
3279 }
3280
3281 STATIC char *
3282 S_regwhite(pTHX_ char *p, char *e)
3283 {
3284     while (p < e) {
3285         if (isSPACE(*p))
3286             ++p;
3287         else if (*p == '#') {
3288             do {
3289                 p++;
3290             } while (p < e && *p != '\n');
3291         }
3292         else
3293             break;
3294     }
3295     return p;
3296 }
3297
3298 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3299    Character classes ([:foo:]) can also be negated ([:^foo:]).
3300    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3301    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3302    but trigger failures because they are currently unimplemented. */
3303
3304 #define POSIXCC_DONE(c)   ((c) == ':')
3305 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3306 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3307
3308 STATIC I32
3309 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3310 {
3311     char *posixcc = 0;
3312     I32 namedclass = OOB_NAMEDCLASS;
3313
3314     if (value == '[' && RExC_parse + 1 < RExC_end &&
3315         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3316         POSIXCC(UCHARAT(RExC_parse))) {
3317         char  c = UCHARAT(RExC_parse);
3318         char* s = RExC_parse++;
3319         
3320         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3321             RExC_parse++;
3322         if (RExC_parse == RExC_end)
3323             /* Grandfather lone [:, [=, [. */
3324             RExC_parse = s;
3325         else {
3326             char* t = RExC_parse++; /* skip over the c */
3327
3328             if (UCHARAT(RExC_parse) == ']') {
3329                 RExC_parse++; /* skip over the ending ] */
3330                 posixcc = s + 1;
3331                 if (*s == ':') {
3332                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3333                     I32 skip = 5; /* the most common skip */
3334
3335                     switch (*posixcc) {
3336                     case 'a':
3337                         if (strnEQ(posixcc, "alnum", 5))
3338                             namedclass =
3339                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3340                         else if (strnEQ(posixcc, "alpha", 5))
3341                             namedclass =
3342                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3343                         else if (strnEQ(posixcc, "ascii", 5))
3344                             namedclass =
3345                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3346                         break;
3347                     case 'b':
3348                         if (strnEQ(posixcc, "blank", 5))
3349                             namedclass =
3350                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3351                         break;
3352                     case 'c':
3353                         if (strnEQ(posixcc, "cntrl", 5))
3354                             namedclass =
3355                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3356                         break;
3357                     case 'd':
3358                         if (strnEQ(posixcc, "digit", 5))
3359                             namedclass =
3360                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3361                         break;
3362                     case 'g':
3363                         if (strnEQ(posixcc, "graph", 5))
3364                             namedclass =
3365                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3366                         break;
3367                     case 'l':
3368                         if (strnEQ(posixcc, "lower", 5))
3369                             namedclass =
3370                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3371                         break;
3372                     case 'p':
3373                         if (strnEQ(posixcc, "print", 5))
3374                             namedclass =
3375                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3376                         else if (strnEQ(posixcc, "punct", 5))
3377                             namedclass =
3378                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3379                         break;
3380                     case 's':
3381                         if (strnEQ(posixcc, "space", 5))
3382                             namedclass =
3383                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3384                         break;
3385                     case 'u':
3386                         if (strnEQ(posixcc, "upper", 5))
3387                             namedclass =
3388                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3389                         break;
3390                     case 'w': /* this is not POSIX, this is the Perl \w */
3391                         if (strnEQ(posixcc, "word", 4)) {
3392                             namedclass =
3393                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3394                             skip = 4;
3395                         }
3396                         break;
3397                     case 'x':
3398                         if (strnEQ(posixcc, "xdigit", 6)) {
3399                             namedclass =
3400                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3401                             skip = 6;
3402                         }
3403                         break;
3404                     }
3405                     if (namedclass == OOB_NAMEDCLASS ||
3406                         posixcc[skip] != ':' ||
3407                         posixcc[skip+1] != ']')
3408                     {
3409                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3410                                       t - s - 1, s + 1);
3411                     }
3412                 } else if (!SIZE_ONLY) {
3413                     /* [[=foo=]] and [[.foo.]] are still future. */
3414
3415                     /* adjust RExC_parse so the warning shows after
3416                        the class closes */
3417                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3418                         RExC_parse++;
3419                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3420                 }
3421             } else {
3422                 /* Maternal grandfather:
3423                  * "[:" ending in ":" but not in ":]" */
3424                 RExC_parse = s;
3425             }
3426         }
3427     }
3428
3429     return namedclass;
3430 }
3431
3432 STATIC void
3433 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3434 {
3435     if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
3436         POSIXCC(UCHARAT(RExC_parse))) {
3437         char *s = RExC_parse;
3438         char  c = *s++;
3439
3440         while(*s && isALNUM(*s))
3441             s++;
3442         if (*s && c == *s && s[1] == ']') {
3443             vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3444
3445             /* [[=foo=]] and [[.foo.]] are still future. */
3446             if (POSIXCC_NOTYET(c)) {
3447                 /* adjust RExC_parse so the error shows after
3448                    the class closes */
3449                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3450                     ;
3451                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3452             }
3453         }
3454     }
3455 }
3456
3457 STATIC regnode *
3458 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3459 {
3460     register UV value;
3461     register UV nextvalue;
3462     register IV prevvalue = OOB_UNICODE;
3463     register IV range = 0;
3464     register regnode *ret;
3465     STRLEN numlen;
3466     IV namedclass;
3467     char *rangebegin = 0;
3468     bool need_class = 0;
3469     SV *listsv = Nullsv;
3470     register char *e;
3471     UV n;
3472     bool optimize_invert   = TRUE;
3473     AV* unicode_alternate  = 0;
3474
3475     ret = reganode(pRExC_state, ANYOF, 0);
3476
3477     if (!SIZE_ONLY)
3478         ANYOF_FLAGS(ret) = 0;
3479
3480     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3481         RExC_naughty++;
3482         RExC_parse++;
3483         if (!SIZE_ONLY)
3484             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3485     }
3486
3487     if (SIZE_ONLY)
3488         RExC_size += ANYOF_SKIP;
3489     else {
3490         RExC_emit += ANYOF_SKIP;
3491         if (FOLD)
3492             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3493         if (LOC)
3494             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3495         ANYOF_BITMAP_ZERO(ret);
3496         listsv = newSVpvn("# comment\n", 10);
3497     }
3498
3499     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3500
3501     if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
3502         checkposixcc(pRExC_state);
3503
3504     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3505     if (UCHARAT(RExC_parse) == ']')
3506         goto charclassloop;
3507
3508     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3509
3510     charclassloop:
3511
3512         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3513
3514         if (!range)
3515             rangebegin = RExC_parse;
3516         if (UTF) {
3517             value = utf8n_to_uvchr((U8*)RExC_parse,
3518                                    RExC_end - RExC_parse,
3519                                    &numlen, 0);
3520             RExC_parse += numlen;
3521         }
3522         else
3523             value = UCHARAT(RExC_parse++);
3524         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3525         if (value == '[' && POSIXCC(nextvalue))
3526             namedclass = regpposixcc(pRExC_state, value);
3527         else if (value == '\\') {
3528             if (UTF) {
3529                 value = utf8n_to_uvchr((U8*)RExC_parse,
3530                                    RExC_end - RExC_parse,
3531                                    &numlen, 0);
3532                 RExC_parse += numlen;
3533             }
3534             else
3535                 value = UCHARAT(RExC_parse++);
3536             /* Some compilers cannot handle switching on 64-bit integer
3537              * values, therefore value cannot be an UV.  Yes, this will
3538              * be a problem later if we want switch on Unicode.
3539              * A similar issue a little bit later when switching on
3540              * namedclass. --jhi */
3541             switch ((I32)value) {
3542             case 'w':   namedclass = ANYOF_ALNUM;       break;
3543             case 'W':   namedclass = ANYOF_NALNUM;      break;
3544             case 's':   namedclass = ANYOF_SPACE;       break;
3545             case 'S':   namedclass = ANYOF_NSPACE;      break;
3546             case 'd':   namedclass = ANYOF_DIGIT;       break;
3547             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3548             case 'p':
3549             case 'P':
3550                 if (*RExC_parse == '{') {
3551                     U8 c = (U8)value;
3552                     e = strchr(RExC_parse++, '}');
3553                     if (!e)
3554                         vFAIL2("Missing right brace on \\%c{}", c);
3555                     while (isSPACE(UCHARAT(RExC_parse)))
3556                         RExC_parse++;
3557                     if (e == RExC_parse)
3558                         vFAIL2("Empty \\%c{}", c);
3559                     n = e - RExC_parse;
3560                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3561                         n--;
3562                 }
3563                 else {
3564                     e = RExC_parse;
3565                     n = 1;
3566                 }
3567                 if (!SIZE_ONLY) {
3568                     if (UCHARAT(RExC_parse) == '^') {
3569                          RExC_parse++;
3570                          n--;
3571                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3572                          while (isSPACE(UCHARAT(RExC_parse))) {
3573                               RExC_parse++;
3574                               n--;
3575                          }
3576                     }
3577                     if (value == 'p')
3578                          Perl_sv_catpvf(aTHX_ listsv,
3579                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3580                     else
3581                          Perl_sv_catpvf(aTHX_ listsv,
3582                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3583                 }
3584                 RExC_parse = e + 1;
3585                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3586                 continue;
3587             case 'n':   value = '\n';                   break;
3588             case 'r':   value = '\r';                   break;
3589             case 't':   value = '\t';                   break;
3590             case 'f':   value = '\f';                   break;
3591             case 'b':   value = '\b';                   break;
3592             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3593             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3594             case 'x':
3595                 if (*RExC_parse == '{') {
3596                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3597                         | PERL_SCAN_DISALLOW_PREFIX;
3598                     e = strchr(RExC_parse++, '}');
3599                     if (!e)
3600                         vFAIL("Missing right brace on \\x{}");
3601
3602                     numlen = e - RExC_parse;
3603                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3604                     RExC_parse = e + 1;
3605                 }
3606                 else {
3607                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3608                     numlen = 2;
3609                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3610                     RExC_parse += numlen;
3611                 }
3612                 break;
3613             case 'c':
3614                 value = UCHARAT(RExC_parse++);
3615                 value = toCTRL(value);
3616                 break;
3617             case '0': case '1': case '2': case '3': case '4':
3618             case '5': case '6': case '7': case '8': case '9':
3619             {
3620                 I32 flags = 0;
3621                 numlen = 3;
3622                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3623                 RExC_parse += numlen;
3624                 break;
3625             }
3626             default:
3627                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3628                     vWARN2(RExC_parse,
3629                            "Unrecognized escape \\%c in character class passed through",
3630                            (int)value);
3631                 break;
3632             }
3633         } /* end of \blah */
3634
3635         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3636
3637             if (!SIZE_ONLY && !need_class)
3638                 ANYOF_CLASS_ZERO(ret);
3639
3640             need_class = 1;
3641
3642             /* a bad range like a-\d, a-[:digit:] ? */
3643             if (range) {
3644                 if (!SIZE_ONLY) {
3645                     if (ckWARN(WARN_REGEXP))
3646                         vWARN4(RExC_parse,
3647                                "False [] range \"%*.*s\"",
3648                                RExC_parse - rangebegin,
3649                                RExC_parse - rangebegin,
3650                                rangebegin);
3651                     if (prevvalue < 256) {
3652                         ANYOF_BITMAP_SET(ret, prevvalue);
3653                         ANYOF_BITMAP_SET(ret, '-');
3654                     }
3655                     else {
3656                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3657                         Perl_sv_catpvf(aTHX_ listsv,
3658                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3659                     }
3660                 }
3661
3662                 range = 0; /* this was not a true range */
3663             }
3664
3665             if (!SIZE_ONLY) {
3666                 if (namedclass > OOB_NAMEDCLASS)
3667                     optimize_invert = FALSE;
3668                 /* Possible truncation here but in some 64-bit environments
3669                  * the compiler gets heartburn about switch on 64-bit values.
3670                  * A similar issue a little earlier when switching on value.
3671                  * --jhi */
3672                 switch ((I32)namedclass) {
3673                 case ANYOF_ALNUM:
3674                     if (LOC)
3675                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3676                     else {
3677                         for (value = 0; value < 256; value++)
3678                             if (isALNUM(value))
3679                                 ANYOF_BITMAP_SET(ret, value);
3680                     }
3681                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3682                     break;
3683                 case ANYOF_NALNUM:
3684                     if (LOC)
3685                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3686                     else {
3687                         for (value = 0; value < 256; value++)
3688                             if (!isALNUM(value))
3689                                 ANYOF_BITMAP_SET(ret, value);
3690                     }
3691                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3692                     break;
3693                 case ANYOF_ALNUMC:
3694                     if (LOC)
3695                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3696                     else {
3697                         for (value = 0; value < 256; value++)
3698                             if (isALNUMC(value))
3699                                 ANYOF_BITMAP_SET(ret, value);
3700                     }
3701                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3702                     break;
3703                 case ANYOF_NALNUMC:
3704                     if (LOC)
3705                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3706                     else {
3707                         for (value = 0; value < 256; value++)
3708                             if (!isALNUMC(value))
3709                                 ANYOF_BITMAP_SET(ret, value);
3710                     }
3711                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3712                     break;
3713                 case ANYOF_ALPHA:
3714                     if (LOC)
3715                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3716                     else {
3717                         for (value = 0; value < 256; value++)
3718                             if (isALPHA(value))
3719                                 ANYOF_BITMAP_SET(ret, value);
3720                     }
3721                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3722                     break;
3723                 case ANYOF_NALPHA:
3724                     if (LOC)
3725                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3726                     else {
3727                         for (value = 0; value < 256; value++)
3728                             if (!isALPHA(value))
3729                                 ANYOF_BITMAP_SET(ret, value);
3730                     }
3731                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3732                     break;
3733                 case ANYOF_ASCII:
3734                     if (LOC)
3735                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3736                     else {
3737 #ifndef EBCDIC
3738                         for (value = 0; value < 128; value++)
3739                             ANYOF_BITMAP_SET(ret, value);
3740 #else  /* EBCDIC */
3741                         for (value = 0; value < 256; value++) {
3742                             if (isASCII(value))
3743                                 ANYOF_BITMAP_SET(ret, value);
3744                         }
3745 #endif /* EBCDIC */
3746                     }
3747                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3748                     break;
3749                 case ANYOF_NASCII:
3750                     if (LOC)
3751                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3752                     else {
3753 #ifndef EBCDIC
3754                         for (value = 128; value < 256; value++)
3755                             ANYOF_BITMAP_SET(ret, value);
3756 #else  /* EBCDIC */
3757                         for (value = 0; value < 256; value++) {
3758                             if (!isASCII(value))
3759                                 ANYOF_BITMAP_SET(ret, value);
3760                         }
3761 #endif /* EBCDIC */
3762                     }
3763                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3764                     break;
3765                 case ANYOF_BLANK:
3766                     if (LOC)
3767                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3768                     else {
3769                         for (value = 0; value < 256; value++)
3770                             if (isBLANK(value))
3771                                 ANYOF_BITMAP_SET(ret, value);
3772                     }
3773                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3774                     break;
3775                 case ANYOF_NBLANK:
3776                     if (LOC)
3777                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3778                     else {
3779                         for (value = 0; value < 256; value++)
3780                             if (!isBLANK(value))
3781                                 ANYOF_BITMAP_SET(ret, value);
3782                     }
3783                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3784                     break;
3785                 case ANYOF_CNTRL:
3786                     if (LOC)
3787                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3788                     else {
3789                         for (value = 0; value < 256; value++)
3790                             if (isCNTRL(value))
3791                                 ANYOF_BITMAP_SET(ret, value);
3792                     }
3793                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3794                     break;
3795                 case ANYOF_NCNTRL:
3796                     if (LOC)
3797                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3798                     else {
3799                         for (value = 0; value < 256; value++)
3800                             if (!isCNTRL(value))
3801                                 ANYOF_BITMAP_SET(ret, value);
3802                     }
3803                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3804                     break;
3805                 case ANYOF_DIGIT:
3806                     if (LOC)
3807                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3808                     else {
3809                         /* consecutive digits assumed */
3810                         for (value = '0'; value <= '9'; value++)
3811                             ANYOF_BITMAP_SET(ret, value);
3812                     }
3813                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3814                     break;
3815                 case ANYOF_NDIGIT:
3816                     if (LOC)
3817                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3818                     else {
3819                         /* consecutive digits assumed */
3820                         for (value = 0; value < '0'; value++)
3821                             ANYOF_BITMAP_SET(ret, value);
3822                         for (value = '9' + 1; value < 256; value++)
3823                             ANYOF_BITMAP_SET(ret, value);
3824                     }
3825                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3826                     break;
3827                 case ANYOF_GRAPH:
3828                     if (LOC)
3829                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3830                     else {
3831                         for (value = 0; value < 256; value++)
3832                             if (isGRAPH(value))
3833                                 ANYOF_BITMAP_SET(ret, value);
3834                     }
3835                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3836                     break;
3837                 case ANYOF_NGRAPH:
3838                     if (LOC)
3839                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3840                     else {
3841                         for (value = 0; value < 256; value++)
3842                             if (!isGRAPH(value))
3843                                 ANYOF_BITMAP_SET(ret, value);
3844                     }
3845                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3846                     break;
3847                 case ANYOF_LOWER:
3848                     if (LOC)
3849                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3850                     else {
3851                         for (value = 0; value < 256; value++)
3852                             if (isLOWER(value))
3853                                 ANYOF_BITMAP_SET(ret, value);
3854                     }
3855                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3856                     break;
3857                 case ANYOF_NLOWER:
3858                     if (LOC)
3859                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3860                     else {
3861                         for (value = 0; value < 256; value++)
3862                             if (!isLOWER(value))
3863                                 ANYOF_BITMAP_SET(ret, value);
3864                     }
3865                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3866                     break;
3867                 case ANYOF_PRINT:
3868                     if (LOC)
3869                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3870                     else {
3871                         for (value = 0; value < 256; value++)
3872                             if (isPRINT(value))
3873                                 ANYOF_BITMAP_SET(ret, value);
3874                     }
3875                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3876                     break;
3877                 case ANYOF_NPRINT: