This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate mainline.
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
/* NOTE: this is derived from Henry Spencer's regexp code, and should not
 * be confused with the original package (see point 3 below).  Thanks, Henry!
 */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors 
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif 
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2000, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifdef PERL_IN_XSUB_RE
87 #  if defined(PERL_CAPI) || defined(PERL_OBJECT)
88 #    include "XSUB.h"
89 #  endif
90 #else
91 #  include "INTERN.h"
92 #endif
93
94 #define REG_COMP_C
95 #include "regcomp.h"
96
97 #ifdef op
98 #undef op
99 #endif /* op */
100
101 #ifdef MSDOS
102 # if defined(BUGGY_MSC6)
103  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104  # pragma optimize("a",off)
105  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106  # pragma optimize("w",on )
107 # endif /* BUGGY_MSC6 */
108 #endif /* MSDOS */
109
110 #ifndef STATIC
111 #define STATIC  static
112 #endif
113
114 typedef struct RExC_state_t {
115     U16         flags16;                /* are we folding, multilining? */
116     char        *precomp;               /* uncompiled string. */
117     regexp      *rx;
118     char        *end;                   /* End of input for compile */
119     char        *parse;                 /* Input-scan pointer. */
120     I32         whilem_seen;            /* number of WHILEM in this expr */
121     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
122     I32         naughty;                /* How bad is this pattern? */
123     I32         sawback;                /* Did we see \1, ...? */
124     U32         seen;
125     I32         size;                   /* Code size. */
126     I32         npar;                   /* () count. */
127     I32         extralen;
128     I32         seen_zerolen;
129     I32         seen_evals;
130 #if ADD_TO_REGEXEC
131     char        *starttry;              /* -Dr: where regtry was called. */
132 #define RExC_starttry   (pRExC_state->starttry)
133 #endif
134 } RExC_state_t;
135
136 #define RExC_flags16    (pRExC_state->flags16)
137 #define RExC_precomp    (pRExC_state->precomp)
138 #define RExC_rx         (pRExC_state->rx)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_emit       (pRExC_state->emit)
143 #define RExC_naughty    (pRExC_state->naughty)
144 #define RExC_sawback    (pRExC_state->sawback)
145 #define RExC_seen       (pRExC_state->seen)
146 #define RExC_size       (pRExC_state->size)
147 #define RExC_npar       (pRExC_state->npar)
148 #define RExC_extralen   (pRExC_state->extralen)
149 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151
/* ISMULT1(c): true when the character c is one of the quantifiers
 * '*', '+' or '?'. */
#define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
/* ISMULT2(s): true when the character at pointer s is '*', '+' or '?',
 * or is a '{' for which regcurly(s) returns true.
 * The argument is fully parenthesized before being dereferenced so that
 * pointer expressions such as ISMULT2(p + 1) test p[1] rather than *p + 1.
 * NOTE: s is evaluated more than once; do not pass an expression with
 * side effects. */
#define ISMULT2(s)      (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
        (*(s) == '{' && regcurly(s)))
155 #ifdef atarist
156 #define PERL_META       "^$.[()|?+*\\"
157 #else
158 #define META    "^$.[()|?+*\\"
159 #endif
160
161 #ifdef SPSTART
162 #undef SPSTART          /* dratted cpp namespace... */
163 #endif
164 /*
165  * Flags to be passed up and down.
166  */
167 #define WORST           0       /* Worst case. */
168 #define HASWIDTH        0x1     /* Known to match non-null strings. */
169 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
170 #define SPSTART         0x4     /* Starts with * or +. */
171 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
172
173 /* Length of a variant. */
174
175 typedef struct scan_data_t {
176     I32 len_min;
177     I32 len_delta;
178     I32 pos_min;
179     I32 pos_delta;
180     SV *last_found;
181     I32 last_end;                       /* min value, <0 unless valid. */
182     I32 last_start_min;
183     I32 last_start_max;
184     SV **longest;                       /* Either &l_fixed, or &l_float. */
185     SV *longest_fixed;
186     I32 offset_fixed;
187     SV *longest_float;
188     I32 offset_float_min;
189     I32 offset_float_max;
190     I32 flags;
191     I32 whilem_c;
192     I32 *last_closep;
193     struct regnode_charclass_class *start_class;
194 } scan_data_t;
195
196 /*
197  * Forward declarations for pregcomp()'s friends.
198  */
199
200 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
201                                       0, 0, 0, 0, 0, 0};
202
203 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
204 #define SF_BEFORE_SEOL          0x1
205 #define SF_BEFORE_MEOL          0x2
206 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
207 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208
209 #ifdef NO_UNARY_PLUS
210 #  define SF_FIX_SHIFT_EOL      (0+2)
211 #  define SF_FL_SHIFT_EOL               (0+4)
212 #else
213 #  define SF_FIX_SHIFT_EOL      (+2)
214 #  define SF_FL_SHIFT_EOL               (+4)
215 #endif
216
217 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
219
220 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
221 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
222 #define SF_IS_INF               0x40
223 #define SF_HAS_PAR              0x80
224 #define SF_IN_PAR               0x100
225 #define SF_HAS_EVAL             0x200
226 #define SCF_DO_SUBSTR           0x400
227 #define SCF_DO_STCLASS_AND      0x0800
228 #define SCF_DO_STCLASS_OR       0x1000
229 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
230 #define SCF_WHILEM_VISITED_POS  0x2000
231
232 #define RF_utf8         8
233 #define UTF (PL_reg_flags & RF_utf8)
234 #define LOC (RExC_flags16 & PMf_LOCALE)
235 #define FOLD (RExC_flags16 & PMf_FOLD)
236
237 #define OOB_UNICODE             12345678
238 #define OOB_NAMEDCLASS          -1
239
/* CHR_SVLEN(sv): length of sv, taken from sv_len_utf8() when UTF is set
 * and from SvCUR() otherwise. */
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
/* CHR_DIST(a,b): distance from b to a, taken from utf8_distance() when UTF
 * is set and by plain pointer subtraction otherwise.  Both operands are
 * parenthesized in the subtraction so that expression arguments such as
 * CHR_DIST(p + 4, q + 1) subtract the whole of b. */
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : (a) - (b))
242
243
244 /* length of regex to show in messages that don't mark a position within */
245 #define RegexLengthToShowInErrorMessages 127
246
247 /*
248  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
249  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
250  * op/pragma/warn/regcomp.
251  */
252 #define MARKER1 "HERE"      /* marker as it appears in the description */
253 #define MARKER2 " << HERE "  /* marker as it appears within the regex */
254    
255 #define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
256
257 /*
258  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
259  * arg. Show regex, up to a maximum length. If it's too long, chop and add
260  * "...".
261  */
262 #define FAIL(msg)                                                             \
263     STMT_START {                                                             \
264         char *ellipses = "";                                                 \
265         unsigned len = strlen(RExC_precomp);                                \
266                                                                              \
267         if (!SIZE_ONLY)                                                      \
268             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
269                                                                              \
270         if (len > RegexLengthToShowInErrorMessages) {                        \
271             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
272             len = RegexLengthToShowInErrorMessages - 10;                     \
273             ellipses = "...";                                                \
274         }                                                                    \
275         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
276                    msg, (int)len, RExC_precomp, ellipses);                  \
277     } STMT_END
278
279 /*
280  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
281  * args. Show regex, up to a maximum length. If it's too long, chop and add
282  * "...".
283  */
284 #define FAIL2(pat,msg)                                                        \
285     STMT_START {                                                             \
286         char *ellipses = "";                                                 \
287         unsigned len = strlen(RExC_precomp);                                \
288                                                                              \
289         if (!SIZE_ONLY)                                                      \
290             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
291                                                                              \
292         if (len > RegexLengthToShowInErrorMessages) {                        \
293             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
294             len = RegexLengthToShowInErrorMessages - 10;                     \
295             ellipses = "...";                                                \
296         }                                                                    \
297         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
298                     msg, (int)len, RExC_precomp, ellipses);                \
299     } STMT_END
300
301
302 /*
303  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
304  */
305 #define Simple_vFAIL(m)                                                      \
306     STMT_START {                                                             \
307       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
308                                                                              \
309       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
310                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
311     } STMT_END
312
313 /*
314  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
315  */
316 #define vFAIL(m)                                                             \
317     STMT_START {                                                             \
318       if (!SIZE_ONLY)                                                        \
319             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
320       Simple_vFAIL(m);                                                       \
321     } STMT_END
322
323 /*
324  * Like Simple_vFAIL(), but accepts two arguments.
325  */
326 #define Simple_vFAIL2(m,a1)                                                  \
327     STMT_START {                                                             \
328       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
329                                                                              \
330       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
331                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
332     } STMT_END
333
334 /*
335  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336  */
337 #define vFAIL2(m,a1)                                                         \
338     STMT_START {                                                             \
339       if (!SIZE_ONLY)                                                        \
340             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
341       Simple_vFAIL2(m, a1);                                                  \
342     } STMT_END
343
344
345 /*
346  * Like Simple_vFAIL(), but accepts three arguments.
347  */
348 #define Simple_vFAIL3(m, a1, a2)                                             \
349     STMT_START {                                                             \
350       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
351                                                                              \
352       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
353                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
354     } STMT_END
355
356 /*
357  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
358  */
359 #define vFAIL3(m,a1,a2)                                                      \
360     STMT_START {                                                             \
361       if (!SIZE_ONLY)                                                        \
362             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
363       Simple_vFAIL3(m, a1, a2);                                              \
364     } STMT_END
365
366 /*
367  * Like Simple_vFAIL(), but accepts four arguments.
368  */
369 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
370     STMT_START {                                                             \
371       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
372                                                                              \
373       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
374                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
375     } STMT_END
376
377 /*
378  * Like Simple_vFAIL(), but accepts five arguments.
379  */
380 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
381     STMT_START {                                                             \
382       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
383       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
384                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
385     } STMT_END
386
387
/*
 * vWARN -- issue a WARN_REGEXP warning with message m, reporting position
 * loc within the pattern via REPORT_LOCATION (the text before loc, the
 * MARKER2 marker, then the remainder of the pattern).
 *
 * The macro ends at STMT_END with no trailing line continuation, matching
 * vWARN2/vWARN3/vWARN4, so whatever follows the macro is never spliced
 * into its body.
 */
#define vWARN(loc,m)                                                         \
    STMT_START {                                                             \
        unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));          \
        Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
    } STMT_END
394
395
396 #define vWARN2(loc, m, a1)                                                   \
397     STMT_START {                                                             \
398         unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));          \
399         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
400                  a1,                                                         \
401                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
402     } STMT_END
403
404 #define vWARN3(loc, m, a1, a2)                                               \
405     STMT_START {                                                             \
406       unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc));        \
407         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,                    \
408                  a1, a2,                                                     \
409                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
410     } STMT_END
411
412 #define vWARN4(loc, m, a1, a2, a3)                                           \
413     STMT_START {                                                             \
414       unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));            \
415         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
416                  a1, a2, a3,                                                 \
417                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
418     } STMT_END
419
420
421 /* Allow for side effects in s */
422 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
423
424 static void clear_re(pTHXo_ void *r);
425
426 /* Mark that we cannot extend a found fixed substring at this point.
427    Updata the longest found anchored substring and the longest found
428    floating substrings if needed. */
429
430 STATIC void
431 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
432 {
433     STRLEN l = CHR_SVLEN(data->last_found);
434     STRLEN old_l = CHR_SVLEN(*data->longest);
435     
436     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
437         sv_setsv(*data->longest, data->last_found);
438         if (*data->longest == data->longest_fixed) {
439             data->offset_fixed = l ? data->last_start_min : data->pos_min;
440             if (data->flags & SF_BEFORE_EOL)
441                 data->flags 
442                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
443             else
444                 data->flags &= ~SF_FIX_BEFORE_EOL;
445         }
446         else {
447             data->offset_float_min = l ? data->last_start_min : data->pos_min;
448             data->offset_float_max = (l 
449                                       ? data->last_start_max 
450                                       : data->pos_min + data->pos_delta);
451             if (data->flags & SF_BEFORE_EOL)
452                 data->flags 
453                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
454             else
455                 data->flags &= ~SF_FL_BEFORE_EOL;
456         }
457     }
458     SvCUR_set(data->last_found, 0);
459     data->last_end = -1;
460     data->flags &= ~SF_BEFORE_EOL;
461 }
462
463 /* Can match anything (initialization) */
464 STATIC void
465 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
466 {
467     int value;
468
469     ANYOF_CLASS_ZERO(cl);
470     for (value = 0; value < 256; ++value)
471         ANYOF_BITMAP_SET(cl, value);
472     cl->flags = ANYOF_EOS;
473     if (LOC)
474         cl->flags |= ANYOF_LOCALE;
475 }
476
477 /* Can match anything (initialization) */
478 STATIC int
479 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
480 {
481     int value;
482
483     for (value = 0; value <= ANYOF_MAX; value += 2)
484         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
485             return 1;
486     for (value = 0; value < 256; ++value)
487         if (!ANYOF_BITMAP_TEST(cl, value))
488             return 0;
489     return 1;
490 }
491
492 /* Can match anything (initialization) */
493 STATIC void
494 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
495 {
496     Zero(cl, 1, struct regnode_charclass_class);
497     cl->type = ANYOF;
498     cl_anything(pRExC_state, cl);
499 }
500
501 STATIC void
502 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
503 {
504     Zero(cl, 1, struct regnode_charclass_class);
505     cl->type = ANYOF;
506     cl_anything(pRExC_state, cl);
507     if (LOC)
508         cl->flags |= ANYOF_LOCALE;
509 }
510
511 /* 'And' a given class with another one.  Can create false positives */
512 /* We assume that cl is not inverted */
513 STATIC void
514 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
515          struct regnode_charclass_class *and_with)
516 {
517     if (!(and_with->flags & ANYOF_CLASS)
518         && !(cl->flags & ANYOF_CLASS)
519         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
520         && !(and_with->flags & ANYOF_FOLD)
521         && !(cl->flags & ANYOF_FOLD)) {
522         int i;
523
524         if (and_with->flags & ANYOF_INVERT)
525             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
526                 cl->bitmap[i] &= ~and_with->bitmap[i];
527         else
528             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
529                 cl->bitmap[i] &= and_with->bitmap[i];
530     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
531     if (!(and_with->flags & ANYOF_EOS))
532         cl->flags &= ~ANYOF_EOS;
533 }
534
535 /* 'OR' a given class with another one.  Can create false positives */
536 /* We assume that cl is not inverted */
537 STATIC void
538 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
539 {
540     if (or_with->flags & ANYOF_INVERT) {
541         /* We do not use
542          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
543          *   <= (B1 | !B2) | (CL1 | !CL2)
544          * which is wasteful if CL2 is small, but we ignore CL2:
545          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
546          * XXXX Can we handle case-fold?  Unclear:
547          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
548          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
549          */
550         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
551              && !(or_with->flags & ANYOF_FOLD)
552              && !(cl->flags & ANYOF_FOLD) ) {
553             int i;
554
555             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
556                 cl->bitmap[i] |= ~or_with->bitmap[i];
557         } /* XXXX: logic is complicated otherwise */
558         else {
559             cl_anything(pRExC_state, cl);
560         }
561     } else {
562         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
563         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564              && (!(or_with->flags & ANYOF_FOLD) 
565                  || (cl->flags & ANYOF_FOLD)) ) {
566             int i;
567
568             /* OR char bitmap and class bitmap separately */
569             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570                 cl->bitmap[i] |= or_with->bitmap[i];
571             if (or_with->flags & ANYOF_CLASS) {
572                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
573                     cl->classflags[i] |= or_with->classflags[i];
574                 cl->flags |= ANYOF_CLASS;
575             }
576         }
577         else { /* XXXX: logic is complicated, leave it along for a moment. */
578             cl_anything(pRExC_state, cl);
579         }
580     }
581     if (or_with->flags & ANYOF_EOS)
582         cl->flags |= ANYOF_EOS;
583 }
584
/* REx optimizer.  Converts nodes into quicker variants "in place".
   Finds fixed substrings.  */
587
588 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
589    to the position after last scanned or to NULL. */
590
591 STATIC I32
592 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
593                         /* scanp: Start here (read-write). */
594                         /* deltap: Write maxlen-minlen here. */
595                         /* last: Stop before this one. */
596 {
597     I32 min = 0, pars = 0, code;
598     regnode *scan = *scanp, *next;
599     I32 delta = 0;
600     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
601     int is_inf_internal = 0;            /* The studied chunk is infinite */
602     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
603     scan_data_t data_fake;
604     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
605     
606     while (scan && OP(scan) != END && scan < last) {
607         /* Peephole optimizer: */
608
609         if (PL_regkind[(U8)OP(scan)] == EXACT) {
610             /* Merge several consecutive EXACTish nodes into one. */
611             regnode *n = regnext(scan);
612             U32 stringok = 1;
613 #ifdef DEBUGGING
614             regnode *stop = scan;
615 #endif 
616
617             next = scan + NODE_SZ_STR(scan);
618             /* Skip NOTHING, merge EXACT*. */
619             while (n &&
620                    ( PL_regkind[(U8)OP(n)] == NOTHING || 
621                      (stringok && (OP(n) == OP(scan))))
622                    && NEXT_OFF(n)
623                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
624                 if (OP(n) == TAIL || n > next)
625                     stringok = 0;
626                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
627                     NEXT_OFF(scan) += NEXT_OFF(n);
628                     next = n + NODE_STEP_REGNODE;
629 #ifdef DEBUGGING
630                     if (stringok)
631                         stop = n;
632 #endif 
633                     n = regnext(n);
634                 }
635                 else {
636                     int oldl = STR_LEN(scan);
637                     regnode *nnext = regnext(n);
638                     
639                     if (oldl + STR_LEN(n) > U8_MAX) 
640                         break;
641                     NEXT_OFF(scan) += NEXT_OFF(n);
642                     STR_LEN(scan) += STR_LEN(n);
643                     next = n + NODE_SZ_STR(n);
644                     /* Now we can overwrite *n : */
645                     Move(STRING(n), STRING(scan) + oldl,
646                          STR_LEN(n), char);
647 #ifdef DEBUGGING
648                     if (stringok)
649                         stop = next - 1;
650 #endif 
651                     n = nnext;
652                 }
653             }
654 #ifdef DEBUGGING
655             /* Allow dumping */
656             n = scan + NODE_SZ_STR(scan);
657             while (n <= stop) {
658                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
659                     OP(n) = OPTIMIZED;
660                     NEXT_OFF(n) = 0;
661                 }
662                 n++;
663             }
664 #endif
665         }
666         /* Follow the next-chain of the current node and optimize
667            away all the NOTHINGs from it.  */
668         if (OP(scan) != CURLYX) {
669             int max = (reg_off_by_arg[OP(scan)]
670                        ? I32_MAX
671                        /* I32 may be smaller than U16 on CRAYs! */
672                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
673             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
674             int noff;
675             regnode *n = scan;
676             
677             /* Skip NOTHING and LONGJMP. */
678             while ((n = regnext(n))
679                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
680                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
681                    && off + noff < max)
682                 off += noff;
683             if (reg_off_by_arg[OP(scan)])
684                 ARG(scan) = off;
685             else 
686                 NEXT_OFF(scan) = off;
687         }
688         /* The principal pseudo-switch.  Cannot be a switch, since we
689            look into several different things.  */
690         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 
691                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
692             next = regnext(scan);
693             code = OP(scan);
694             
695             if (OP(next) == code || code == IFTHEN || code == SUSPEND) { 
696                 I32 max1 = 0, min1 = I32_MAX, num = 0;
697                 struct regnode_charclass_class accum;
698                 
699                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
700                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
701                 if (flags & SCF_DO_STCLASS)
702                     cl_init_zero(pRExC_state, &accum);
703                 while (OP(scan) == code) {
704                     I32 deltanext, minnext, f = 0, fake;
705                     struct regnode_charclass_class this_class;
706
707                     num++;
708                     data_fake.flags = 0;
709                     if (data) {             
710                         data_fake.whilem_c = data->whilem_c;
711                         data_fake.last_closep = data->last_closep;
712                     }
713                     else
714                         data_fake.last_closep = &fake;
715                     next = regnext(scan);
716                     scan = NEXTOPER(scan);
717                     if (code != BRANCH)
718                         scan = NEXTOPER(scan);
719                     if (flags & SCF_DO_STCLASS) {
720                         cl_init(pRExC_state, &this_class);
721                         data_fake.start_class = &this_class;
722                         f = SCF_DO_STCLASS_AND;
723                     }               
724                     if (flags & SCF_WHILEM_VISITED_POS)
725                         f |= SCF_WHILEM_VISITED_POS;
726                     /* we suppose the run is continuous, last=next...*/
727                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
728                                           next, &data_fake, f);
729                     if (min1 > minnext) 
730                         min1 = minnext;
731                     if (max1 < minnext + deltanext)
732                         max1 = minnext + deltanext;
733                     if (deltanext == I32_MAX)
734                         is_inf = is_inf_internal = 1;
735                     scan = next;
736                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
737                         pars++;
738                     if (data && (data_fake.flags & SF_HAS_EVAL))
739                         data->flags |= SF_HAS_EVAL;
740                     if (data)
741                         data->whilem_c = data_fake.whilem_c;
742                     if (flags & SCF_DO_STCLASS)
743                         cl_or(pRExC_state, &accum, &this_class);
744                     if (code == SUSPEND) 
745                         break;
746                 }
747                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
748                     min1 = 0;
749                 if (flags & SCF_DO_SUBSTR) {
750                     data->pos_min += min1;
751                     data->pos_delta += max1 - min1;
752                     if (max1 != min1 || is_inf)
753                         data->longest = &(data->longest_float);
754                 }
755                 min += min1;
756                 delta += max1 - min1;
757                 if (flags & SCF_DO_STCLASS_OR) {
758                     cl_or(pRExC_state, data->start_class, &accum);
759                     if (min1) {
760                         cl_and(data->start_class, &and_with);
761                         flags &= ~SCF_DO_STCLASS;
762                     }
763                 }
764                 else if (flags & SCF_DO_STCLASS_AND) {
765                     if (min1) {
766                         cl_and(data->start_class, &accum);
767                         flags &= ~SCF_DO_STCLASS;
768                     }
769                     else {
770                         /* Switch to OR mode: cache the old value of 
771                          * data->start_class */
772                         StructCopy(data->start_class, &and_with,
773                                    struct regnode_charclass_class);
774                         flags &= ~SCF_DO_STCLASS_AND;
775                         StructCopy(&accum, data->start_class,
776                                    struct regnode_charclass_class);
777                         flags |= SCF_DO_STCLASS_OR;
778                         data->start_class->flags |= ANYOF_EOS;
779                     }
780                 }
781             }
782             else if (code == BRANCHJ)   /* single branch is optimized. */
783                 scan = NEXTOPER(NEXTOPER(scan));
784             else                        /* single branch is optimized. */
785                 scan = NEXTOPER(scan);
786             continue;
787         }
788         else if (OP(scan) == EXACT) {
789             I32 l = STR_LEN(scan);
790             if (UTF) {
791                 unsigned char *s = (unsigned char *)STRING(scan);
792                 unsigned char *e = s + l;
793                 I32 newl = 0;
794                 while (s < e) {
795                     newl++;
796                     s += UTF8SKIP(s);
797                 }
798                 l = newl;
799             }
800             min += l;
801             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
802                 /* The code below prefers earlier match for fixed
803                    offset, later match for variable offset.  */
804                 if (data->last_end == -1) { /* Update the start info. */
805                     data->last_start_min = data->pos_min;
806                     data->last_start_max = is_inf
807                         ? I32_MAX : data->pos_min + data->pos_delta; 
808                 }
809                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
810                 data->last_end = data->pos_min + l;
811                 data->pos_min += l; /* As in the first entry. */
812                 data->flags &= ~SF_BEFORE_EOL;
813             }
814             if (flags & SCF_DO_STCLASS_AND) {
815                 /* Check whether it is compatible with what we know already! */
816                 int compat = 1;
817
818                 if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
819                     && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
820                     && (!(data->start_class->flags & ANYOF_FOLD)
821                         || !ANYOF_BITMAP_TEST(data->start_class,
822                                               PL_fold[*(U8*)STRING(scan)])))
823                     compat = 0;
824                 ANYOF_CLASS_ZERO(data->start_class);
825                 ANYOF_BITMAP_ZERO(data->start_class);
826                 if (compat)
827                     ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
828                 data->start_class->flags &= ~ANYOF_EOS;
829             }
830             else if (flags & SCF_DO_STCLASS_OR) {
831                 /* false positive possible if the class is case-folded */
832                 ANYOF_BITMAP_SET(data->start_class, *STRING(scan));     
833                 data->start_class->flags &= ~ANYOF_EOS;
834                 cl_and(data->start_class, &and_with);
835             }
836             flags &= ~SCF_DO_STCLASS;
837         }
838         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
839             I32 l = STR_LEN(scan);
840
841             /* Search for fixed substrings supports EXACT only. */
842             if (flags & SCF_DO_SUBSTR) 
843                 scan_commit(pRExC_state, data);
844             if (UTF) {
845                 unsigned char *s = (unsigned char *)STRING(scan);
846                 unsigned char *e = s + l;
847                 I32 newl = 0;
848                 while (s < e) {
849                     newl++;
850                     s += UTF8SKIP(s);
851                 }
852                 l = newl;
853             }
854             min += l;
855             if (data && (flags & SCF_DO_SUBSTR))
856                 data->pos_min += l;
857             if (flags & SCF_DO_STCLASS_AND) {
858                 /* Check whether it is compatible with what we know already! */
859                 int compat = 1;
860
861                 if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
862                     && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
863                     && !ANYOF_BITMAP_TEST(data->start_class, 
864                                           PL_fold[*(U8*)STRING(scan)]))
865                     compat = 0;
866                 ANYOF_CLASS_ZERO(data->start_class);
867                 ANYOF_BITMAP_ZERO(data->start_class);
868                 if (compat) {
869                     ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
870                     data->start_class->flags &= ~ANYOF_EOS;
871                     data->start_class->flags |= ANYOF_FOLD;
872                     if (OP(scan) == EXACTFL)
873                         data->start_class->flags |= ANYOF_LOCALE;
874                 }
875             }
876             else if (flags & SCF_DO_STCLASS_OR) {
877                 if (data->start_class->flags & ANYOF_FOLD) {
878                     /* false positive possible if the class is case-folded.
879                        Assume that the locale settings are the same... */
880                     ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 
881                     data->start_class->flags &= ~ANYOF_EOS;
882                 }
883                 cl_and(data->start_class, &and_with);
884             }
885             flags &= ~SCF_DO_STCLASS;
886         }
887         else if (strchr((char*)PL_varies,OP(scan))) {
888             I32 mincount, maxcount, minnext, deltanext, fl;
889             I32 f = flags, pos_before = 0;
890             regnode *oscan = scan;
891             struct regnode_charclass_class this_class;
892             struct regnode_charclass_class *oclass = NULL;
893
894             switch (PL_regkind[(U8)OP(scan)]) {
895             case WHILEM:                /* End of (?:...)* . */
896                 scan = NEXTOPER(scan);
897                 goto finish;
898             case PLUS:
899                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
900                     next = NEXTOPER(scan);
901                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
902                         mincount = 1; 
903                         maxcount = REG_INFTY; 
904                         next = regnext(scan);
905                         scan = NEXTOPER(scan);
906                         goto do_curly;
907                     }
908                 }
909                 if (flags & SCF_DO_SUBSTR)
910                     data->pos_min++;
911                 min++;
912                 /* Fall through. */
913             case STAR:
914                 if (flags & SCF_DO_STCLASS) {
915                     mincount = 0;
916                     maxcount = REG_INFTY; 
917                     next = regnext(scan);
918                     scan = NEXTOPER(scan);
919                     goto do_curly;
920                 }
921                 is_inf = is_inf_internal = 1; 
922                 scan = regnext(scan);
923                 if (flags & SCF_DO_SUBSTR) {
924                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
925                     data->longest = &(data->longest_float);
926                 }
927                 goto optimize_curly_tail;
928             case CURLY:
929                 mincount = ARG1(scan); 
930                 maxcount = ARG2(scan);
931                 next = regnext(scan);
932                 if (OP(scan) == CURLYX) {
933                     I32 lp = (data ? *(data->last_closep) : 0);
934
935                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
936                 }
937                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
938               do_curly:
939                 if (flags & SCF_DO_SUBSTR) {
940                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
941                     pos_before = data->pos_min;
942                 }
943                 if (data) {
944                     fl = data->flags;
945                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
946                     if (is_inf)
947                         data->flags |= SF_IS_INF;
948                 }
949                 if (flags & SCF_DO_STCLASS) {
950                     cl_init(pRExC_state, &this_class);
951                     oclass = data->start_class;
952                     data->start_class = &this_class;
953                     f |= SCF_DO_STCLASS_AND;
954                     f &= ~SCF_DO_STCLASS_OR;
955                 }
956                 /* These are the cases when once a subexpression
957                    fails at a particular position, it cannot succeed
958                    even after backtracking at the enclosing scope.
959                    
960                    XXXX what if minimal match and we are at the
961                         initial run of {n,m}? */
962                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
963                     f &= ~SCF_WHILEM_VISITED_POS;
964
965                 /* This will finish on WHILEM, setting scan, or on NULL: */
966                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, 
967                                       mincount == 0 
968                                         ? (f & ~SCF_DO_SUBSTR) : f);
969
970                 if (flags & SCF_DO_STCLASS)
971                     data->start_class = oclass;
972                 if (mincount == 0 || minnext == 0) {
973                     if (flags & SCF_DO_STCLASS_OR) {
974                         cl_or(pRExC_state, data->start_class, &this_class);
975                     }
976                     else if (flags & SCF_DO_STCLASS_AND) {
977                         /* Switch to OR mode: cache the old value of 
978                          * data->start_class */
979                         StructCopy(data->start_class, &and_with,
980                                    struct regnode_charclass_class);
981                         flags &= ~SCF_DO_STCLASS_AND;
982                         StructCopy(&this_class, data->start_class,
983                                    struct regnode_charclass_class);
984                         flags |= SCF_DO_STCLASS_OR;
985                         data->start_class->flags |= ANYOF_EOS;
986                     }
987                 } else {                /* Non-zero len */
988                     if (flags & SCF_DO_STCLASS_OR) {
989                         cl_or(pRExC_state, data->start_class, &this_class);
990                         cl_and(data->start_class, &and_with);
991                     }
992                     else if (flags & SCF_DO_STCLASS_AND)
993                         cl_and(data->start_class, &this_class);
994                     flags &= ~SCF_DO_STCLASS;
995                 }
996                 if (!scan)              /* It was not CURLYX, but CURLY. */
997                     scan = next;
998                 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) 
999                     && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1000                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1001                 {
1002                     vWARN(RExC_parse,
1003                           "Quantifier unexpected on zero-length expression");
1004                 }
1005
1006                 min += minnext * mincount;
1007                 is_inf_internal |= ((maxcount == REG_INFTY 
1008                                      && (minnext + deltanext) > 0)
1009                                     || deltanext == I32_MAX);
1010                 is_inf |= is_inf_internal;
1011                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1012
1013                 /* Try powerful optimization CURLYX => CURLYN. */
1014                 if (  OP(oscan) == CURLYX && data 
1015                       && data->flags & SF_IN_PAR
1016                       && !(data->flags & SF_HAS_EVAL)
1017                       && !deltanext && minnext == 1 ) {
1018                     /* Try to optimize to CURLYN.  */
1019                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1020                     regnode *nxt1 = nxt, *nxt2;
1021
1022                     /* Skip open. */
1023                     nxt = regnext(nxt);
1024                     if (!strchr((char*)PL_simple,OP(nxt))
1025                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1026                              && STR_LEN(nxt) == 1)) 
1027                         goto nogo;
1028                     nxt2 = nxt;
1029                     nxt = regnext(nxt);
1030                     if (OP(nxt) != CLOSE) 
1031                         goto nogo;
1032                     /* Now we know that nxt2 is the only contents: */
1033                     oscan->flags = ARG(nxt);
1034                     OP(oscan) = CURLYN;
1035                     OP(nxt1) = NOTHING; /* was OPEN. */
1036 #ifdef DEBUGGING
1037                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1038                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1039                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1040                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1041                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1042                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1043 #endif 
1044                 }
1045               nogo:
1046
1047                 /* Try optimization CURLYX => CURLYM. */
1048                 if (  OP(oscan) == CURLYX && data 
1049                       && !(data->flags & SF_HAS_PAR)
1050                       && !(data->flags & SF_HAS_EVAL)
1051                       && !deltanext  ) {
1052                     /* XXXX How to optimize if data == 0? */
1053                     /* Optimize to a simpler form.  */
1054                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1055                     regnode *nxt2;
1056
1057                     OP(oscan) = CURLYM;
1058                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1059                             && (OP(nxt2) != WHILEM)) 
1060                         nxt = nxt2;
1061                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1062                     /* Need to optimize away parenths. */
1063                     if (data->flags & SF_IN_PAR) {
1064                         /* Set the parenth number.  */
1065                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1066
1067                         if (OP(nxt) != CLOSE) 
1068                             FAIL("Panic opt close");
1069                         oscan->flags = ARG(nxt);
1070                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1071                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1072 #ifdef DEBUGGING
1073                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1074                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1075                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1076                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1077 #endif 
1078 #if 0
1079                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1080                             regnode *nnxt = regnext(nxt1);
1081                             
1082                             if (nnxt == nxt) {
1083                                 if (reg_off_by_arg[OP(nxt1)])
1084                                     ARG_SET(nxt1, nxt2 - nxt1);
1085                                 else if (nxt2 - nxt1 < U16_MAX)
1086                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1087                                 else
1088                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1089                             }
1090                             nxt1 = nnxt;
1091                         }
1092 #endif
1093                         /* Optimize again: */
1094                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt, 
1095                                     NULL, 0);
1096                     }
1097                     else
1098                         oscan->flags = 0;
1099                 }
1100                 else if ((OP(oscan) == CURLYX)
1101                          && (flags & SCF_WHILEM_VISITED_POS)
1102                          /* See the comment on a similar expression above.
1103                             However, this time it not a subexpression
1104                             we care about, but the expression itself. */
1105                          && (maxcount == REG_INFTY)
1106                          && data && ++data->whilem_c < 16) {
1107                     /* This stays as CURLYX, we can put the count/of pair. */
1108                     /* Find WHILEM (as in regexec.c) */
1109                     regnode *nxt = oscan + NEXT_OFF(oscan);
1110
1111                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1112                         nxt += ARG(nxt);
1113                     PREVOPER(nxt)->flags = data->whilem_c
1114                         | (RExC_whilem_seen << 4); /* On WHILEM */
1115                 }
1116                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 
1117                     pars++;
1118                 if (flags & SCF_DO_SUBSTR) {
1119                     SV *last_str = Nullsv;
1120                     int counted = mincount != 0;
1121
1122                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1123                         I32 b = pos_before >= data->last_start_min 
1124                             ? pos_before : data->last_start_min;
1125                         STRLEN l;
1126                         char *s = SvPV(data->last_found, l);
1127                         I32 old = b - data->last_start_min;
1128
1129                         if (UTF)
1130                             old = utf8_hop((U8*)s, old) - (U8*)s;
1131                         
1132                         l -= old;
1133                         /* Get the added string: */
1134                         last_str = newSVpvn(s  + old, l);
1135                         if (deltanext == 0 && pos_before == b) {
1136                             /* What was added is a constant string */
1137                             if (mincount > 1) {
1138                                 SvGROW(last_str, (mincount * l) + 1);
1139                                 repeatcpy(SvPVX(last_str) + l, 
1140                                           SvPVX(last_str), l, mincount - 1);
1141                                 SvCUR(last_str) *= mincount;
1142                                 /* Add additional parts. */
1143                                 SvCUR_set(data->last_found, 
1144                                           SvCUR(data->last_found) - l);
1145                                 sv_catsv(data->last_found, last_str);
1146                                 data->last_end += l * (mincount - 1);
1147                             }
1148                         } else {
1149                             /* start offset must point into the last copy */
1150                             data->last_start_min += minnext * (mincount - 1);
1151                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1152                                 * (minnext + data->pos_delta);
1153                         }
1154                     }
1155                     /* It is counted once already... */
1156                     data->pos_min += minnext * (mincount - counted);
1157                     data->pos_delta += - counted * deltanext +
1158                         (minnext + deltanext) * maxcount - minnext * mincount;
1159                     if (mincount != maxcount) {
1160                          /* Cannot extend fixed substrings found inside
1161                             the group.  */
1162                         scan_commit(pRExC_state,data);
1163                         if (mincount && last_str) {
1164                             sv_setsv(data->last_found, last_str);
1165                             data->last_end = data->pos_min;
1166                             data->last_start_min = 
1167                                 data->pos_min - CHR_SVLEN(last_str);
1168                             data->last_start_max = is_inf 
1169                                 ? I32_MAX 
1170                                 : data->pos_min + data->pos_delta
1171                                 - CHR_SVLEN(last_str);
1172                         }
1173                         data->longest = &(data->longest_float);
1174                     }
1175                     SvREFCNT_dec(last_str);
1176                 }
1177                 if (data && (fl & SF_HAS_EVAL))
1178                     data->flags |= SF_HAS_EVAL;
1179               optimize_curly_tail:
1180                 if (OP(oscan) != CURLYX) {
1181                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1182                            && NEXT_OFF(next))
1183                         NEXT_OFF(oscan) += NEXT_OFF(next);
1184                 }
1185                 continue;
1186             default:                    /* REF and CLUMP only? */
1187                 if (flags & SCF_DO_SUBSTR) {
1188                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1189                     data->longest = &(data->longest_float);
1190                 }
1191                 is_inf = is_inf_internal = 1;
1192                 if (flags & SCF_DO_STCLASS_OR)
1193                     cl_anything(pRExC_state, data->start_class);
1194                 flags &= ~SCF_DO_STCLASS;
1195                 break;
1196             }
1197         }
1198         else if (strchr((char*)PL_simple,OP(scan))) {
1199             int value;
1200
1201             if (flags & SCF_DO_SUBSTR) {
1202                 scan_commit(pRExC_state,data);
1203                 data->pos_min++;
1204             }
1205             min++;
1206             if (flags & SCF_DO_STCLASS) {
1207                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1208
1209                 /* Some of the logic below assumes that switching
1210                    locale on will only add false positives. */
1211                 switch (PL_regkind[(U8)OP(scan)]) {
1212                 case SANY:
1213                 default:
1214                   do_default:
1215                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1216                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1217                         cl_anything(pRExC_state, data->start_class);
1218                     break;
1219                 case REG_ANY:
1220                     if (OP(scan) == SANY)
1221                         goto do_default;
1222                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1223                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1224                                  || (data->start_class->flags & ANYOF_CLASS));
1225                         cl_anything(pRExC_state, data->start_class);
1226                     }
1227                     if (flags & SCF_DO_STCLASS_AND || !value)
1228                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1229                     break;
1230                 case ANYOF:
1231                     if (flags & SCF_DO_STCLASS_AND)
1232                         cl_and(data->start_class,
1233                                (struct regnode_charclass_class*)scan);
1234                     else
1235                         cl_or(pRExC_state, data->start_class,
1236                               (struct regnode_charclass_class*)scan);
1237                     break;
1238                 case ALNUM:
1239                     if (flags & SCF_DO_STCLASS_AND) {
1240                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1241                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1242                             for (value = 0; value < 256; value++)
1243                                 if (!isALNUM(value))
1244                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1245                         }
1246                     }
1247                     else {
1248                         if (data->start_class->flags & ANYOF_LOCALE)
1249                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1250                         else {
1251                             for (value = 0; value < 256; value++)
1252                                 if (isALNUM(value))
1253                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1254                         }
1255                     }
1256                     break;
1257                 case ALNUML:
1258                     if (flags & SCF_DO_STCLASS_AND) {
1259                         if (data->start_class->flags & ANYOF_LOCALE)
1260                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1261                     }
1262                     else {
1263                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1264                         data->start_class->flags |= ANYOF_LOCALE;
1265                     }
1266                     break;
1267                 case NALNUM:
1268                     if (flags & SCF_DO_STCLASS_AND) {
1269                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1270                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1271                             for (value = 0; value < 256; value++)
1272                                 if (isALNUM(value))
1273                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1274                         }
1275                     }
1276                     else {
1277                         if (data->start_class->flags & ANYOF_LOCALE)
1278                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1279                         else {
1280                             for (value = 0; value < 256; value++)
1281                                 if (!isALNUM(value))
1282                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1283                         }
1284                     }
1285                     break;
1286                 case NALNUML:
1287                     if (flags & SCF_DO_STCLASS_AND) {
1288                         if (data->start_class->flags & ANYOF_LOCALE)
1289                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1290                     }
1291                     else {
1292                         data->start_class->flags |= ANYOF_LOCALE;
1293                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1294                     }
1295                     break;
1296                 case SPACE:
1297                     if (flags & SCF_DO_STCLASS_AND) {
1298                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1299                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1300                             for (value = 0; value < 256; value++)
1301                                 if (!isSPACE(value))
1302                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1303                         }
1304                     }
1305                     else {
1306                         if (data->start_class->flags & ANYOF_LOCALE)
1307                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1308                         else {
1309                             for (value = 0; value < 256; value++)
1310                                 if (isSPACE(value))
1311                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1312                         }
1313                     }
1314                     break;
1315                 case SPACEL:
1316                     if (flags & SCF_DO_STCLASS_AND) {
1317                         if (data->start_class->flags & ANYOF_LOCALE)
1318                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1319                     }
1320                     else {
1321                         data->start_class->flags |= ANYOF_LOCALE;
1322                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1323                     }
1324                     break;
1325                 case NSPACE:
1326                     if (flags & SCF_DO_STCLASS_AND) {
1327                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1328                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1329                             for (value = 0; value < 256; value++)
1330                                 if (isSPACE(value))
1331                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1332                         }
1333                     }
1334                     else {
1335                         if (data->start_class->flags & ANYOF_LOCALE)
1336                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1337                         else {
1338                             for (value = 0; value < 256; value++)
1339                                 if (!isSPACE(value))
1340                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1341                         }
1342                     }
1343                     break;
1344                 case NSPACEL:
1345                     if (flags & SCF_DO_STCLASS_AND) {
1346                         if (data->start_class->flags & ANYOF_LOCALE) {
1347                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1348                             for (value = 0; value < 256; value++)
1349                                 if (!isSPACE(value))
1350                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1351                         }
1352                     }
1353                     else {
1354                         data->start_class->flags |= ANYOF_LOCALE;
1355                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1356                     }
1357                     break;
1358                 case DIGIT:
1359                     if (flags & SCF_DO_STCLASS_AND) {
1360                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1361                         for (value = 0; value < 256; value++)
1362                             if (!isDIGIT(value))
1363                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1364                     }
1365                     else {
1366                         if (data->start_class->flags & ANYOF_LOCALE)
1367                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1368                         else {
1369                             for (value = 0; value < 256; value++)
1370                                 if (isDIGIT(value))
1371                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1372                         }
1373                     }
1374                     break;
1375                 case NDIGIT:
1376                     if (flags & SCF_DO_STCLASS_AND) {
1377                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1378                         for (value = 0; value < 256; value++)
1379                             if (isDIGIT(value))
1380                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1381                     }
1382                     else {
1383                         if (data->start_class->flags & ANYOF_LOCALE)
1384                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1385                         else {
1386                             for (value = 0; value < 256; value++)
1387                                 if (!isDIGIT(value))
1388                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1389                         }
1390                     }
1391                     break;
1392                 }
1393                 if (flags & SCF_DO_STCLASS_OR)
1394                     cl_and(data->start_class, &and_with);
1395                 flags &= ~SCF_DO_STCLASS;
1396             }
1397         }
1398         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1399             data->flags |= (OP(scan) == MEOL
1400                             ? SF_BEFORE_MEOL
1401                             : SF_BEFORE_SEOL);
1402         }
1403         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1404                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1405                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1406                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1407             /* Lookahead/lookbehind */
1408             I32 deltanext, minnext, fake = 0;
1409             regnode *nscan;
1410             struct regnode_charclass_class intrnl;
1411             int f = 0;
1412
1413             data_fake.flags = 0;
1414             if (data) {             
1415                 data_fake.whilem_c = data->whilem_c;
1416                 data_fake.last_closep = data->last_closep;
1417             }
1418             else
1419                 data_fake.last_closep = &fake;
1420             if ( flags & SCF_DO_STCLASS && !scan->flags
1421                  && OP(scan) == IFMATCH ) { /* Lookahead */
1422                 cl_init(pRExC_state, &intrnl);
1423                 data_fake.start_class = &intrnl;
1424                 f |= SCF_DO_STCLASS_AND;
1425             }
1426             if (flags & SCF_WHILEM_VISITED_POS)
1427                 f |= SCF_WHILEM_VISITED_POS;
1428             next = regnext(scan);
1429             nscan = NEXTOPER(NEXTOPER(scan));
1430             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1431             if (scan->flags) {
1432                 if (deltanext) {
1433                     vFAIL("Variable length lookbehind not implemented");
1434                 }
1435                 else if (minnext > U8_MAX) {
1436                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1437                 }
1438                 scan->flags = minnext;
1439             }
1440             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1441                 pars++;
1442             if (data && (data_fake.flags & SF_HAS_EVAL))
1443                 data->flags |= SF_HAS_EVAL;
1444             if (data)
1445                 data->whilem_c = data_fake.whilem_c;
1446             if (f & SCF_DO_STCLASS_AND) {
1447                 int was = (data->start_class->flags & ANYOF_EOS);
1448
1449                 cl_and(data->start_class, &intrnl);
1450                 if (was)
1451                     data->start_class->flags |= ANYOF_EOS;
1452             }
1453         }
1454         else if (OP(scan) == OPEN) {
1455             pars++;
1456         }
1457         else if (OP(scan) == CLOSE) {
1458             if (ARG(scan) == is_par) {
1459                 next = regnext(scan);
1460
1461                 if ( next && (OP(next) != WHILEM) && next < last)
1462                     is_par = 0;         /* Disable optimization */
1463             }
1464             if (data)
1465                 *(data->last_closep) = ARG(scan);
1466         }
1467         else if (OP(scan) == EVAL) {
1468                 if (data)
1469                     data->flags |= SF_HAS_EVAL;
1470         }
1471         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1472                 if (flags & SCF_DO_SUBSTR) {
1473                     scan_commit(pRExC_state,data);
1474                     data->longest = &(data->longest_float);
1475                 }
1476                 is_inf = is_inf_internal = 1;
1477                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1478                     cl_anything(pRExC_state, data->start_class);
1479                 flags &= ~SCF_DO_STCLASS;
1480         }
1481         /* Else: zero-length, ignore. */
1482         scan = regnext(scan);
1483     }
1484
1485   finish:
1486     *scanp = scan;
1487     *deltap = is_inf_internal ? I32_MAX : delta;
1488     if (flags & SCF_DO_SUBSTR && is_inf) 
1489         data->pos_delta = I32_MAX - data->pos_min;
1490     if (is_par > U8_MAX)
1491         is_par = 0;
1492     if (is_par && pars==1 && data) {
1493         data->flags |= SF_IN_PAR;
1494         data->flags &= ~SF_HAS_PAR;
1495     }
1496     else if (pars && data) {
1497         data->flags |= SF_HAS_PAR;
1498         data->flags &= ~SF_IN_PAR;
1499     }
1500     if (flags & SCF_DO_STCLASS_OR)
1501         cl_and(data->start_class, &and_with);
1502     return min;
1503 }
1504
1505 STATIC I32
1506 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1507 {
1508     if (RExC_rx->data) {
1509         Renewc(RExC_rx->data, 
1510                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), 
1511                char, struct reg_data);
1512         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1513         RExC_rx->data->count += n;
1514     }
1515     else {
1516         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1517              char, struct reg_data);
1518         New(1208, RExC_rx->data->what, n, U8);
1519         RExC_rx->data->count = n;
1520     }
1521     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1522     return RExC_rx->data->count - n;
1523 }
1524
1525 void
1526 Perl_reginitcolors(pTHX)
1527 {
1528     int i = 0;
1529     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1530             
1531     if (s) {
1532         PL_colors[0] = s = savepv(s);
1533         while (++i < 6) {
1534             s = strchr(s, '\t');
1535             if (s) {
1536                 *s = '\0';
1537                 PL_colors[i] = ++s;
1538             }
1539             else
1540                 PL_colors[i] = s = "";
1541         }
1542     } else {
1543         while (i < 6) 
1544             PL_colors[i++] = "";
1545     }
1546     PL_colorset = 1;
1547 }
1548
1549
1550 /*
1551  - pregcomp - compile a regular expression into internal code
1552  *
1553  * We can't allocate space until we know how big the compiled form will be,
1554  * but we can't compile it (and thus know how big it is) until we've got a
1555  * place to put the code.  So we cheat:  we compile it twice, once with code
1556  * generation turned off and size counting turned on, and once "for real".
1557  * This also means that we don't allocate space until we are sure that the
1558  * thing really will compile successfully, and we never have to move the
1559  * code and thus invalidate pointers into it.  (Note that it has to be in
1560  * one piece because free() must be able to free it all.) [NB: not true in perl]
1561  *
1562  * Beware that the optimization-preparation code in here knows about some
1563  * of the structure of the compiled regexp.  [I'll say.]
1564  */
/*
 * Perl_pregcomp - compile the pattern text in [exp, xend) into a regexp.
 *
 * exp  - start of the pattern text; a NULL value is reported through FAIL().
 * xend - one past the last pattern character (the length is xend - exp).
 * pm   - the match op; op_pmdynflags and op_pmflags are read, and
 *        op_pmflags is overwritten with RExC_flags16 after the second pass.
 *
 * Returns the newly allocated regexp, or NULL when reg() returns NULL in
 * either pass.
 *
 * The work is done in three stages: a sizing pass of reg() with RExC_emit
 * pointing at PL_regdummy, an emitting pass of reg() into r->program, and
 * an analysis stage (study_chunk) that fills in the optimization fields of
 * the regexp (reganch, regstclass, anchored/float/check substrings, minlen).
 */
1565 regexp *
1566 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1567 {
1568     register regexp *r;
1569     regnode *scan;
1570     regnode *first;
1571     I32 flags;
1572     I32 minlen = 0;
1573     I32 sawplus = 0;
1574     I32 sawopen = 0;
1575     scan_data_t data;
1576     RExC_state_t RExC_state;
1577     RExC_state_t *pRExC_state = &RExC_state;
1578
1579     if (exp == NULL)
1580         FAIL("NULL regexp argument");
1581
1582     /* XXXX This looks very suspicious... */
         /* Note the asymmetry: the UTF8 case ORs RF_utf8 into PL_reg_flags,
          * while the other case resets every flag to 0. */
1583     if (pm->op_pmdynflags & PMdf_UTF8) {
1584         PL_reg_flags |= RF_utf8;
1585     }
1586     else
1587         PL_reg_flags = 0;
1588
         /* Private copy of the pattern text; it later becomes r->precomp. */
1589     RExC_precomp = savepvn(exp, xend - exp);
1590     DEBUG_r(if (!PL_colorset) reginitcolors());
1591     DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1592                       PL_colors[4],PL_colors[5],PL_colors[0],
1593                       (int)(xend - exp), RExC_precomp, PL_colors[1]));
1594     RExC_flags16 = pm->op_pmflags;
1595     RExC_sawback = 0;
1596
1597     RExC_seen = 0;
         /* The counter starts at -1 when the pattern text begins with '^'. */
1598     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1599     RExC_seen_evals = 0;
1600     RExC_extralen = 0;
1601
1602     /* First pass: determine size, legality. */
         /* Nothing is allocated yet, so the emit pointer is aimed at the
          * dummy node PL_regdummy and only RExC_size is accumulated. */
1603     RExC_parse = exp;
1604     RExC_end = xend;
1605     RExC_naughty = 0;
1606     RExC_npar = 1;
1607     RExC_size = 0L;
1608     RExC_emit = &PL_regdummy;
1609     RExC_whilem_seen = 0;
1610 #if 0 /* REGC() is (currently) a NOP at the first pass.
1611        * Clever compilers notice this and complain. --jhi */
1612     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1613 #endif
1614     if (reg(pRExC_state, 0, &flags) == NULL) {
             /* Sizing pass failed: release the pattern copy before leaving. */
1615         Safefree(RExC_precomp);
1616         RExC_precomp = Nullch;
1617         return(NULL);
1618     }
1619     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1620
1621     /* Small enough for pointer-storage convention?
1622        If extralen==0, this means that we will not need long jumps. */
1623     if (RExC_size >= 0x10000L && RExC_extralen)
1624         RExC_size += RExC_extralen;
1625     else
1626         RExC_extralen = 0;
         /* Values above 15 are clamped.  NOTE(review): the reason for this
          * limit is not visible in this function. */
1627     if (RExC_whilem_seen > 15)
1628         RExC_whilem_seen = 15;
1629
1630     /* Allocate space and initialize. */
         /* One chunk holds the regexp header followed by RExC_size regnodes. */
1631     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1632          char, regexp);
1633     if (r == NULL)
1634         FAIL("Regexp out of space");
1635
1636 #ifdef DEBUGGING
1637     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1638     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1639 #endif
1640     r->refcnt = 1;
1641     r->prelen = xend - exp;
1642     r->precomp = RExC_precomp;
1643     r->subbeg = NULL;
1644     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1645     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1646
1647     r->substrs = 0;                     /* Useful during FAIL. */
1648     r->startp = 0;                      /* Useful during FAIL. */
1649     r->endp = 0;                        /* Useful during FAIL. */
1650
1651     RExC_rx = r;
1652
1653     /* Second pass: emit code. */
         /* The parse state is rewound and reg() runs again, this time
          * writing nodes into r->program. */
1654     RExC_parse = exp;
1655     RExC_end = xend;
1656     RExC_naughty = 0;
1657     RExC_npar = 1;
1658     RExC_emit = r->program;
1659     /* Store the count of eval-groups for security checks: */
         /* The count is saturated at U16_MAX and kept in the next_off field
          * of the very first node, which then receives REG_MAGIC. */
1660     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1661     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1662     r->data = 0;
         /* NOTE(review): this early return does not release r or
          * RExC_precomp; presumably reg() reports errors through FAIL and
          * never returns NULL here - confirm. */
1663     if (reg(pRExC_state, 0, &flags) == NULL)
1664         return(NULL);
1665
1666     /* Dig out information for optimizations. */
1667     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1668     pm->op_pmflags = RExC_flags16;
1669     if (UTF)
1670         r->reganch |= ROPT_UTF8;
1671     r->regstclass = NULL;
1672     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1673         r->reganch |= ROPT_NAUGHTY;
1674     scan = r->program + 1;              /* First BRANCH. */
1675
1676     /* XXXX To minimize changes to RE engine we always allocate
1677        3-units-long substrs field. */
1678     Newz(1004, r->substrs, 1, struct reg_substr_data);
1679
1680     StructCopy(&zero_scan_data, &data, scan_data_t);
1681     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1682     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1683         I32 fake;
1684         STRLEN longest_float_length, longest_fixed_length;
1685         struct regnode_charclass_class ch_class;
1686         int stclass_flag;
1687         I32 last_close = 0;
1688
1689         first = scan;
1690         /* Skip introductions and multiplicators >= 1. */
             /* sawopen is set as a side effect of the loop condition each
              * time an OPEN node is stepped over.  A PLUS node is skipped
              * with NEXTOPER alone; every other node additionally skips its
              * regarglen[] argument slots. */
1691         while ((OP(first) == OPEN && (sawopen = 1)) ||
1692                /* An OR of *one* alternative - should not happen now. */
1693             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1694             (OP(first) == PLUS) ||
1695             (OP(first) == MINMOD) ||
1696                /* An {n,m} with n>0 */
1697             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1698                 if (OP(first) == PLUS)
1699                     sawplus = 1;
1700                 else
1701                     first += regarglen[(U8)OP(first)];
1702                 first = NEXTOPER(first);
1703         }
1704
1705         /* Starting-point info. */
             /* Classify the first real node: it either becomes
              * r->regstclass, or it is an anchor that is recorded in
              * r->reganch and stepped over before looking again. */
1706       again:
1707         if (PL_regkind[(U8)OP(first)] == EXACT) {
1708             if (OP(first) == EXACT);    /* Empty, get anchored substr later. */
1709             else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
1710                      && !UTF)
1711                 r->regstclass = first;
1712         }
1713         else if (strchr((char*)PL_simple,OP(first)))
1714             r->regstclass = first;
1715         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1716                  PL_regkind[(U8)OP(first)] == NBOUND)
1717             r->regstclass = first;
1718         else if (PL_regkind[(U8)OP(first)] == BOL) {
1719             r->reganch |= (OP(first) == MBOL
1720                            ? ROPT_ANCH_MBOL
1721                            : (OP(first) == SBOL
1722                               ? ROPT_ANCH_SBOL
1723                               : ROPT_ANCH_BOL));
1724             first = NEXTOPER(first);
1725             goto again;
1726         }
1727         else if (OP(first) == GPOS) {
1728             r->reganch |= ROPT_ANCH_GPOS;
1729             first = NEXTOPER(first);
1730             goto again;
1731         }
1732         else if ((OP(first) == STAR &&
1733             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1734             !(r->reganch & ROPT_ANCH) )
1735         {
1736             /* turn .* into ^.* with an implied $*=1 */
                 /* A plain REG_ANY operand yields the multi-line anchor; any
                  * other node of the REG_ANY kind yields the single-line
                  * anchor.  ROPT_IMPLICIT marks the anchor as added here. */
1737             int type = OP(NEXTOPER(first));
1738
1739             if (type == REG_ANY)
1740                 type = ROPT_ANCH_MBOL;
1741             else
1742                 type = ROPT_ANCH_SBOL;
1743
1744             r->reganch |= type | ROPT_IMPLICIT;
1745             first = NEXTOPER(first);
1746             goto again;
1747         }
1748         if (sawplus && (!sawopen || !RExC_sawback) 
1749             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1750             /* x+ must match at the 1st pos of run of x's */
1751             r->reganch |= ROPT_SKIP;
1752
1753         /* Scan is after the zeroth branch, first is atomic matcher. */
1754         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 
1755                               (IV)(first - scan + 1)));
1756         /*
1757         * If there's something expensive in the r.e., find the
1758         * longest literal string that must appear and make it the
1759         * regmust.  Resolve ties in favor of later strings, since
1760         * the regstart check works with the beginning of the r.e.
1761         * and avoiding duplication strengthens checking.  Not a
1762         * strong reason, but sufficient in the absence of others.
1763         * [Now we resolve ties in favor of the earlier string if
1764         * it happens that c_offset_min has been invalidated, since the
1765         * earlier string may buy us something the later one won't.]
1766         */
1767         minlen = 0;
1768
             /* Scratch SVs handed to study_chunk; longest_fixed and
              * longest_float are either stored in r or released below,
              * last_found is always released. */
1769         data.longest_fixed = newSVpvn("",0);
1770         data.longest_float = newSVpvn("",0);
1771         data.last_found = newSVpvn("",0);
1772         data.longest = &(data.longest_fixed);
             /* study_chunk is started from the first program node again. */
1773         first = scan;
             /* No start class was picked above, so ask study_chunk to
              * compute one into ch_class. */
1774         if (!r->regstclass) {
1775             cl_init(pRExC_state, &ch_class);
1776             data.start_class = &ch_class;
1777             stclass_flag = SCF_DO_STCLASS_AND;
1778         } else                          /* XXXX Check for BOUND? */
1779             stclass_flag = 0;
1780         data.last_closep = &last_close;
1781
1782         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1783                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1784         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1785              && data.last_start_min == 0 && data.last_end > 0 
1786              && !RExC_seen_zerolen
1787              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1788             r->reganch |= ROPT_CHECK_ALL;
1789         scan_commit(pRExC_state, &data);
1790         SvREFCNT_dec(data.last_found);
1791
             /* Floating substring: kept when it is non-empty or when it is
              * flagged as sitting before an end-of-line that applies. */
1792         longest_float_length = CHR_SVLEN(data.longest_float);
1793         if (longest_float_length
1794             || (data.flags & SF_FL_BEFORE_EOL
1795                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1796                     || (RExC_flags16 & PMf_MULTILINE)))) {
1797             int t;
1798
1799             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1800                 && data.offset_fixed == data.offset_float_min
1801                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1802                     goto remove_float;          /* As in (a)+. */
1803
1804             r->float_substr = data.longest_float;
1805             r->float_min_offset = data.offset_float_min;
1806             r->float_max_offset = data.offset_float_max;
1807             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1808                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1809                            || (RExC_flags16 & PMf_MULTILINE)));
1810             fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1811         }
1812         else {
               /* Also entered by the goto above, from inside the if-branch. */
1813           remove_float:
1814             r->float_substr = Nullsv;
1815             SvREFCNT_dec(data.longest_float);
1816             longest_float_length = 0;
1817         }
1818
             /* Anchored (fixed-offset) substring: same treatment. */
1819         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1820         if (longest_fixed_length
1821             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1822                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1823                     || (RExC_flags16 & PMf_MULTILINE)))) {
1824             int t;
1825
1826             r->anchored_substr = data.longest_fixed;
1827             r->anchored_offset = data.offset_fixed;
1828             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1829                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1830                      || (RExC_flags16 & PMf_MULTILINE)));
1831             fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1832         }
1833         else {
1834             r->anchored_substr = Nullsv;
1835             SvREFCNT_dec(data.longest_fixed);
1836             longest_fixed_length = 0;
1837         }
             /* A start class of REG_ANY or SANY is dropped again. */
1838         if (r->regstclass 
1839             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
1840             r->regstclass = NULL;
             /* Publish the class computed by study_chunk: it is copied from
              * the stack object into a slot reserved through add_data() and
              * r->regstclass is pointed at that copy. */
1841         if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1842             && !(data.start_class->flags & ANYOF_EOS)
1843             && !cl_is_anything(data.start_class)) {
1844             SV *sv;
1845             I32 n = add_data(pRExC_state, 1, "f");
1846
1847             New(1006, RExC_rx->data->data[n], 1, 
1848                 struct regnode_charclass_class);
1849             StructCopy(data.start_class,
1850                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
1851                        struct regnode_charclass_class);
1852             r->regstclass = (regnode*)RExC_rx->data->data[n];
1853             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
1854             PL_regdata = r->data; /* for regprop() */
1855             DEBUG_r((sv = sv_newmortal(),
1856                      regprop(sv, (regnode*)data.start_class),
1857                      PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
1858                                    SvPVX(sv))));
1859         }
1860
1861         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
1862         if (longest_fixed_length > longest_float_length) {
1863             r->check_substr = r->anchored_substr;
1864             r->check_offset_min = r->check_offset_max = r->anchored_offset;
1865             if (r->reganch & ROPT_ANCH_SINGLE)
1866                 r->reganch |= ROPT_NOSCAN;
1867         }
1868         else {
1869             r->check_substr = r->float_substr;
1870             r->check_offset_min = data.offset_float_min;
1871             r->check_offset_max = data.offset_float_max;
1872         }
1873         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1874            This should be changed ASAP!  */
1875         if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
1876             r->reganch |= RE_USE_INTUIT;
1877             if (SvTAIL(r->check_substr))
1878                 r->reganch |= RE_INTUIT_TAIL;
1879         }
1880     }
1881     else {
1882         /* Several toplevels. The best we can do is to set minlen. */
             /* A start class is still computed here, but no substrings are
              * collected (SCF_DO_SUBSTR is not passed). */
1883         I32 fake;
1884         struct regnode_charclass_class ch_class;
1885         I32 last_close = 0;
1886         
1887         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1888         scan = r->program + 1;
1889         cl_init(pRExC_state, &ch_class);
1890         data.start_class = &ch_class;
1891         data.last_closep = &last_close;
1892         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
1893         r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
1894         if (!(data.start_class->flags & ANYOF_EOS)
1895             && !cl_is_anything(data.start_class)) {
1896             SV *sv;
1897             I32 n = add_data(pRExC_state, 1, "f");
1898
1899             New(1006, RExC_rx->data->data[n], 1, 
1900                 struct regnode_charclass_class);
1901             StructCopy(data.start_class,
1902                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
1903                        struct regnode_charclass_class);
1904             r->regstclass = (regnode*)RExC_rx->data->data[n];
1905             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
                 /* NOTE(review): the single-branch path sets PL_regdata
                  * before its regprop() call; this path does not - confirm
                  * whether regprop() needs it here. */
1906             DEBUG_r((sv = sv_newmortal(),
1907                      regprop(sv, (regnode*)data.start_class),
1908                      PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
1909                                    SvPVX(sv))));
1910         }
1911     }
1912
         /* Common tail: record minlen, mirror the "seen" bits into reganch
          * and allocate zeroed capture offset arrays of RExC_npar entries. */
1913     r->minlen = minlen;
1914     if (RExC_seen & REG_SEEN_GPOS) 
1915         r->reganch |= ROPT_GPOS_SEEN;
1916     if (RExC_seen & REG_SEEN_LOOKBEHIND)
1917         r->reganch |= ROPT_LOOKBEHIND_SEEN;
1918     if (RExC_seen & REG_SEEN_EVAL)
1919         r->reganch |= ROPT_EVAL_SEEN;
1920     Newz(1002, r->startp, RExC_npar, I32);
1921     Newz(1002, r->endp, RExC_npar, I32);
1922     PL_regdata = r->data; /* for regprop() */
1923     DEBUG_r(regdump(r));
1924     return(r);
1925 }
1926
1927 /*
1928  - reg - regular expression, i.e. main body or parenthesized thing
1929  *
1930  * Caller must absorb opening parenthesis.
1931  *
1932  * Combining parenthesis handling with the base level of regular expression
1933  * is a trifle forced, but the need to tie the tails of the branches to what
1934  * follows makes it hard to avoid.
1935  */
1936 STATIC regnode *
1937 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
1938     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
1939 {
1940     register regnode *ret;              /* Will be the head of the group. */
1941     register regnode *br;
1942     register regnode *lastbr;
1943     register regnode *ender = 0;
1944     register I32 parno = 0;
1945     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
1946     char *oregcomp_parse = RExC_parse;
1947     char c;
1948
1949     *flagp = 0;                         /* Tentatively. */
1950
1951     /* Make an OPEN node, if parenthesized. */
1952     if (paren) {
1953         if (*RExC_parse == '?') {
1954             U16 posflags = 0, negflags = 0;
1955             U16 *flagsp = &posflags;
1956             int logical = 0;
1957             char *seqstart = RExC_parse;
1958
1959             RExC_parse++;
1960             paren = *RExC_parse++;
1961             ret = NULL;                 /* For look-ahead/behind. */
1962             switch (paren) {
1963             case '<':
1964                 RExC_seen |= REG_SEEN_LOOKBEHIND;
1965                 if (*RExC_parse == '!') 
1966                     paren = ',';
1967                 if (*RExC_parse != '=' && *RExC_parse != '!') 
1968                     goto unknown;
1969                 RExC_parse++;
1970             case '=':
1971             case '!':
1972                 RExC_seen_zerolen++;
1973             case ':':
1974             case '>':
1975                 break;
1976             case '$':
1977             case '@':
1978                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
1979                 break;
1980             case '#':
1981                 while (*RExC_parse && *RExC_parse != ')')
1982                     RExC_parse++;
1983                 if (*RExC_parse != ')')
1984                     FAIL("Sequence (?#... not terminated");
1985                 nextchar(pRExC_state);
1986                 *flagp = TRYAGAIN;
1987                 return NULL;
1988             case 'p':
1989                 if (SIZE_ONLY)
1990                     vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
1991                 /* FALL THROUGH*/
1992             case '?':
1993                 logical = 1;
1994                 paren = *RExC_parse++;
1995                 /* FALL THROUGH */
1996             case '{':
1997             {
1998                 I32 count = 1, n = 0;
1999                 char c;
2000                 char *s = RExC_parse;
2001                 SV *sv;
2002                 OP_4tree *sop, *rop;
2003
2004                 RExC_seen_zerolen++;
2005                 RExC_seen |= REG_SEEN_EVAL;
2006                 while (count && (c = *RExC_parse)) {
2007                     if (c == '\\' && RExC_parse[1])
2008                         RExC_parse++;
2009                     else if (c == '{') 
2010                         count++;
2011                     else if (c == '}') 
2012                         count--;
2013                     RExC_parse++;
2014                 }
2015                 if (*RExC_parse != ')')
2016                 {
2017                     RExC_parse = s;                 
2018                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2019                 }
2020                 if (!SIZE_ONLY) {
2021                     AV *av;
2022                     
2023                     if (RExC_parse - 1 - s) 
2024                         sv = newSVpvn(s, RExC_parse - 1 - s);
2025                     else
2026                         sv = newSVpvn("", 0);
2027
2028                     ENTER;
2029                     Perl_save_re_context(aTHX);
2030                     rop = sv_compile_2op(sv, &sop, "re", &av);
2031                     LEAVE;
2032
2033                     n = add_data(pRExC_state, 3, "nop");
2034                     RExC_rx->data->data[n] = (void*)rop;
2035                     RExC_rx->data->data[n+1] = (void*)sop;
2036                     RExC_rx->data->data[n+2] = (void*)av;
2037                     SvREFCNT_dec(sv);
2038                 }
2039                 else {                                          /* First pass */
2040                     if (PL_reginterp_cnt < ++RExC_seen_evals
2041                         && PL_curcop != &PL_compiling)
2042                         /* No compiled RE interpolated, has runtime
2043                            components ===> unsafe.  */
2044                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2045                     if (PL_tainted)
2046                         FAIL("Eval-group in insecure regular expression");
2047                 }
2048                 
2049                 nextchar(pRExC_state);
2050                 if (logical) {
2051                     ret = reg_node(pRExC_state, LOGICAL);
2052                     if (!SIZE_ONLY)
2053                         ret->flags = 2;
2054                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2055                     return ret;
2056                 }
2057                 return reganode(pRExC_state, EVAL, n);
2058             }
2059             case '(':
2060             {
2061                 if (RExC_parse[0] == '?') {
2062                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 
2063                         || RExC_parse[1] == '<' 
2064                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2065                         I32 flag;
2066                         
2067                         ret = reg_node(pRExC_state, LOGICAL);
2068                         if (!SIZE_ONLY)
2069                             ret->flags = 1;
2070                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2071                         goto insert_if;
2072                     } 
2073                 }
2074                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2075                     parno = atoi(RExC_parse++);
2076
2077                     while (isDIGIT(*RExC_parse))
2078                         RExC_parse++;
2079                     ret = reganode(pRExC_state, GROUPP, parno);
2080                     if ((c = *nextchar(pRExC_state)) != ')')
2081                         vFAIL("Switch condition not recognized");
2082                   insert_if:
2083                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2084                     br = regbranch(pRExC_state, &flags, 1);
2085                     if (br == NULL)
2086                         br = reganode(pRExC_state, LONGJMP, 0);
2087                     else
2088                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2089                     c = *nextchar(pRExC_state);
2090                     if (flags&HASWIDTH)
2091                         *flagp |= HASWIDTH;
2092                     if (c == '|') {
2093                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2094                         regbranch(pRExC_state, &flags, 1);
2095                         regtail(pRExC_state, ret, lastbr);
2096                         if (flags&HASWIDTH)
2097                             *flagp |= HASWIDTH;
2098                         c = *nextchar(pRExC_state);
2099                     }
2100                     else
2101                         lastbr = NULL;
2102                     if (c != ')')
2103                         vFAIL("Switch (?(condition)... contains too many branches");
2104                     ender = reg_node(pRExC_state, TAIL);
2105                     regtail(pRExC_state, br, ender);
2106                     if (lastbr) {
2107                         regtail(pRExC_state, lastbr, ender);
2108                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2109                     }
2110                     else
2111                         regtail(pRExC_state, ret, ender);
2112                     return ret;
2113                 }
2114                 else {
2115                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2116                 }
2117             }
2118             case 0:
2119                 RExC_parse--; /* for vFAIL to print correctly */
2120                 vFAIL("Sequence (? incomplete");
2121                 break;
2122             default:
2123                 --RExC_parse;
2124               parse_flags:
2125                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2126                     if (*RExC_parse != 'o')
2127                         pmflag(flagsp, *RExC_parse);
2128                     ++RExC_parse;
2129                 }
2130                 if (*RExC_parse == '-') {
2131                     flagsp = &negflags;
2132                     ++RExC_parse;
2133                     goto parse_flags;
2134                 }
2135                 RExC_flags16 |= posflags;
2136                 RExC_flags16 &= ~negflags;
2137                 if (*RExC_parse == ':') {
2138                     RExC_parse++;
2139                     paren = ':';
2140                     break;
2141                 }               
2142               unknown:
2143                 if (*RExC_parse != ')') {
2144                     RExC_parse++;
2145                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2146                 }
2147                 nextchar(pRExC_state);
2148                 *flagp = TRYAGAIN;
2149                 return NULL;
2150             }
2151         }
2152         else {
2153             parno = RExC_npar;
2154             RExC_npar++;
2155             ret = reganode(pRExC_state, OPEN, parno);
2156             open = 1;
2157         }
2158     }
2159     else
2160         ret = NULL;
2161
2162     /* Pick up the branches, linking them together. */
2163     br = regbranch(pRExC_state, &flags, 1);
2164     if (br == NULL)
2165         return(NULL);
2166     if (*RExC_parse == '|') {
2167         if (!SIZE_ONLY && RExC_extralen) {
2168             reginsert(pRExC_state, BRANCHJ, br);
2169         }
2170         else
2171             reginsert(pRExC_state, BRANCH, br);
2172         have_branch = 1;
2173         if (SIZE_ONLY)
2174             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2175     }
2176     else if (paren == ':') {
2177         *flagp |= flags&SIMPLE;
2178     }
2179     if (open) {                         /* Starts with OPEN. */
2180         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2181     }
2182     else if (paren != '?')              /* Not Conditional */
2183         ret = br;
2184     if (flags&HASWIDTH)
2185         *flagp |= HASWIDTH;
2186     *flagp |= flags&SPSTART;
2187     lastbr = br;
2188     while (*RExC_parse == '|') {
2189         if (!SIZE_ONLY && RExC_extralen) {
2190             ender = reganode(pRExC_state, LONGJMP,0);
2191             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2192         }
2193         if (SIZE_ONLY)
2194             RExC_extralen += 2;         /* Account for LONGJMP. */
2195         nextchar(pRExC_state);
2196         br = regbranch(pRExC_state, &flags, 0);
2197         if (br == NULL)
2198             return(NULL);
2199         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2200         lastbr = br;
2201         if (flags&HASWIDTH)
2202             *flagp |= HASWIDTH;
2203         *flagp |= flags&SPSTART;
2204     }
2205
2206     if (have_branch || paren != ':') {
2207         /* Make a closing node, and hook it on the end. */
2208         switch (paren) {
2209         case ':':
2210             ender = reg_node(pRExC_state, TAIL);
2211             break;
2212         case 1:
2213             ender = reganode(pRExC_state, CLOSE, parno);
2214             break;
2215         case '<':
2216         case ',':
2217         case '=':
2218         case '!':
2219             *flagp &= ~HASWIDTH;
2220             /* FALL THROUGH */
2221         case '>':
2222             ender = reg_node(pRExC_state, SUCCEED);
2223             break;
2224         case 0:
2225             ender = reg_node(pRExC_state, END);
2226             break;
2227         }
2228         regtail(pRExC_state, lastbr, ender);
2229
2230         if (have_branch) {
2231             /* Hook the tails of the branches to the closing node. */
2232             for (br = ret; br != NULL; br = regnext(br)) {
2233                 regoptail(pRExC_state, br, ender);
2234             }
2235         }
2236     }
2237
2238     {
2239         char *p;
2240         static char parens[] = "=!<,>";
2241
2242         if (paren && (p = strchr(parens, paren))) {
2243             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2244             int flag = (p - parens) > 1;
2245
2246             if (paren == '>')
2247                 node = SUSPEND, flag = 0;
2248             reginsert(pRExC_state, node,ret);
2249             ret->flags = flag;
2250             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2251         }
2252     }
2253
2254     /* Check for proper termination. */
2255     if (paren) {
2256         RExC_flags16 = oregflags;
2257         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2258             RExC_parse = oregcomp_parse;
2259             vFAIL("Unmatched (");
2260         }
2261     }
2262     else if (!paren && RExC_parse < RExC_end) {
2263         if (*RExC_parse == ')') {
2264             RExC_parse++;
2265             vFAIL("Unmatched )");
2266         }
2267         else
2268             FAIL("Junk on end of regexp");      /* "Can't happen". */
2269         /* NOTREACHED */
2270     }
2271
2272     return(ret);
2273 }
2274
2275 /*
2276  - regbranch - one alternative of an | operator
2277  *
2278  * Implements the concatenation operator.
2279  */
2280 STATIC regnode *
2281 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2282 {
2283     register regnode *ret;
2284     register regnode *chain = NULL;
2285     register regnode *latest;
2286     I32 flags = 0, c = 0;
2287
2288     if (first) 
2289         ret = NULL;
2290     else {
2291         if (!SIZE_ONLY && RExC_extralen) 
2292             ret = reganode(pRExC_state, BRANCHJ,0);
2293         else
2294             ret = reg_node(pRExC_state, BRANCH);
2295     }
2296         
2297     if (!first && SIZE_ONLY) 
2298         RExC_extralen += 1;                     /* BRANCHJ */
2299     
2300     *flagp = WORST;                     /* Tentatively. */
2301
2302     RExC_parse--;
2303     nextchar(pRExC_state);
2304     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2305         flags &= ~TRYAGAIN;
2306         latest = regpiece(pRExC_state, &flags);
2307         if (latest == NULL) {
2308             if (flags & TRYAGAIN)
2309                 continue;
2310             return(NULL);
2311         }
2312         else if (ret == NULL)
2313             ret = latest;
2314         *flagp |= flags&HASWIDTH;
2315         if (chain == NULL)      /* First piece. */
2316             *flagp |= flags&SPSTART;
2317         else {
2318             RExC_naughty++;
2319             regtail(pRExC_state, chain, latest);
2320         }
2321         chain = latest;
2322         c++;
2323     }
2324     if (chain == NULL) {        /* Loop ran zero times. */
2325         chain = reg_node(pRExC_state, NOTHING);
2326         if (ret == NULL)
2327             ret = chain;
2328     }
2329     if (c == 1) {
2330         *flagp |= flags&SIMPLE;
2331     }
2332
2333     return(ret);
2334 }
2335
2336 /*
2337  - regpiece - something followed by possible [*+?]
2338  *
2339  * Note that the branching code sequences used for ? and the general cases
2340  * of * and + are somewhat optimized:  they use the same NOTHING node as
2341  * both the endmarker for their branch list and the body of the last branch.
2342  * It might seem that this node could be dispensed with entirely, but the
2343  * endmarker role is not redundant.
2344  */
2345 STATIC regnode *
2346 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2347 {
2348     register regnode *ret;
2349     register char op;
2350     register char *next;
2351     I32 flags;
2352     char *origparse = RExC_parse;
2353     char *maxpos;
2354     I32 min;
2355     I32 max = REG_INFTY;
2356
2357     ret = regatom(pRExC_state, &flags);
2358     if (ret == NULL) {
2359         if (flags & TRYAGAIN)
2360             *flagp |= TRYAGAIN;
2361         return(NULL);
2362     }
2363
2364     op = *RExC_parse;
2365
2366     if (op == '{' && regcurly(RExC_parse)) {
2367         next = RExC_parse + 1;
2368         maxpos = Nullch;
2369         while (isDIGIT(*next) || *next == ',') {
2370             if (*next == ',') {
2371                 if (maxpos)
2372                     break;
2373                 else
2374                     maxpos = next;
2375             }
2376             next++;
2377         }
2378         if (*next == '}') {             /* got one */
2379             if (!maxpos)
2380                 maxpos = next;
2381             RExC_parse++;
2382             min = atoi(RExC_parse);
2383             if (*maxpos == ',')
2384                 maxpos++;
2385             else
2386                 maxpos = RExC_parse;
2387             max = atoi(maxpos);
2388             if (!max && *maxpos != '0')
2389                 max = REG_INFTY;                /* meaning "infinity" */
2390             else if (max >= REG_INFTY)
2391                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2392             RExC_parse = next;
2393             nextchar(pRExC_state);
2394
2395         do_curly:
2396             if ((flags&SIMPLE)) {
2397                 RExC_naughty += 2 + RExC_naughty / 2;
2398                 reginsert(pRExC_state, CURLY, ret);
2399             }
2400             else {
2401                 regnode *w = reg_node(pRExC_state, WHILEM);
2402
2403                 w->flags = 0;
2404                 regtail(pRExC_state, ret, w);
2405                 if (!SIZE_ONLY && RExC_extralen) {
2406                     reginsert(pRExC_state, LONGJMP,ret);
2407                     reginsert(pRExC_state, NOTHING,ret);
2408                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2409                 }
2410                 reginsert(pRExC_state, CURLYX,ret);
2411                 if (!SIZE_ONLY && RExC_extralen)
2412                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2413                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2414                 if (SIZE_ONLY)
2415                     RExC_whilem_seen++, RExC_extralen += 3;
2416                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2417             }
2418             ret->flags = 0;
2419
2420             if (min > 0)
2421                 *flagp = WORST;
2422             if (max > 0)
2423                 *flagp |= HASWIDTH;
2424             if (max && max < min)
2425                 vFAIL("Can't do {n,m} with n > m");
2426             if (!SIZE_ONLY) {
2427                 ARG1_SET(ret, min);
2428                 ARG2_SET(ret, max);
2429             }
2430
2431             goto nest_check;
2432         }
2433     }
2434
2435     if (!ISMULT1(op)) {
2436         *flagp = flags;
2437         return(ret);
2438     }
2439
2440 #if 0                           /* Now runtime fix should be reliable. */
2441
2442     /* if this is reinstated, don't forget to put this back into perldiag:
2443
2444             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2445
2446            (F) The part of the regexp subject to either the * or + quantifier
2447            could match an empty string. The {#} shows in the regular
2448            expression about where the problem was discovered.
2449
2450     */
2451
2452     if (!(flags&HASWIDTH) && op != '?')
2453       vFAIL("Regexp *+ operand could be empty");
2454 #endif 
2455
2456     nextchar(pRExC_state);
2457
2458     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2459
2460     if (op == '*' && (flags&SIMPLE)) {
2461         reginsert(pRExC_state, STAR, ret);
2462         ret->flags = 0;
2463         RExC_naughty += 4;
2464     }
2465     else if (op == '*') {
2466         min = 0;
2467         goto do_curly;
2468     }
2469     else if (op == '+' && (flags&SIMPLE)) {
2470         reginsert(pRExC_state, PLUS, ret);
2471         ret->flags = 0;
2472         RExC_naughty += 3;
2473     }
2474     else if (op == '+') {
2475         min = 1;
2476         goto do_curly;
2477     }
2478     else if (op == '?') {
2479         min = 0; max = 1;
2480         goto do_curly;
2481     }
2482   nest_check:
2483     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2484         vWARN3(RExC_parse,
2485                "%.*s matches null string many times",
2486                RExC_parse - origparse,
2487                origparse);
2488     }
2489
2490     if (*RExC_parse == '?') {
2491         nextchar(pRExC_state);
2492         reginsert(pRExC_state, MINMOD, ret);
2493         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2494     }
2495     if (ISMULT2(RExC_parse)) {
2496         RExC_parse++;
2497         vFAIL("Nested quantifiers");
2498     }
2499
2500     return(ret);
2501 }
2502
2503 /*
2504  - regatom - the lowest level
2505  *
2506  * Optimization:  gobbles an entire sequence of ordinary characters so that
2507  * it can turn them into a single node, which is smaller to store and
2508  * faster to run.  Backslashed characters are exceptions, each becoming a
2509  * separate node; the code is simpler that way and it's not worth fixing.
2510  *
2511  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2512 STATIC regnode *
2513 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2514 {
2515     register regnode *ret = 0;
2516     I32 flags;
2517
2518     *flagp = WORST;             /* Tentatively. */
2519
2520 tryagain:
2521     switch (*RExC_parse) {
2522     case '^':
2523         RExC_seen_zerolen++;
2524         nextchar(pRExC_state);
2525         if (RExC_flags16 & PMf_MULTILINE)
2526             ret = reg_node(pRExC_state, MBOL);
2527         else if (RExC_flags16 & PMf_SINGLELINE)
2528             ret = reg_node(pRExC_state, SBOL);
2529         else
2530             ret = reg_node(pRExC_state, BOL);
2531         break;
2532     case '$':
2533         nextchar(pRExC_state);
2534         if (*RExC_parse) 
2535             RExC_seen_zerolen++;
2536         if (RExC_flags16 & PMf_MULTILINE)
2537             ret = reg_node(pRExC_state, MEOL);
2538         else if (RExC_flags16 & PMf_SINGLELINE)
2539             ret = reg_node(pRExC_state, SEOL);
2540         else
2541             ret = reg_node(pRExC_state, EOL);
2542         break;
2543     case '.':
2544         nextchar(pRExC_state);
2545         if (RExC_flags16 & PMf_SINGLELINE)
2546             ret = reg_node(pRExC_state, SANY);
2547         else
2548             ret = reg_node(pRExC_state, REG_ANY);
2549         *flagp |= HASWIDTH|SIMPLE;
2550         RExC_naughty++;
2551         break;
2552     case '[':
2553     {
2554         char *oregcomp_parse = ++RExC_parse;
2555         ret = regclass(pRExC_state);
2556         if (*RExC_parse != ']') {
2557             RExC_parse = oregcomp_parse;
2558             vFAIL("Unmatched [");
2559         }
2560         nextchar(pRExC_state);
2561         *flagp |= HASWIDTH|SIMPLE;
2562         break;
2563     }
2564     case '(':
2565         nextchar(pRExC_state);
2566         ret = reg(pRExC_state, 1, &flags);
2567         if (ret == NULL) {
2568                 if (flags & TRYAGAIN) {
2569                     if (RExC_parse == RExC_end) {
2570                          /* Make parent create an empty node if needed. */
2571                         *flagp |= TRYAGAIN;
2572                         return(NULL);
2573                     }
2574                     goto tryagain;
2575                 }
2576                 return(NULL);
2577         }
2578         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2579         break;
2580     case '|':
2581     case ')':
2582         if (flags & TRYAGAIN) {
2583             *flagp |= TRYAGAIN;
2584             return NULL;
2585         }
2586         vFAIL("Internal urp");
2587                                 /* Supposed to be caught earlier. */
2588         break;
2589     case '{':
2590         if (!regcurly(RExC_parse)) {
2591             RExC_parse++;
2592             goto defchar;
2593         }
2594         /* FALL THROUGH */
2595     case '?':
2596     case '+':
2597     case '*':
2598         RExC_parse++;
2599         vFAIL("Quantifier follows nothing");
2600         break;
2601     case '\\':
2602         switch (*++RExC_parse) {
2603         case 'A':
2604             RExC_seen_zerolen++;
2605             ret = reg_node(pRExC_state, SBOL);
2606             *flagp |= SIMPLE;
2607             nextchar(pRExC_state);
2608             break;
2609         case 'G':
2610             ret = reg_node(pRExC_state, GPOS);
2611             RExC_seen |= REG_SEEN_GPOS;
2612             *flagp |= SIMPLE;
2613             nextchar(pRExC_state);
2614             break;
2615         case 'Z':
2616             ret = reg_node(pRExC_state, SEOL);
2617             *flagp |= SIMPLE;
2618             nextchar(pRExC_state);
2619             break;
2620         case 'z':
2621             ret = reg_node(pRExC_state, EOS);
2622             *flagp |= SIMPLE;
2623             RExC_seen_zerolen++;                /* Do not optimize RE away */
2624             nextchar(pRExC_state);
2625             break;
2626         case 'C':
2627             ret = reg_node(pRExC_state, SANY);
2628             *flagp |= HASWIDTH|SIMPLE;
2629             nextchar(pRExC_state);
2630             break;
2631         case 'X':
2632             ret = reg_node(pRExC_state, CLUMP);
2633             *flagp |= HASWIDTH;
2634             nextchar(pRExC_state);
2635             if (UTF && !PL_utf8_mark)
2636                 is_utf8_mark((U8*)"~");         /* preload table */
2637             break;
2638         case 'w':
2639             ret = reg_node(pRExC_state, LOC ? ALNUML     : ALNUM);
2640             *flagp |= HASWIDTH|SIMPLE;
2641             nextchar(pRExC_state);
2642             if (UTF && !PL_utf8_alnum)
2643                 is_utf8_alnum((U8*)"a");        /* preload table */
2644             break;
2645         case 'W':
2646             ret = reg_node(pRExC_state, LOC ? NALNUML     : NALNUM);
2647             *flagp |= HASWIDTH|SIMPLE;
2648             nextchar(pRExC_state);
2649             if (UTF && !PL_utf8_alnum)
2650                 is_utf8_alnum((U8*)"a");        /* preload table */
2651             break;
2652         case 'b':
2653             RExC_seen_zerolen++;
2654             RExC_seen |= REG_SEEN_LOOKBEHIND;
2655             ret = reg_node(pRExC_state, LOC ? BOUNDL     : BOUND);
2656             *flagp |= SIMPLE;
2657             nextchar(pRExC_state);
2658             if (UTF && !PL_utf8_alnum)
2659                 is_utf8_alnum((U8*)"a");        /* preload table */
2660             break;
2661         case 'B':
2662             RExC_seen_zerolen++;
2663             RExC_seen |= REG_SEEN_LOOKBEHIND;
2664             ret = reg_node(pRExC_state, LOC ? NBOUNDL     : NBOUND);
2665             *flagp |= SIMPLE;
2666             nextchar(pRExC_state);
2667             if (UTF && !PL_utf8_alnum)
2668                 is_utf8_alnum((U8*)"a");        /* preload table */
2669             break;
2670         case 's':
2671             ret = reg_node(pRExC_state, LOC ? SPACEL     : SPACE);
2672             *flagp |= HASWIDTH|SIMPLE;
2673             nextchar(pRExC_state);
2674             if (UTF && !PL_utf8_space)
2675                 is_utf8_space((U8*)" ");        /* preload table */
2676             break;
2677         case 'S':
2678             ret = reg_node(pRExC_state, LOC ? NSPACEL     : NSPACE);
2679             *flagp |= HASWIDTH|SIMPLE;
2680             nextchar(pRExC_state);
2681             if (UTF && !PL_utf8_space)
2682                 is_utf8_space((U8*)" ");        /* preload table */
2683             break;
2684         case 'd':
2685             ret = reg_node(pRExC_state, DIGIT);
2686             *flagp |= HASWIDTH|SIMPLE;
2687             nextchar(pRExC_state);
2688             if (UTF && !PL_utf8_digit)
2689                 is_utf8_digit((U8*)"1");        /* preload table */
2690             break;
2691         case 'D':
2692             ret = reg_node(pRExC_state, NDIGIT);
2693             *flagp |= HASWIDTH|SIMPLE;
2694             nextchar(pRExC_state);
2695             if (UTF && !PL_utf8_digit)
2696                 is_utf8_digit((U8*)"1");        /* preload table */
2697             break;
2698         case 'p':
2699         case 'P':
2700             {   /* a lovely hack--pretend we saw [\pX] instead */
2701                 char* oldregxend = RExC_end;
2702
2703                 if (RExC_parse[1] == '{') {
2704                     RExC_end = strchr(RExC_parse, '}');
2705                     if (!RExC_end) {
2706                         RExC_parse += 2;
2707                         RExC_end = oldregxend;
2708                         vFAIL("Missing right brace on \\p{}");
2709                     }
2710                     RExC_end++;
2711                 }
2712                 else
2713                     RExC_end = RExC_parse + 2;
2714                 RExC_parse--;
2715
2716                 ret = regclass(pRExC_state);
2717
2718                 RExC_end = oldregxend;
2719                 RExC_parse--;
2720                 nextchar(pRExC_state);
2721                 *flagp |= HASWIDTH|SIMPLE;
2722             }
2723             break;
2724         case 'n':
2725         case 'r':
2726         case 't':
2727         case 'f':
2728         case 'e':
2729         case 'a':
2730         case 'x':
2731         case 'c':
2732         case '0':
2733             goto defchar;
2734         case '1': case '2': case '3': case '4':
2735         case '5': case '6': case '7': case '8': case '9':
2736             {
2737                 I32 num = atoi(RExC_parse);
2738
2739                 if (num > 9 && num >= RExC_npar)
2740                     goto defchar;
2741                 else {
2742                     while (isDIGIT(*RExC_parse))
2743                         RExC_parse++;
2744
2745                     if (!SIZE_ONLY && num > RExC_rx->nparens)
2746                         vFAIL("Reference to nonexistent group");
2747                     RExC_sawback = 1;
2748                     ret = reganode(pRExC_state, FOLD
2749                                    ? (LOC ? REFFL : REFF)
2750                                    : REF, num);
2751                     *flagp |= HASWIDTH;
2752                     RExC_parse--;
2753                     nextchar(pRExC_state);
2754                 }
2755             }
2756             break;
2757         case '\0':
2758             if (RExC_parse >= RExC_end)
2759                 FAIL("Trailing \\");
2760             /* FALL THROUGH */
2761         default:
2762             /* Do not generate `unrecognized' warnings here, we fall
2763                back into the quick-grab loop below */
2764             goto defchar;
2765         }
2766         break;
2767
2768     case '#':
2769         if (RExC_flags16 & PMf_EXTENDED) {
2770             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2771             if (RExC_parse < RExC_end)
2772                 goto tryagain;
2773         }
2774         /* FALL THROUGH */
2775
2776     default: {
2777             register STRLEN len;
2778             register UV ender;
2779             register char *p;
2780             char *oldp, *s;
2781             STRLEN numlen;
2782
2783             RExC_parse++;
2784
2785         defchar:
2786             ret = reg_node(pRExC_state, FOLD
2787                           ? (LOC ? EXACTFL : EXACTF)
2788                           : EXACT);
2789             s = STRING(ret);
2790             for (len = 0, p = RExC_parse - 1;
2791               len < 127 && p < RExC_end;
2792               len++)
2793             {
2794                 oldp = p;
2795
2796                 if (RExC_flags16 & PMf_EXTENDED)
2797                     p = regwhite(p, RExC_end);
2798                 switch (*p) {
2799                 case '^':
2800                 case '$':
2801                 case '.':
2802                 case '[':
2803                 case '(':
2804                 case ')':
2805                 case '|':
2806                     goto loopdone;
2807                 case '\\':
2808                     switch (*++p) {
2809                     case 'A':
2810                     case 'G':
2811                     case 'Z':
2812                     case 'z':
2813                     case 'w':
2814                     case 'W':
2815                     case 'b':
2816                     case 'B':
2817                     case 's':
2818                     case 'S':
2819                     case 'd':
2820                     case 'D':
2821                     case 'p':
2822                     case 'P':
2823                         --p;
2824                         goto loopdone;
2825                     case 'n':
2826                         ender = '\n';
2827                         p++;
2828                         break;
2829                     case 'r':
2830                         ender = '\r';
2831                         p++;
2832                         break;
2833                     case 't':
2834                         ender = '\t';
2835                         p++;
2836                         break;
2837                     case 'f':
2838                         ender = '\f';
2839                         p++;
2840                         break;
2841                     case 'e':
2842 #ifdef ASCIIish
2843                           ender = '\033';
2844 #else
2845                           ender = '\047';
2846 #endif
2847                         p++;
2848                         break;
2849                     case 'a':
2850 #ifdef ASCIIish
2851                           ender = '\007';
2852 #else
2853                           ender = '\057';
2854 #endif
2855                         p++;
2856                         break;
2857                     case 'x':
2858                         if (*++p == '{') {
2859                             char* e = strchr(p, '}');
2860          
2861                             if (!e) {
2862                                 RExC_parse = p + 1;
2863                                 vFAIL("Missing right brace on \\x{}");
2864                             }
2865                             else {
2866                                 numlen = 1;     /* allow underscores */
2867                                 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
2868                                 /* numlen is generous */
2869                                 if (numlen + len >= 127) {
2870                                     p--;
2871                                     goto loopdone;
2872                                 }
2873                                 p = e + 1;
2874                             }
2875                         }
2876                         else {
2877                             numlen = 0;         /* disallow underscores */
2878                             ender = (UV)scan_hex(p, 2, &numlen);
2879                             p += numlen;
2880                         }
2881                         break;
2882                     case 'c':
2883                         p++;
2884                         ender = UCHARAT(p++);
2885                         ender = toCTRL(ender);
2886                         break;
2887                     case '0': case '1': case '2': case '3':case '4':
2888                     case '5': case '6': case '7': case '8':case '9':
2889                         if (*p == '0' ||
2890                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
2891                             numlen = 0;         /* disallow underscores */
2892                             ender = (UV)scan_oct(p, 3, &numlen);
2893                             p += numlen;
2894                         }
2895                         else {
2896                             --p;
2897                             goto loopdone;
2898                         }
2899                         break;
2900                     case '\0':
2901                         if (p >= RExC_end)
2902                             FAIL("Trailing \\");
2903                         /* FALL THROUGH */
2904                     default:
2905                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
2906                             vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
2907                         goto normal_default;
2908                     }
2909                     break;
2910                 default:
2911                   normal_default:
2912                     if ((*p & 0xc0) == 0xc0 && UTF) {
2913                         ender = utf8_to_uv((U8*)p, RExC_end - p,
2914                                                &numlen, 0);
2915                         p += numlen;
2916                     }
2917                     else
2918                         ender = *p++;
2919                     break;
2920                 }
2921                 if (RExC_flags16 & PMf_EXTENDED)
2922                     p = regwhite(p, RExC_end);
2923                 if (UTF && FOLD) {
2924                     if (LOC)
2925                         ender = toLOWER_LC_uni(ender);
2926                     else
2927                         ender = toLOWER_uni(ender);
2928                 }
2929                 if (ISMULT2(p)) { /* Back off on ?+*. */
2930                     if (len)
2931                         p = oldp;
2932                     else if (ender >= 0x80 && UTF) {
2933                         reguni(pRExC_state, ender, s, &numlen);
2934                         s += numlen;
2935                         len += numlen;
2936                     }
2937                     else {
2938                         len++;
2939                         REGC(ender, s++);
2940                     }
2941                     break;
2942                 }
2943                 if (ender >= 0x80 && UTF) {
2944                     reguni(pRExC_state, ender, s, &numlen);
2945                     s += numlen;
2946                     len += numlen - 1;
2947                 }
2948                 else
2949                     REGC(ender, s++);
2950             }
2951         loopdone:
2952             RExC_parse = p - 1;
2953             nextchar(pRExC_state);
2954             {
2955                 /* len is STRLEN which is unsigned, need to copy to signed */
2956                 IV iv = len;
2957                 if (iv < 0)
2958                     vFAIL("Internal disaster");
2959             }
2960             if (len > 0)
2961                 *flagp |= HASWIDTH;
2962             if (len == 1)
2963                 *flagp |= SIMPLE;
2964             if (!SIZE_ONLY)
2965                 STR_LEN(ret) = len;
2966             if (SIZE_ONLY)
2967                 RExC_size += STR_SZ(len);
2968             else
2969                 RExC_emit += STR_SZ(len);
2970         }
2971         break;
2972     }
2973
2974     return(ret);
2975 }
2976
2977 STATIC char *
2978 S_regwhite(pTHX_ char *p, char *e)
2979 {
2980     while (p < e) {
2981         if (isSPACE(*p))
2982             ++p;
2983         else if (*p == '#') {
2984             do {
2985                 p++;
2986             } while (p < e && *p != '\n');
2987         }
2988         else
2989             break;
2990     }
2991     return p;
2992 }
2993
2994 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
2995    Character classes ([:foo:]) can also be negated ([:^foo:]).
2996    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
2997    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
2998    but trigger warnings because they are currently unimplemented. */
2999 STATIC I32
3000 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3001 {
3002     char *posixcc = 0;
3003     I32 namedclass = OOB_NAMEDCLASS;
3004
3005     if (value == '[' && RExC_parse + 1 < RExC_end &&
3006         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3007         (*RExC_parse == ':' ||
3008          *RExC_parse == '=' ||
3009          *RExC_parse == '.')) {
3010         char  c = *RExC_parse;
3011         char* s = RExC_parse++;
3012             
3013         while (RExC_parse < RExC_end && *RExC_parse != c)
3014             RExC_parse++;
3015         if (RExC_parse == RExC_end)
3016             /* Grandfather lone [:, [=, [. */
3017             RExC_parse = s;
3018         else {
3019             char* t = RExC_parse++; /* skip over the c */
3020
3021             if (*RExC_parse == ']') {
3022                 RExC_parse++; /* skip over the ending ] */
3023                 posixcc = s + 1;
3024                 if (*s == ':') {
3025                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3026                     I32 skip = 5; /* the most common skip */
3027
3028                     switch (*posixcc) {
3029                     case 'a':
3030                         if (strnEQ(posixcc, "alnum", 5))
3031                             namedclass =
3032                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3033                         else if (strnEQ(posixcc, "alpha", 5))
3034                             namedclass =
3035                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3036                         else if (strnEQ(posixcc, "ascii", 5))
3037                             namedclass =
3038                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3039                         break;
3040                     case 'b':
3041                         if (strnEQ(posixcc, "blank", 5))
3042                             namedclass =
3043                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3044                         break;
3045                     case 'c':
3046                         if (strnEQ(posixcc, "cntrl", 5))
3047                             namedclass =
3048                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3049                         break;
3050                     case 'd':
3051                         if (strnEQ(posixcc, "digit", 5))
3052                             namedclass =
3053                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3054                         break;
3055                     case 'g':
3056                         if (strnEQ(posixcc, "graph", 5))
3057                             namedclass =
3058                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3059                         break;
3060                     case 'l':
3061                         if (strnEQ(posixcc, "lower", 5))
3062                             namedclass =
3063                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3064                         break;
3065                     case 'p':
3066                         if (strnEQ(posixcc, "print", 5))
3067                             namedclass =
3068                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3069                         else if (strnEQ(posixcc, "punct", 5))
3070                             namedclass =
3071                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3072                         break;
3073                     case 's':
3074                         if (strnEQ(posixcc, "space", 5))
3075                             namedclass =
3076                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3077                         break;
3078                     case 'u':
3079                         if (strnEQ(posixcc, "upper", 5))
3080                             namedclass =
3081                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3082                         break;
3083                     case 'w': /* this is not POSIX, this is the Perl \w */
3084                         if (strnEQ(posixcc, "word", 4)) {
3085                             namedclass =
3086                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3087                             skip = 4;
3088                         }
3089                         break;
3090                     case 'x':
3091                         if (strnEQ(posixcc, "xdigit", 6)) {
3092                             namedclass =
3093                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3094                             skip = 6;
3095                         }
3096                         break;
3097                     }
3098                     if (namedclass == OOB_NAMEDCLASS ||
3099                         posixcc[skip] != ':' ||
3100                         posixcc[skip+1] != ']')
3101                     {
3102                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3103                                       t - s - 1, s + 1);
3104                     }
3105                 } else if (!SIZE_ONLY) {
3106                     /* [[=foo=]] and [[.foo.]] are still future. */
3107
3108                     /* adjust RExC_parse so the warning shows after
3109                        the class closes */
3110                     while (*RExC_parse && *RExC_parse != ']')
3111                         RExC_parse++;
3112                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3113                 }
3114             } else {
3115                 /* Maternal grandfather:
3116                  * "[:" ending in ":" but not in ":]" */
3117                 RExC_parse = s;
3118             }
3119         }
3120     }
3121
3122     return namedclass;
3123 }
3124
3125 STATIC void
3126 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3127 {
3128     if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
3129         (*RExC_parse == ':' ||
3130          *RExC_parse == '=' ||
3131          *RExC_parse == '.')) {
3132         char *s = RExC_parse;
3133         char  c = *s++;
3134
3135         while(*s && isALNUM(*s))
3136             s++;
3137         if (*s && c == *s && s[1] == ']') {
3138             vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3139
3140             /* [[=foo=]] and [[.foo.]] are still future. */
3141             if (c == '=' || c == '.')
3142             {
3143                 /* adjust RExC_parse so the error shows after
3144                    the class closes */
3145                 while (*RExC_parse && *RExC_parse++ != ']')
3146                     ;
3147                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3148             }
3149         }
3150     }
3151 }
3152
3153 STATIC regnode *
3154 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3155 {
3156     register UV value;
3157     register IV lastvalue = OOB_UNICODE;
3158     register IV range = 0;
3159     register regnode *ret;
3160     STRLEN numlen;
3161     IV namedclass;
3162     char *rangebegin;
3163     bool need_class = 0;
3164     SV *listsv;
3165     register char *e;
3166     UV n;
3167
3168     ret = reganode(pRExC_state, ANYOF, 0);
3169
3170     if (!SIZE_ONLY)
3171         ANYOF_FLAGS(ret) = 0;
3172
3173     if (*RExC_parse == '^') {   /* Complement of range. */
3174         RExC_naughty++;
3175         RExC_parse++;
3176         if (!SIZE_ONLY)
3177             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3178     }
3179
3180     if (SIZE_ONLY)
3181         RExC_size += ANYOF_SKIP;
3182     else {
3183         RExC_emit += ANYOF_SKIP;
3184         if (FOLD)
3185             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3186         if (LOC)
3187             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3188         ANYOF_BITMAP_ZERO(ret);
3189         listsv = newSVpvn("# comment\n", 10);
3190     }
3191
3192     if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
3193         checkposixcc(pRExC_state);
3194
3195     if (*RExC_parse == ']' || *RExC_parse == '-')
3196         goto charclassloop;             /* allow 1st char to be ] or - */
3197
3198     while (RExC_parse < RExC_end && *RExC_parse != ']') {
3199
3200     charclassloop:
3201
3202         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3203
3204         if (!range)
3205             rangebegin = RExC_parse;
3206         if (UTF) {
3207             value = utf8_to_uv((U8*)RExC_parse,
3208                                RExC_end - RExC_parse,
3209                                &numlen, 0);
3210             RExC_parse += numlen;
3211         }
3212         else
3213             value = UCHARAT(RExC_parse++);
3214         if (value == '[')
3215             namedclass = regpposixcc(pRExC_state, value);
3216         else if (value == '\\') {
3217             if (UTF) {
3218                 value = utf8_to_uv((U8*)RExC_parse,
3219                                    RExC_end - RExC_parse,
3220                                    &numlen, 0);
3221                 RExC_parse += numlen;
3222             }
3223             else
3224                 value = UCHARAT(RExC_parse++);
3225             /* Some compilers cannot handle switching on 64-bit integer
3226              * values, therefore value cannot be an UV.  Yes, this will
3227              * be a problem later if we want switch on Unicode.
3228              * A similar issue a little bit later when switching on
3229              * namedclass. --jhi */
3230             switch ((I32)value) {
3231             case 'w':   namedclass = ANYOF_ALNUM;       break;
3232             case 'W':   namedclass = ANYOF_NALNUM;      break;
3233             case 's':   namedclass = ANYOF_SPACE;       break;
3234             case 'S':   namedclass = ANYOF_NSPACE;      break;
3235             case 'd':   namedclass = ANYOF_DIGIT;       break;
3236             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3237             case 'p':
3238             case 'P':
3239                 if (*RExC_parse == '{') {
3240                     e = strchr(RExC_parse++, '}');
3241                     if (!e)
3242                         vFAIL("Missing right brace on \\p{}");
3243                     n = e - RExC_parse;
3244                 }
3245                 else {
3246                     e = RExC_parse;
3247                     n = 1;
3248                 }
3249                 if (!SIZE_ONLY) {
3250                     if (value == 'p')
3251                         Perl_sv_catpvf(aTHX_ listsv,
3252                                        "+utf8::%.*s\n", (int)n, RExC_parse);
3253                     else
3254                         Perl_sv_catpvf(aTHX_ listsv,
3255                                        "!utf8::%.*s\n", (int)n, RExC_parse);
3256                 }
3257                 RExC_parse = e + 1;
3258                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3259                 continue;
3260             case 'n':   value = '\n';                   break;
3261             case 'r':   value = '\r';                   break;
3262             case 't':   value = '\t';                   break;
3263             case 'f':   value = '\f';                   break;
3264             case 'b':   value = '\b';                   break;
3265 #ifdef ASCIIish
3266             case 'e':   value = '\033';                 break;
3267             case 'a':   value = '\007';                 break;
3268 #else
3269             case 'e':   value = '\047';                 break;
3270             case 'a':   value = '\057';                 break;
3271 #endif
3272             case 'x':
3273                 if (*RExC_parse == '{') {
3274                     e = strchr(RExC_parse++, '}');
3275                     if (!e) 
3276                         vFAIL("Missing right brace on \\x{}");
3277                     numlen = 1;         /* allow underscores */
3278                     value = (UV)scan_hex(RExC_parse,
3279                                          e - RExC_parse,
3280                                          &numlen);
3281                     RExC_parse = e + 1;
3282                 }
3283                 else {
3284                     numlen = 0;         /* disallow underscores */
3285                     value = (UV)scan_hex(RExC_parse, 2, &numlen);
3286                     RExC_parse += numlen;
3287                 }
3288                 break;
3289             case 'c':
3290                 value = UCHARAT(RExC_parse++);
3291                 value = toCTRL(value);
3292                 break;
3293             case '0': case '1': case '2': case '3': case '4':
3294             case '5': case '6': case '7': case '8': case '9':
3295                 numlen = 0;             /* disallow underscores */
3296                 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3297                 RExC_parse += numlen;
3298                 break;
3299             default:
3300                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3301                     vWARN2(RExC_parse,
3302                            "Unrecognized escape \\%c in character class passed through",
3303                            (int)value);
3304                 break;
3305             }
3306         } /* end of \blah */
3307
3308         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3309
3310             if (!SIZE_ONLY && !need_class)
3311                 ANYOF_CLASS_ZERO(ret);
3312
3313             need_class = 1;
3314
3315             /* a bad range like a-\d, a-[:digit:] ? */
3316             if (range) {
3317                 if (!SIZE_ONLY) {
3318                     if (ckWARN(WARN_REGEXP))
3319                         vWARN4(RExC_parse,
3320                                "False [] range \"%*.*s\"",
3321                                RExC_parse - rangebegin,
3322                                RExC_parse - rangebegin,
3323                                rangebegin);
3324                     if (lastvalue < 256) {
3325                         ANYOF_BITMAP_SET(ret, lastvalue);
3326                         ANYOF_BITMAP_SET(ret, '-');
3327                     }
3328                     else {
3329                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3330                         Perl_sv_catpvf(aTHX_ listsv,
3331                                        /* 0x002D is Unicode for '-' */
3332                                        "%04"UVxf"\n002D\n", (UV)lastvalue);
3333                     }
3334                 }
3335
3336                 range = 0; /* this was not a true range */
3337             }
3338
3339             if (!SIZE_ONLY) {
3340                 /* Possible truncation here but in some 64-bit environments
3341                  * the compiler gets heartburn about switch on 64-bit values.
3342                  * A similar issue a little earlier when switching on value.
3343                  * --jhi */
3344                 switch ((I32)namedclass) {
3345                 case ANYOF_ALNUM:
3346                     if (LOC)
3347                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3348                     else {
3349                         for (value = 0; value < 256; value++)
3350                             if (isALNUM(value))
3351                                 ANYOF_BITMAP_SET(ret, value);
3352                     }
3353                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3354                     break;
3355                 case ANYOF_NALNUM:
3356                     if (LOC)
3357                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3358                     else {
3359                         for (value = 0; value < 256; value++)
3360                             if (!isALNUM(value))
3361                                 ANYOF_BITMAP_SET(ret, value);
3362                     }
3363                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3364                     break;
3365                 case ANYOF_ALNUMC:
3366                     if (LOC)
3367                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3368                     else {
3369                         for (value = 0; value < 256; value++)
3370                             if (isALNUMC(value))
3371                                 ANYOF_BITMAP_SET(ret, value);
3372                     }
3373                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3374                     break;
3375                 case ANYOF_NALNUMC:
3376                     if (LOC)
3377                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3378                     else {
3379                         for (value = 0; value < 256; value++)
3380                             if (!isALNUMC(value))
3381                                 ANYOF_BITMAP_SET(ret, value);
3382                     }
3383                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3384                     break;
3385                 case ANYOF_ALPHA:
3386                     if (LOC)
3387                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3388                     else {
3389                         for (value = 0; value < 256; value++)
3390                             if (isALPHA(value))
3391                                 ANYOF_BITMAP_SET(ret, value);
3392                     }
3393                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3394                     break;
3395                 case ANYOF_NALPHA:
3396                     if (LOC)
3397                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3398                     else {
3399                         for (value = 0; value < 256; value++)
3400                             if (!isALPHA(value))
3401                                 ANYOF_BITMAP_SET(ret, value);
3402                     }
3403                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3404                     break;
3405                 case ANYOF_ASCII:
3406                     if (LOC)
3407                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3408                     else {
3409 #ifdef ASCIIish
3410                         for (value = 0; value < 128; value++)
3411                             ANYOF_BITMAP_SET(ret, value);
3412 #else  /* EBCDIC */
3413                         for (value = 0; value < 256; value++)
3414                             if (isASCII(value))
3415                                 ANYOF_BITMAP_SET(ret, value);
3416 #endif /* EBCDIC */
3417                     }
3418                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3419                     break;
3420                 case ANYOF_NASCII:
3421                     if (LOC)
3422                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3423                     else {
3424 #ifdef ASCIIish
3425                         for (value = 128; value < 256; value++)
3426                             ANYOF_BITMAP_SET(ret, value);
3427 #else  /* EBCDIC */
3428                         for (value = 0; value < 256; value++)
3429                             if (!isASCII(value))
3430                                 ANYOF_BITMAP_SET(ret, value);
3431 #endif /* EBCDIC */
3432                     }
3433                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3434                     break;
3435                 case ANYOF_BLANK:
3436                     if (LOC)
3437                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3438                     else {
3439                         for (value = 0; value < 256; value++)
3440                             if (isBLANK(value))
3441                                 ANYOF_BITMAP_SET(ret, value);
3442                     }
3443                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3444                     break;
3445                 case ANYOF_NBLANK:
3446                     if (LOC)
3447                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3448                     else {
3449                         for (value = 0; value < 256; value++)
3450                             if (!isBLANK(value))
3451                                 ANYOF_BITMAP_SET(ret, value);
3452                     }
3453                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3454                     break;
3455                 case ANYOF_CNTRL:
3456                     if (LOC)
3457                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3458                     else {
3459                         for (value = 0; value < 256; value++)
3460                             if (isCNTRL(value))
3461                                 ANYOF_BITMAP_SET(ret, value);
3462                     }
3463                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3464                     break;
3465                 case ANYOF_NCNTRL:
3466                     if (LOC)
3467                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3468                     else {
3469                         for (value = 0; value < 256; value++)
3470                             if (!isCNTRL(value))
3471                                 ANYOF_BITMAP_SET(ret, value);
3472                     }
3473                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3474                     break;
3475                 case ANYOF_DIGIT:
3476                     if (LOC)
3477                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3478                     else {
3479                         /* consecutive digits assumed */
3480                         for (value = '0'; value <= '9'; value++)
3481                             ANYOF_BITMAP_SET(ret, value);
3482                     }
3483                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3484                     break;
3485                 case ANYOF_NDIGIT:
3486                     if (LOC)
3487                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3488                     else {
3489                         /* consecutive digits assumed */
3490                         for (value = 0; value < '0'; value++)
3491                             ANYOF_BITMAP_SET(ret, value);
3492                         for (value = '9' + 1; value < 256; value++)
3493                             ANYOF_BITMAP_SET(ret, value);
3494                     }
3495                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3496                     break;
3497                 case ANYOF_GRAPH:
3498                     if (LOC)
3499                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3500                     else {
3501                         for (value = 0; value < 256; value++)
3502                             if (isGRAPH(value))
3503                                 ANYOF_BITMAP_SET(ret, value);
3504                     }
3505                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3506                     break;
3507                 case ANYOF_NGRAPH:
3508                     if (LOC)
3509                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3510                     else {
3511                         for (value = 0; value < 256; value++)
3512                             if (!isGRAPH(value))
3513                                 ANYOF_BITMAP_SET(ret, value);
3514                     }
3515                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3516                     break;
3517                 case ANYOF_LOWER:
3518                     if (LOC)
3519                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3520                     else {
3521                         for (value = 0; value < 256; value++)
3522                             if (isLOWER(value))
3523                                 ANYOF_BITMAP_SET(ret, value);
3524                     }
3525                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3526                     break;
3527                 case ANYOF_NLOWER:
3528                     if (LOC)
3529                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3530                     else {
3531                         for (value = 0; value < 256; value++)
3532                             if (!isLOWER(value))
3533                                 ANYOF_BITMAP_SET(ret, value);
3534                     }
3535                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3536                     break;
3537                 case ANYOF_PRINT:
3538                     if (LOC)
3539                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3540                     else {
3541                         for (value = 0; value < 256; value++)
3542                             if (isPRINT(value))
3543                                 ANYOF_BITMAP_SET(ret, value);
3544                     }
3545                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3546                     break;
3547                 case ANYOF_NPRINT:
3548                     if (LOC)
3549                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3550                     else {
3551                         for (value = 0; value < 256; value++)
3552                             if (!isPRINT(value))
3553                                 ANYOF_BITMAP_SET(ret, value);
3554                     }
3555                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3556                     break;
3557                 case ANYOF_PSXSPC:
3558                     if (LOC)
3559                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3560                     else {
3561                         for (value = 0; value < 256; value++)
3562                             if (isPSXSPC(value))
3563                                 ANYOF_BITMAP_SET(ret, value);
3564                     }
3565                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3566                     break;
3567                 case ANYOF_NPSXSPC:
3568                     if (LOC)
3569                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3570                     else {
3571                         for (value = 0; value < 256; value++)
3572                             if (!isPSXSPC(value))
3573                                 ANYOF_BITMAP_SET(ret, value);
3574                     }
3575                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3576                     break;
3577                 case ANYOF_PUNCT:
3578                     if (LOC)
3579                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3580                     else {
3581                         for (value = 0; value < 256; value++)
3582                             if (isPUNCT(value))
3583                                 ANYOF_BITMAP_SET(ret, value);
3584                     }
3585                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3586                     break;
3587                 case ANYOF_NPUNCT:
3588                     if (LOC)
3589                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3590                     else {
3591                         for (value = 0; value < 256; value++)
3592                             if (!isPUNCT(value))
3593                                 ANYOF_BITMAP_SET(ret, value);
3594                     }
3595                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3596                     break;
3597                 case ANYOF_SPACE:
3598                     if (LOC)
3599                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3600                     else {
3601                         for (value = 0; value < 256; value++)
3602                             if (isSPACE(value))
3603                                 ANYOF_BITMAP_SET(ret, value);
3604                     }
3605                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3606                     break;
3607                 case ANYOF_NSPACE:
3608                     if (LOC)
3609                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3610                     else {
3611                         for (value = 0; value < 256; value++)
3612                             if (!isSPACE(value))
3613                                 ANYOF_BITMAP_SET(ret, value);
3614                     }
3615                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
3616                     break;
3617                 case ANYOF_UPPER:
3618                     if (LOC)
3619                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3620                     else {
3621                         for (value = 0; value < 256; value++)
3622                             if (isUPPER(value))
3623                                 ANYOF_BITMAP_SET(ret, value);
3624                     }
3625                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
3626                     break;
3627                 case ANYOF_NUPPER:
3628                     if (LOC)
3629                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3630                     else {
3631                         for (value = 0; value < 256; value++)
3632                             if (!isUPPER(value))
3633                                 ANYOF_BITMAP_SET(ret, value);
3634                     }
3635                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
3636                     break;
3637                 case ANYOF_XDIGIT:
3638                     if (LOC)
3639                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3640                     else {
3641                         for (value = 0; value < 256; value++)
3642                             if (isXDIGIT(value))
3643                                 ANYOF_BITMAP_SET(ret, value);
3644                     }
3645                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
3646                     break;
3647                 case ANYOF_NXDIGIT:
3648                     if (LOC)
3649                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
3650                     else {
3651                         for (value = 0; value < 256; value++)
3652                             if (!isXDIGIT(value))
3653                                 ANYOF_BITMAP_SET(ret, value);
3654                     }
3655                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
3656                     break;
3657                 default:
3658                     vFAIL("Invalid [::] class");
3659                     break;
3660                 }
3661                 if (LOC)
3662                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
3663                 continue;
3664             }
3665         } /* end of namedclass \blah */
3666
3667         if (range) {
3668             if (lastvalue > value) /* b-a */ {
3669                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
3670                               RExC_parse - rangebegin,
3671                               RExC_parse - rangebegin,
3672                               rangebegin);
3673             }
3674             range = 0; /* not a true range */
3675         }
3676         else {
3677             lastvalue = value; /* save the beginning of the range */
3678             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3679                 RExC_parse[1] != ']') {
3680                 RExC_parse++;
3681
3682                 /* a bad range like \w-, [:word:]- ? */
3683                 if (namedclass > OOB_NAMEDCLASS) {
3684                     if (ckWARN(WARN_REGEXP))
3685                         vWARN4(RExC_parse,
3686                                "False [] range \"%*.*s\"",
3687                                RExC_parse - rangebegin,
3688                                RExC_parse - rangebegin,
3689                                rangebegin);
3690                     if (!SIZE_ONLY)
3691                         ANYOF_BITMAP_SET(ret, '-');
3692                 } else
3693                     range = 1;  /* yeah, it's a range! */
3694                 continue;       /* but do it the next time */
3695             }
3696         }
3697
3698         /* now is the next time */
3699         if (!SIZE_ONLY) {
3700             if (lastvalue < 256 && value < 256) {
3701 #ifndef ASCIIish /* EBCDIC, for example. */
3702                 if ((isLOWER(lastvalue) && isLOWER(value)) ||
3703                     (isUPPER(lastvalue) && isUPPER(value)))
3704                 {
3705                     IV i;
3706                     if (isLOWER(lastvalue)) {
3707                         for (i = lastvalue; i <= value; i++)
3708                             if (isLOWER(i))
3709                                 ANYOF_BITMAP_SET(ret, i);
3710                     } else {
3711                         for (i = lastvalue; i <= value; i++)
3712                             if (isUPPER(i))
3713                                 ANYOF_BITMAP_SET(ret, i);
3714                     }
3715                 }
3716                 else
3717 #endif
3718                     for ( ; lastvalue <= value; lastvalue++)
3719                         ANYOF_BITMAP_SET(ret, lastvalue);
3720             } else {
3721                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3722                 if (lastvalue < value)
3723                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3724                                    (UV)lastvalue, (UV)value);
3725                 else
3726                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
3727                                    (UV)value);
3728             }
3729         }
3730
3731         range = 0; /* this range (if it was one) is done now */
3732     }
3733
3734     if (need_class) {
3735         if (SIZE_ONLY)
3736             RExC_size += ANYOF_CLASS_ADD_SKIP;
3737         else
3738             RExC_emit += ANYOF_CLASS_ADD_SKIP;
3739     }
3740
3741     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
3742     if (!SIZE_ONLY &&
3743         (ANYOF_FLAGS(ret) &
3744          /* If the only flag is folding (plus possibly inversion). */
3745          (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
3746         for (value = 0; value < 256; ++value) {
3747             if (ANYOF_BITMAP_TEST(ret, value)) {
3748                 IV fold = PL_fold[value];
3749
3750                 if (fold != value)
3751                     ANYOF_BITMAP_SET(ret, fold);
3752             }
3753         }
3754         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
3755     }
3756
3757     /* optimize inverted simple patterns (e.g. [^a-z]) */
3758     if (!SIZE_ONLY &&
3759         /* If the only flag is inversion. */
3760         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
3761         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
3762             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
3763         ANYOF_FLAGS(ret) = 0;
3764     }
3765
3766     if (!SIZE_ONLY) { 
3767         AV *av = newAV();
3768         SV *rv;
3769
3770         av_store(av, 0, listsv);
3771         av_store(av, 1, NULL);
3772         rv = newRV_noinc((SV*)av);
3773         n = add_data(pRExC_state, 1, "s");
3774         RExC_rx->data->data[n] = (void*)rv;
3775         ARG_SET(ret, n);
3776     }
3777
3778     return ret;
3779 }
3780
3781 STATIC char*
3782 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
3783 {
3784     char* retval = RExC_parse++;
3785
3786     for (;;) {
3787         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
3788                 RExC_parse[2] == '#') {
3789             while (*RExC_parse && *RExC_parse != ')')
3790                 RExC_parse++;
3791             RExC_parse++;
3792             continue;
3793         }
3794         if (RExC_flags16 & PMf_EXTENDED) {
3795             if (isSPACE(*RExC_parse)) {
3796                 RExC_parse++;
3797                 continue;
3798             }
3799             else if (*RExC_parse == '#') {
3800                 while (*RExC_parse && *RExC_parse != '\n')
3801                     RExC_parse++;
3802                 RExC_parse++;
3803                 continue;
3804             }
3805         }
3806         return retval;
3807     }
3808 }
3809
3810 /*
3811 - reg_node - emit a node
3812 */
3813 STATIC regnode *                        /* Location. */
3814 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
3815 {
3816     register regnode *ret;
3817     register regnode *ptr;
3818
3819     ret = RExC_emit;
3820     if (SIZE_ONLY) {
3821         SIZE_ALIGN(RExC_size);
3822         RExC_size += 1;
3823         return(ret);
3824     }
3825
3826     NODE_ALIGN_FILL(ret);
3827     ptr = ret;
3828     FILL_ADVANCE_NODE(ptr, op);
3829     RExC_emit = ptr;
3830
3831     return(ret);
3832 }
3833
3834 /*
3835 - reganode - emit a node with an argument
3836 */
3837 STATIC regnode *                        /* Location. */
3838 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
3839 {
3840     register regnode *ret;
3841     register regnode *ptr;
3842
3843     ret = RExC_emit;
3844     if (SIZE_ONLY) {
3845         SIZE_ALIGN(RExC_size);
3846         RExC_size += 2;
3847         return(ret);
3848     }
3849
3850     NODE_ALIGN_FILL(ret);
3851     ptr = ret;
3852     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
3853     RExC_emit = ptr;
3854
3855     return(ret);
3856 }
3857
3858 /*
3859 - reguni - emit (if appropriate) a Unicode character
3860 */
3861 STATIC void
3862 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
3863 {
3864     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
3865 }
3866
3867 /*
3868 - reginsert - insert an operator in front of already-emitted operand
3869 *
3870 * Means relocating the operand.
3871 */
3872 STATIC void
3873 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
3874 {
3875     register regnode *src;
3876     register regnode *dst;
3877     register regnode *place;
3878     register int offset = regarglen[(U8)op];
3879     
3880 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
3881
3882     if (SIZE_ONLY) {
3883         RExC_size += NODE_STEP_REGNODE + offset;
3884         return;
3885     }
3886
3887     src = RExC_emit;
3888     RExC_emit += NODE_STEP_REGNODE + offset;
3889     dst = RExC_emit;
3890     while (src > opnd)
3891         StructCopy(--src, --dst, regnode);
3892
3893     place = opnd;               /* Op node, where operand used to be. */
3894     src = NEXTOPER(place);
3895     FILL_ADVANCE_NODE(place, op);
3896     Zero(src, offset, regnode);
3897 }
3898
3899 /*
3900 - regtail - set the next-pointer at the end of a node chain of p to val.
3901 */
3902 STATIC void
3903 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
3904 {
3905     register regnode *scan;
3906     register regnode *temp;
3907
3908     if (SIZE_ONLY)
3909         return;
3910
3911     /* Find last node. */
3912     scan = p;
3913     for (;;) {
3914         temp = regnext(scan);
3915         if (temp == NULL)
3916             break;
3917         scan = temp;
3918     }
3919
3920     if (reg_off_by_arg[OP(scan)]) {
3921         ARG_SET(scan, val - scan);
3922     }
3923     else {
3924         NEXT_OFF(scan) = val - scan;
3925     }
3926 }
3927
3928 /*
3929 - regoptail - regtail on operand of first argument; nop if operandless
3930 */
3931 STATIC void
3932 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
3933 {
3934     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
3935     if (p == NULL || SIZE_ONLY)
3936         return;
3937     if (PL_regkind[(U8)OP(p)] == BRANCH) {
3938         regtail(pRExC_state, NEXTOPER(p), val);
3939     }
3940     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
3941         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
3942     }
3943     else
3944         return;
3945 }
3946
3947 /*
3948  - regcurly - a