This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
aae2ceda5ff69aecf249a6378a15a1d925fa541a
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
/* NOTE: this is derived from Henry Spencer's regexp code, and should not
 * be confused with the original package (see point 3 below).  Thanks, Henry!
 */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors 
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif 
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2000, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifdef PERL_IN_XSUB_RE
87 #  if defined(PERL_CAPI) || defined(PERL_OBJECT)
88 #    include "XSUB.h"
89 #  endif
90 #else
91 #  include "INTERN.h"
92 #endif
93
94 #define REG_COMP_C
95 #include "regcomp.h"
96
97 #ifdef op
98 #undef op
99 #endif /* op */
100
101 #ifdef MSDOS
102 # if defined(BUGGY_MSC6)
103  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
104  # pragma optimize("a",off)
105  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
106  # pragma optimize("w",on )
107 # endif /* BUGGY_MSC6 */
108 #endif /* MSDOS */
109
110 #ifndef STATIC
111 #define STATIC  static
112 #endif
113
114 typedef struct RExC_state_t {
115     U16         flags16;                /* are we folding, multilining? */
116     char        *precomp;               /* uncompiled string. */
117     regexp      *rx;
118     char        *end;                   /* End of input for compile */
119     char        *parse;                 /* Input-scan pointer. */
120     I32         whilem_seen;            /* number of WHILEM in this expr */
121     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't */
122     I32         naughty;                /* How bad is this pattern? */
123     I32         sawback;                /* Did we see \1, ...? */
124     U32         seen;
125     I32         size;                   /* Code size. */
126     I32         npar;                   /* () count. */
127     I32         extralen;
128     I32         seen_zerolen;
129     I32         seen_evals;
130 #if ADD_TO_REGEXEC
131     char        *starttry;              /* -Dr: where regtry was called. */
132 #define RExC_starttry   (pRExC_state->starttry)
133 #endif
134 } RExC_state_t;
135
136 #define RExC_flags16    (pRExC_state->flags16)
137 #define RExC_precomp    (pRExC_state->precomp)
138 #define RExC_rx         (pRExC_state->rx)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_emit       (pRExC_state->emit)
143 #define RExC_naughty    (pRExC_state->naughty)
144 #define RExC_sawback    (pRExC_state->sawback)
145 #define RExC_seen       (pRExC_state->seen)
146 #define RExC_size       (pRExC_state->size)
147 #define RExC_npar       (pRExC_state->npar)
148 #define RExC_extralen   (pRExC_state->extralen)
149 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
150 #define RExC_seen_evals (pRExC_state->seen_evals)
151
/*
 * ISMULT1(c) -- true if the character c is one of the simple quantifiers
 *               '*', '+' or '?'.
 * ISMULT2(s) -- true if the string pointer s points at a quantifier: one of
 *               the above, or a '{' that regcurly() accepts.
 *
 * Both macros evaluate their argument more than once, so do not pass an
 * expression with side effects.  The argument of ISMULT2 is parenthesised
 * before being dereferenced so that ISMULT2(p + 1) examines p[1] rather
 * than *p + 1.
 */
#define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s)      (*(s) == '*' || *(s) == '+' || *(s) == '?' || \
        (*(s) == '{' && regcurly(s)))
155 #ifdef atarist
156 #define PERL_META       "^$.[()|?+*\\"
157 #else
158 #define META    "^$.[()|?+*\\"
159 #endif
160
161 #ifdef SPSTART
162 #undef SPSTART          /* dratted cpp namespace... */
163 #endif
164 /*
165  * Flags to be passed up and down.
166  */
167 #define WORST           0       /* Worst case. */
168 #define HASWIDTH        0x1     /* Known to match non-null strings. */
169 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
170 #define SPSTART         0x4     /* Starts with * or +. */
171 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
172
173 /* Length of a variant. */
174
175 typedef struct scan_data_t {
176     I32 len_min;
177     I32 len_delta;
178     I32 pos_min;
179     I32 pos_delta;
180     SV *last_found;
181     I32 last_end;                       /* min value, <0 unless valid. */
182     I32 last_start_min;
183     I32 last_start_max;
184     SV **longest;                       /* Either &l_fixed, or &l_float. */
185     SV *longest_fixed;
186     I32 offset_fixed;
187     SV *longest_float;
188     I32 offset_float_min;
189     I32 offset_float_max;
190     I32 flags;
191     I32 whilem_c;
192     I32 *last_closep;
193     struct regnode_charclass_class *start_class;
194 } scan_data_t;
195
196 /*
197  * Forward declarations for pregcomp()'s friends.
198  */
199
200 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
201                                       0, 0, 0, 0, 0, 0};
202
203 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
204 #define SF_BEFORE_SEOL          0x1
205 #define SF_BEFORE_MEOL          0x2
206 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
207 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208
209 #ifdef NO_UNARY_PLUS
210 #  define SF_FIX_SHIFT_EOL      (0+2)
211 #  define SF_FL_SHIFT_EOL               (0+4)
212 #else
213 #  define SF_FIX_SHIFT_EOL      (+2)
214 #  define SF_FL_SHIFT_EOL               (+4)
215 #endif
216
217 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
219
220 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
221 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
222 #define SF_IS_INF               0x40
223 #define SF_HAS_PAR              0x80
224 #define SF_IN_PAR               0x100
225 #define SF_HAS_EVAL             0x200
226 #define SCF_DO_SUBSTR           0x400
227 #define SCF_DO_STCLASS_AND      0x0800
228 #define SCF_DO_STCLASS_OR       0x1000
229 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
230 #define SCF_WHILEM_VISITED_POS  0x2000
231
232 #define RF_utf8         8
233 #define UTF (PL_reg_flags & RF_utf8)
234 #define LOC (RExC_flags16 & PMf_LOCALE)
235 #define FOLD (RExC_flags16 & PMf_FOLD)
236
237 #define OOB_CHAR8               1234
238 #define OOB_UTF8                123456
239 #define OOB_NAMEDCLASS          -1
240
/*
 * CHR_SVLEN(sv)  -- length of sv: sv_len_utf8() when UTF is set, otherwise
 *                   SvCUR().
 * CHR_DIST(a,b)  -- distance from b to a: utf8_distance() when UTF is set,
 *                   otherwise the plain pointer difference.
 *
 * The arguments of the pointer difference are parenthesised so that compound
 * expressions such as CHR_DIST(p + 5, p + 2) subtract the whole of `b'.
 * Each argument may be evaluated more than once across the two branches of
 * the conditional; avoid side effects.
 */
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : ((a) - (b)))
243
244
245 /* length of regex to show in messages that don't mark a position within */
246 #define RegexLengthToShowInErrorMessages 127
247
248 /*
249  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
250  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
251  * op/pragma/warn/regcomp.
252  */
253 #define MARKER1 "HERE"      /* marker as it appears in the description */
254 #define MARKER2 " << HERE "  /* marker as it appears within the regex */
255    
256 #define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
257
258 /*
259  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
260  * arg. Show regex, up to a maximum length. If it's too long, chop and add
261  * "...".
262  */
263 #define FAIL(msg)                                                             \
264     STMT_START {                                                             \
265         char *ellipses = "";                                                 \
266         unsigned len = strlen(RExC_precomp);                                \
267                                                                              \
268         if (!SIZE_ONLY)                                                      \
269             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
270                                                                              \
271         if (len > RegexLengthToShowInErrorMessages) {                        \
272             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
273             len = RegexLengthToShowInErrorMessages - 10;                     \
274             ellipses = "...";                                                \
275         }                                                                    \
276         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
277                    msg, (int)len, RExC_precomp, ellipses);                  \
278     } STMT_END
279
280 /*
281  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282  * args. Show regex, up to a maximum length. If it's too long, chop and add
283  * "...".
284  */
285 #define FAIL2(pat,msg)                                                        \
286     STMT_START {                                                             \
287         char *ellipses = "";                                                 \
288         unsigned len = strlen(RExC_precomp);                                \
289                                                                              \
290         if (!SIZE_ONLY)                                                      \
291             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
292                                                                              \
293         if (len > RegexLengthToShowInErrorMessages) {                        \
294             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
295             len = RegexLengthToShowInErrorMessages - 10;                     \
296             ellipses = "...";                                                \
297         }                                                                    \
298         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
299                     msg, (int)len, RExC_precomp, ellipses);                \
300     } STMT_END
301
302
303 /*
304  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
305  */
306 #define Simple_vFAIL(m)                                                      \
307     STMT_START {                                                             \
308       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
309                                                                              \
310       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
311                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
312     } STMT_END
313
314 /*
315  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316  */
317 #define vFAIL(m)                                                             \
318     STMT_START {                                                             \
319       if (!SIZE_ONLY)                                                        \
320             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
321       Simple_vFAIL(m);                                                       \
322     } STMT_END
323
324 /*
325  * Like Simple_vFAIL(), but accepts two arguments.
326  */
327 #define Simple_vFAIL2(m,a1)                                                  \
328     STMT_START {                                                             \
329       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
330                                                                              \
331       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
332                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
333     } STMT_END
334
335 /*
336  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
337  */
338 #define vFAIL2(m,a1)                                                         \
339     STMT_START {                                                             \
340       if (!SIZE_ONLY)                                                        \
341             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
342       Simple_vFAIL2(m, a1);                                                  \
343     } STMT_END
344
345
346 /*
347  * Like Simple_vFAIL(), but accepts three arguments.
348  */
349 #define Simple_vFAIL3(m, a1, a2)                                             \
350     STMT_START {                                                             \
351       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
352                                                                              \
353       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
354                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
355     } STMT_END
356
357 /*
358  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
359  */
360 #define vFAIL3(m,a1,a2)                                                      \
361     STMT_START {                                                             \
362       if (!SIZE_ONLY)                                                        \
363             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
364       Simple_vFAIL3(m, a1, a2);                                              \
365     } STMT_END
366
367 /*
368  * Like Simple_vFAIL(), but accepts four arguments.
369  */
370 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
371     STMT_START {                                                             \
372       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
373                                                                              \
374       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
375                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
376     } STMT_END
377
378 /*
379  * Like Simple_vFAIL(), but accepts five arguments.
380  */
381 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
382     STMT_START {                                                             \
383       unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
384       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
385                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
386     } STMT_END
387
388
/*
 * vWARN -- issue a WARN_REGEXP warning with message m, marking position
 * `loc' inside the pattern being compiled.
 *
 * The macro ends at STMT_END with no trailing line continuation, so the
 * line that follows the definition can never be spliced into it.
 */
#define vWARN(loc,m)                                                         \
    STMT_START {                                                             \
        unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));             \
        Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,                 \
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
    } STMT_END
395
396
397 #define vWARN2(loc, m, a1)                                                   \
398     STMT_START {                                                             \
399         unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));          \
400         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
401                  a1,                                                         \
402                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
403     } STMT_END
404
405 #define vWARN3(loc, m, a1, a2)                                               \
406     STMT_START {                                                             \
407       unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc));        \
408         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,                    \
409                  a1, a2,                                                     \
410                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
411     } STMT_END
412
413 #define vWARN4(loc, m, a1, a2, a3)                                           \
414     STMT_START {                                                             \
415       unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));            \
416         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
417                  a1, a2, a3,                                                 \
418                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
419     } STMT_END
420
421
422 /* Allow for side effects in s */
423 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
424
425 static void clear_re(pTHXo_ void *r);
426
427 /* Mark that we cannot extend a found fixed substring at this point.
428    Updata the longest found anchored substring and the longest found
429    floating substrings if needed. */
430
431 STATIC void
432 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
433 {
434     STRLEN l = CHR_SVLEN(data->last_found);
435     STRLEN old_l = CHR_SVLEN(*data->longest);
436     
437     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
438         sv_setsv(*data->longest, data->last_found);
439         if (*data->longest == data->longest_fixed) {
440             data->offset_fixed = l ? data->last_start_min : data->pos_min;
441             if (data->flags & SF_BEFORE_EOL)
442                 data->flags 
443                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
444             else
445                 data->flags &= ~SF_FIX_BEFORE_EOL;
446         }
447         else {
448             data->offset_float_min = l ? data->last_start_min : data->pos_min;
449             data->offset_float_max = (l 
450                                       ? data->last_start_max 
451                                       : data->pos_min + data->pos_delta);
452             if (data->flags & SF_BEFORE_EOL)
453                 data->flags 
454                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
455             else
456                 data->flags &= ~SF_FL_BEFORE_EOL;
457         }
458     }
459     SvCUR_set(data->last_found, 0);
460     data->last_end = -1;
461     data->flags &= ~SF_BEFORE_EOL;
462 }
463
464 /* Can match anything (initialization) */
465 STATIC void
466 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
467 {
468     int value;
469
470     ANYOF_CLASS_ZERO(cl);
471     for (value = 0; value < 256; ++value)
472         ANYOF_BITMAP_SET(cl, value);
473     cl->flags = ANYOF_EOS;
474     if (LOC)
475         cl->flags |= ANYOF_LOCALE;
476 }
477
478 /* Can match anything (initialization) */
479 STATIC int
480 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
481 {
482     int value;
483
484     for (value = 0; value <= ANYOF_MAX; value += 2)
485         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
486             return 1;
487     for (value = 0; value < 256; ++value)
488         if (!ANYOF_BITMAP_TEST(cl, value))
489             return 0;
490     return 1;
491 }
492
493 /* Can match anything (initialization) */
494 STATIC void
495 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
496 {
497     Zero(cl, 1, struct regnode_charclass_class);
498     cl->type = ANYOF;
499     cl_anything(pRExC_state, cl);
500 }
501
502 STATIC void
503 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
504 {
505     Zero(cl, 1, struct regnode_charclass_class);
506     cl->type = ANYOF;
507     cl_anything(pRExC_state, cl);
508     if (LOC)
509         cl->flags |= ANYOF_LOCALE;
510 }
511
512 /* 'And' a given class with another one.  Can create false positives */
513 /* We assume that cl is not inverted */
514 STATIC void
515 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
516          struct regnode_charclass_class *and_with)
517 {
518     if (!(and_with->flags & ANYOF_CLASS)
519         && !(cl->flags & ANYOF_CLASS)
520         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
521         && !(and_with->flags & ANYOF_FOLD)
522         && !(cl->flags & ANYOF_FOLD)) {
523         int i;
524
525         if (and_with->flags & ANYOF_INVERT)
526             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
527                 cl->bitmap[i] &= ~and_with->bitmap[i];
528         else
529             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
530                 cl->bitmap[i] &= and_with->bitmap[i];
531     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
532     if (!(and_with->flags & ANYOF_EOS))
533         cl->flags &= ~ANYOF_EOS;
534 }
535
536 /* 'OR' a given class with another one.  Can create false positives */
537 /* We assume that cl is not inverted */
538 STATIC void
539 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
540 {
541     if (or_with->flags & ANYOF_INVERT) {
542         /* We do not use
543          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
544          *   <= (B1 | !B2) | (CL1 | !CL2)
545          * which is wasteful if CL2 is small, but we ignore CL2:
546          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
547          * XXXX Can we handle case-fold?  Unclear:
548          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
549          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
550          */
551         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
552              && !(or_with->flags & ANYOF_FOLD)
553              && !(cl->flags & ANYOF_FOLD) ) {
554             int i;
555
556             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557                 cl->bitmap[i] |= ~or_with->bitmap[i];
558         } /* XXXX: logic is complicated otherwise */
559         else {
560             cl_anything(pRExC_state, cl);
561         }
562     } else {
563         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
564         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565              && (!(or_with->flags & ANYOF_FOLD) 
566                  || (cl->flags & ANYOF_FOLD)) ) {
567             int i;
568
569             /* OR char bitmap and class bitmap separately */
570             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571                 cl->bitmap[i] |= or_with->bitmap[i];
572             if (or_with->flags & ANYOF_CLASS) {
573                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
574                     cl->classflags[i] |= or_with->classflags[i];
575                 cl->flags |= ANYOF_CLASS;
576             }
577         }
578         else { /* XXXX: logic is complicated, leave it along for a moment. */
579             cl_anything(pRExC_state, cl);
580         }
581     }
582     if (or_with->flags & ANYOF_EOS)
583         cl->flags |= ANYOF_EOS;
584 }
585
/* REx optimizer.  Converts nodes into quicker variants "in place".
   Finds fixed substrings.  */
588
589 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
590    to the position after last scanned or to NULL. */
591
592 STATIC I32
593 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
594                         /* scanp: Start here (read-write). */
595                         /* deltap: Write maxlen-minlen here. */
596                         /* last: Stop before this one. */
597 {
598     I32 min = 0, pars = 0, code;
599     regnode *scan = *scanp, *next;
600     I32 delta = 0;
601     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
602     int is_inf_internal = 0;            /* The studied chunk is infinite */
603     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
604     scan_data_t data_fake;
605     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
606     
607     while (scan && OP(scan) != END && scan < last) {
608         /* Peephole optimizer: */
609
610         if (PL_regkind[(U8)OP(scan)] == EXACT) {
611             /* Merge several consecutive EXACTish nodes into one. */
612             regnode *n = regnext(scan);
613             U32 stringok = 1;
614 #ifdef DEBUGGING
615             regnode *stop = scan;
616 #endif 
617
618             next = scan + NODE_SZ_STR(scan);
619             /* Skip NOTHING, merge EXACT*. */
620             while (n &&
621                    ( PL_regkind[(U8)OP(n)] == NOTHING || 
622                      (stringok && (OP(n) == OP(scan))))
623                    && NEXT_OFF(n)
624                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
625                 if (OP(n) == TAIL || n > next)
626                     stringok = 0;
627                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
628                     NEXT_OFF(scan) += NEXT_OFF(n);
629                     next = n + NODE_STEP_REGNODE;
630 #ifdef DEBUGGING
631                     if (stringok)
632                         stop = n;
633 #endif 
634                     n = regnext(n);
635                 }
636                 else {
637                     int oldl = STR_LEN(scan);
638                     regnode *nnext = regnext(n);
639                     
640                     if (oldl + STR_LEN(n) > U8_MAX) 
641                         break;
642                     NEXT_OFF(scan) += NEXT_OFF(n);
643                     STR_LEN(scan) += STR_LEN(n);
644                     next = n + NODE_SZ_STR(n);
645                     /* Now we can overwrite *n : */
646                     Move(STRING(n), STRING(scan) + oldl,
647                          STR_LEN(n), char);
648 #ifdef DEBUGGING
649                     if (stringok)
650                         stop = next - 1;
651 #endif 
652                     n = nnext;
653                 }
654             }
655 #ifdef DEBUGGING
656             /* Allow dumping */
657             n = scan + NODE_SZ_STR(scan);
658             while (n <= stop) {
659                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
660                     OP(n) = OPTIMIZED;
661                     NEXT_OFF(n) = 0;
662                 }
663                 n++;
664             }
665 #endif
666         }
667         /* Follow the next-chain of the current node and optimize
668            away all the NOTHINGs from it.  */
669         if (OP(scan) != CURLYX) {
670             int max = (reg_off_by_arg[OP(scan)]
671                        ? I32_MAX
672                        /* I32 may be smaller than U16 on CRAYs! */
673                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
674             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
675             int noff;
676             regnode *n = scan;
677             
678             /* Skip NOTHING and LONGJMP. */
679             while ((n = regnext(n))
680                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
681                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
682                    && off + noff < max)
683                 off += noff;
684             if (reg_off_by_arg[OP(scan)])
685                 ARG(scan) = off;
686             else 
687                 NEXT_OFF(scan) = off;
688         }
689         /* The principal pseudo-switch.  Cannot be a switch, since we
690            look into several different things.  */
691         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ 
692                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
693             next = regnext(scan);
694             code = OP(scan);
695             
696             if (OP(next) == code || code == IFTHEN || code == SUSPEND) { 
697                 I32 max1 = 0, min1 = I32_MAX, num = 0;
698                 struct regnode_charclass_class accum;
699                 
700                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
701                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
702                 if (flags & SCF_DO_STCLASS)
703                     cl_init_zero(pRExC_state, &accum);
704                 while (OP(scan) == code) {
705                     I32 deltanext, minnext, f = 0, fake;
706                     struct regnode_charclass_class this_class;
707
708                     num++;
709                     data_fake.flags = 0;
710                     if (data) {             
711                         data_fake.whilem_c = data->whilem_c;
712                         data_fake.last_closep = data->last_closep;
713                     }
714                     else
715                         data_fake.last_closep = &fake;
716                     next = regnext(scan);
717                     scan = NEXTOPER(scan);
718                     if (code != BRANCH)
719                         scan = NEXTOPER(scan);
720                     if (flags & SCF_DO_STCLASS) {
721                         cl_init(pRExC_state, &this_class);
722                         data_fake.start_class = &this_class;
723                         f = SCF_DO_STCLASS_AND;
724                     }               
725                     if (flags & SCF_WHILEM_VISITED_POS)
726                         f |= SCF_WHILEM_VISITED_POS;
727                     /* we suppose the run is continuous, last=next...*/
728                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
729                                           next, &data_fake, f);
730                     if (min1 > minnext) 
731                         min1 = minnext;
732                     if (max1 < minnext + deltanext)
733                         max1 = minnext + deltanext;
734                     if (deltanext == I32_MAX)
735                         is_inf = is_inf_internal = 1;
736                     scan = next;
737                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
738                         pars++;
739                     if (data && (data_fake.flags & SF_HAS_EVAL))
740                         data->flags |= SF_HAS_EVAL;
741                     if (data)
742                         data->whilem_c = data_fake.whilem_c;
743                     if (flags & SCF_DO_STCLASS)
744                         cl_or(pRExC_state, &accum, &this_class);
745                     if (code == SUSPEND) 
746                         break;
747                 }
748                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
749                     min1 = 0;
750                 if (flags & SCF_DO_SUBSTR) {
751                     data->pos_min += min1;
752                     data->pos_delta += max1 - min1;
753                     if (max1 != min1 || is_inf)
754                         data->longest = &(data->longest_float);
755                 }
756                 min += min1;
757                 delta += max1 - min1;
758                 if (flags & SCF_DO_STCLASS_OR) {
759                     cl_or(pRExC_state, data->start_class, &accum);
760                     if (min1) {
761                         cl_and(data->start_class, &and_with);
762                         flags &= ~SCF_DO_STCLASS;
763                     }
764                 }
765                 else if (flags & SCF_DO_STCLASS_AND) {
766                     if (min1) {
767                         cl_and(data->start_class, &accum);
768                         flags &= ~SCF_DO_STCLASS;
769                     }
770                     else {
771                         /* Switch to OR mode: cache the old value of 
772                          * data->start_class */
773                         StructCopy(data->start_class, &and_with,
774                                    struct regnode_charclass_class);
775                         flags &= ~SCF_DO_STCLASS_AND;
776                         StructCopy(&accum, data->start_class,
777                                    struct regnode_charclass_class);
778                         flags |= SCF_DO_STCLASS_OR;
779                         data->start_class->flags |= ANYOF_EOS;
780                     }
781                 }
782             }
783             else if (code == BRANCHJ)   /* single branch is optimized. */
784                 scan = NEXTOPER(NEXTOPER(scan));
785             else                        /* single branch is optimized. */
786                 scan = NEXTOPER(scan);
787             continue;
788         }
789         else if (OP(scan) == EXACT) {
790             I32 l = STR_LEN(scan);
791             if (UTF) {
792                 unsigned char *s = (unsigned char *)STRING(scan);
793                 unsigned char *e = s + l;
794                 I32 newl = 0;
795                 while (s < e) {
796                     newl++;
797                     s += UTF8SKIP(s);
798                 }
799                 l = newl;
800             }
801             min += l;
802             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
803                 /* The code below prefers earlier match for fixed
804                    offset, later match for variable offset.  */
805                 if (data->last_end == -1) { /* Update the start info. */
806                     data->last_start_min = data->pos_min;
807                     data->last_start_max = is_inf
808                         ? I32_MAX : data->pos_min + data->pos_delta; 
809                 }
810                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
811                 data->last_end = data->pos_min + l;
812                 data->pos_min += l; /* As in the first entry. */
813                 data->flags &= ~SF_BEFORE_EOL;
814             }
815             if (flags & SCF_DO_STCLASS_AND) {
816                 /* Check whether it is compatible with what we know already! */
817                 int compat = 1;
818
819                 if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
820                     && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
821                     && (!(data->start_class->flags & ANYOF_FOLD)
822                         || !ANYOF_BITMAP_TEST(data->start_class,
823                                               PL_fold[*(U8*)STRING(scan)])))
824                     compat = 0;
825                 ANYOF_CLASS_ZERO(data->start_class);
826                 ANYOF_BITMAP_ZERO(data->start_class);
827                 if (compat)
828                     ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
829                 data->start_class->flags &= ~ANYOF_EOS;
830             }
831             else if (flags & SCF_DO_STCLASS_OR) {
832                 /* false positive possible if the class is case-folded */
833                 ANYOF_BITMAP_SET(data->start_class, *STRING(scan));     
834                 data->start_class->flags &= ~ANYOF_EOS;
835                 cl_and(data->start_class, &and_with);
836             }
837             flags &= ~SCF_DO_STCLASS;
838         }
839         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
840             I32 l = STR_LEN(scan);
841
842             /* Search for fixed substrings supports EXACT only. */
843             if (flags & SCF_DO_SUBSTR) 
844                 scan_commit(pRExC_state, data);
845             if (UTF) {
846                 unsigned char *s = (unsigned char *)STRING(scan);
847                 unsigned char *e = s + l;
848                 I32 newl = 0;
849                 while (s < e) {
850                     newl++;
851                     s += UTF8SKIP(s);
852                 }
853                 l = newl;
854             }
855             min += l;
856             if (data && (flags & SCF_DO_SUBSTR))
857                 data->pos_min += l;
858             if (flags & SCF_DO_STCLASS_AND) {
859                 /* Check whether it is compatible with what we know already! */
860                 int compat = 1;
861
862                 if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) 
863                     && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
864                     && !ANYOF_BITMAP_TEST(data->start_class, 
865                                           PL_fold[*(U8*)STRING(scan)]))
866                     compat = 0;
867                 ANYOF_CLASS_ZERO(data->start_class);
868                 ANYOF_BITMAP_ZERO(data->start_class);
869                 if (compat) {
870                     ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
871                     data->start_class->flags &= ~ANYOF_EOS;
872                     data->start_class->flags |= ANYOF_FOLD;
873                     if (OP(scan) == EXACTFL)
874                         data->start_class->flags |= ANYOF_LOCALE;
875                 }
876             }
877             else if (flags & SCF_DO_STCLASS_OR) {
878                 if (data->start_class->flags & ANYOF_FOLD) {
879                     /* false positive possible if the class is case-folded.
880                        Assume that the locale settings are the same... */
881                     ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); 
882                     data->start_class->flags &= ~ANYOF_EOS;
883                 }
884                 cl_and(data->start_class, &and_with);
885             }
886             flags &= ~SCF_DO_STCLASS;
887         }
888         else if (strchr((char*)PL_varies,OP(scan))) {
889             I32 mincount, maxcount, minnext, deltanext, fl;
890             I32 f = flags, pos_before = 0;
891             regnode *oscan = scan;
892             struct regnode_charclass_class this_class;
893             struct regnode_charclass_class *oclass = NULL;
894
895             switch (PL_regkind[(U8)OP(scan)]) {
896             case WHILEM:                /* End of (?:...)* . */
897                 scan = NEXTOPER(scan);
898                 goto finish;
899             case PLUS:
900                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
901                     next = NEXTOPER(scan);
902                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
903                         mincount = 1; 
904                         maxcount = REG_INFTY; 
905                         next = regnext(scan);
906                         scan = NEXTOPER(scan);
907                         goto do_curly;
908                     }
909                 }
910                 if (flags & SCF_DO_SUBSTR)
911                     data->pos_min++;
912                 min++;
913                 /* Fall through. */
914             case STAR:
915                 if (flags & SCF_DO_STCLASS) {
916                     mincount = 0;
917                     maxcount = REG_INFTY; 
918                     next = regnext(scan);
919                     scan = NEXTOPER(scan);
920                     goto do_curly;
921                 }
922                 is_inf = is_inf_internal = 1; 
923                 scan = regnext(scan);
924                 if (flags & SCF_DO_SUBSTR) {
925                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
926                     data->longest = &(data->longest_float);
927                 }
928                 goto optimize_curly_tail;
929             case CURLY:
930                 mincount = ARG1(scan); 
931                 maxcount = ARG2(scan);
932                 next = regnext(scan);
933                 if (OP(scan) == CURLYX) {
934                     I32 lp = (data ? *(data->last_closep) : 0);
935
936                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
937                 }
938                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
939               do_curly:
940                 if (flags & SCF_DO_SUBSTR) {
941                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
942                     pos_before = data->pos_min;
943                 }
944                 if (data) {
945                     fl = data->flags;
946                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
947                     if (is_inf)
948                         data->flags |= SF_IS_INF;
949                 }
950                 if (flags & SCF_DO_STCLASS) {
951                     cl_init(pRExC_state, &this_class);
952                     oclass = data->start_class;
953                     data->start_class = &this_class;
954                     f |= SCF_DO_STCLASS_AND;
955                     f &= ~SCF_DO_STCLASS_OR;
956                 }
957                 /* These are the cases when once a subexpression
958                    fails at a particular position, it cannot succeed
959                    even after backtracking at the enclosing scope.
960                    
961                    XXXX what if minimal match and we are at the
962                         initial run of {n,m}? */
963                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
964                     f &= ~SCF_WHILEM_VISITED_POS;
965
966                 /* This will finish on WHILEM, setting scan, or on NULL: */
967                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, 
968                                       mincount == 0 
969                                         ? (f & ~SCF_DO_SUBSTR) : f);
970
971                 if (flags & SCF_DO_STCLASS)
972                     data->start_class = oclass;
973                 if (mincount == 0 || minnext == 0) {
974                     if (flags & SCF_DO_STCLASS_OR) {
975                         cl_or(pRExC_state, data->start_class, &this_class);
976                     }
977                     else if (flags & SCF_DO_STCLASS_AND) {
978                         /* Switch to OR mode: cache the old value of 
979                          * data->start_class */
980                         StructCopy(data->start_class, &and_with,
981                                    struct regnode_charclass_class);
982                         flags &= ~SCF_DO_STCLASS_AND;
983                         StructCopy(&this_class, data->start_class,
984                                    struct regnode_charclass_class);
985                         flags |= SCF_DO_STCLASS_OR;
986                         data->start_class->flags |= ANYOF_EOS;
987                     }
988                 } else {                /* Non-zero len */
989                     if (flags & SCF_DO_STCLASS_OR) {
990                         cl_or(pRExC_state, data->start_class, &this_class);
991                         cl_and(data->start_class, &and_with);
992                     }
993                     else if (flags & SCF_DO_STCLASS_AND)
994                         cl_and(data->start_class, &this_class);
995                     flags &= ~SCF_DO_STCLASS;
996                 }
997                 if (!scan)              /* It was not CURLYX, but CURLY. */
998                     scan = next;
999                 if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) 
1000                     && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1001                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1002                 {
1003                     vWARN(RExC_parse,
1004                           "Quantifier unexpected on zero-length expression");
1005                 }
1006
1007                 min += minnext * mincount;
1008                 is_inf_internal |= ((maxcount == REG_INFTY 
1009                                      && (minnext + deltanext) > 0)
1010                                     || deltanext == I32_MAX);
1011                 is_inf |= is_inf_internal;
1012                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1013
1014                 /* Try powerful optimization CURLYX => CURLYN. */
1015                 if (  OP(oscan) == CURLYX && data 
1016                       && data->flags & SF_IN_PAR
1017                       && !(data->flags & SF_HAS_EVAL)
1018                       && !deltanext && minnext == 1 ) {
1019                     /* Try to optimize to CURLYN.  */
1020                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1021                     regnode *nxt1 = nxt, *nxt2;
1022
1023                     /* Skip open. */
1024                     nxt = regnext(nxt);
1025                     if (!strchr((char*)PL_simple,OP(nxt))
1026                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1027                              && STR_LEN(nxt) == 1)) 
1028                         goto nogo;
1029                     nxt2 = nxt;
1030                     nxt = regnext(nxt);
1031                     if (OP(nxt) != CLOSE) 
1032                         goto nogo;
1033                     /* Now we know that nxt2 is the only contents: */
1034                     oscan->flags = ARG(nxt);
1035                     OP(oscan) = CURLYN;
1036                     OP(nxt1) = NOTHING; /* was OPEN. */
1037 #ifdef DEBUGGING
1038                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1039                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
1040                     NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
1041                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1042                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1043                     NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
1044 #endif 
1045                 }
1046               nogo:
1047
1048                 /* Try optimization CURLYX => CURLYM. */
1049                 if (  OP(oscan) == CURLYX && data 
1050                       && !(data->flags & SF_HAS_PAR)
1051                       && !(data->flags & SF_HAS_EVAL)
1052                       && !deltanext  ) {
1053                     /* XXXX How to optimize if data == 0? */
1054                     /* Optimize to a simpler form.  */
1055                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1056                     regnode *nxt2;
1057
1058                     OP(oscan) = CURLYM;
1059                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1060                             && (OP(nxt2) != WHILEM)) 
1061                         nxt = nxt2;
1062                     OP(nxt2)  = SUCCEED; /* Was WHILEM */
1063                     /* Need to optimize away parenths. */
1064                     if (data->flags & SF_IN_PAR) {
1065                         /* Set the parenth number.  */
1066                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1067
1068                         if (OP(nxt) != CLOSE) 
1069                             FAIL("Panic opt close");
1070                         oscan->flags = ARG(nxt);
1071                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1072                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1073 #ifdef DEBUGGING
1074                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1075                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1076                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
1077                         NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
1078 #endif 
1079 #if 0
1080                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1081                             regnode *nnxt = regnext(nxt1);
1082                             
1083                             if (nnxt == nxt) {
1084                                 if (reg_off_by_arg[OP(nxt1)])
1085                                     ARG_SET(nxt1, nxt2 - nxt1);
1086                                 else if (nxt2 - nxt1 < U16_MAX)
1087                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1088                                 else
1089                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1090                             }
1091                             nxt1 = nnxt;
1092                         }
1093 #endif
1094                         /* Optimize again: */
1095                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt, 
1096                                     NULL, 0);
1097                     }
1098                     else
1099                         oscan->flags = 0;
1100                 }
1101                 else if ((OP(oscan) == CURLYX)
1102                          && (flags & SCF_WHILEM_VISITED_POS)
1103                          /* See the comment on a similar expression above.
1104                             However, this time it is not a subexpression
1105                             we care about, but the expression itself. */
1106                          && (maxcount == REG_INFTY)
1107                          && data && ++data->whilem_c < 16) {
1108                     /* This stays as CURLYX, we can put the count/of pair. */
1109                     /* Find WHILEM (as in regexec.c) */
1110                     regnode *nxt = oscan + NEXT_OFF(oscan);
1111
1112                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1113                         nxt += ARG(nxt);
1114                     PREVOPER(nxt)->flags = data->whilem_c
1115                         | (RExC_whilem_seen << 4); /* On WHILEM */
1116                 }
1117                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) 
1118                     pars++;
1119                 if (flags & SCF_DO_SUBSTR) {
1120                     SV *last_str = Nullsv;
1121                     int counted = mincount != 0;
1122
1123                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1124                         I32 b = pos_before >= data->last_start_min 
1125                             ? pos_before : data->last_start_min;
1126                         STRLEN l;
1127                         char *s = SvPV(data->last_found, l);
1128                         I32 old = b - data->last_start_min;
1129
1130                         if (UTF)
1131                             old = utf8_hop((U8*)s, old) - (U8*)s;
1132                         
1133                         l -= old;
1134                         /* Get the added string: */
1135                         last_str = newSVpvn(s  + old, l);
1136                         if (deltanext == 0 && pos_before == b) {
1137                             /* What was added is a constant string */
1138                             if (mincount > 1) {
1139                                 SvGROW(last_str, (mincount * l) + 1);
1140                                 repeatcpy(SvPVX(last_str) + l, 
1141                                           SvPVX(last_str), l, mincount - 1);
1142                                 SvCUR(last_str) *= mincount;
1143                                 /* Add additional parts. */
1144                                 SvCUR_set(data->last_found, 
1145                                           SvCUR(data->last_found) - l);
1146                                 sv_catsv(data->last_found, last_str);
1147                                 data->last_end += l * (mincount - 1);
1148                             }
1149                         } else {
1150                             /* start offset must point into the last copy */
1151                             data->last_start_min += minnext * (mincount - 1);
1152                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1153                                 * (minnext + data->pos_delta);
1154                         }
1155                     }
1156                     /* It is counted once already... */
1157                     data->pos_min += minnext * (mincount - counted);
1158                     data->pos_delta += - counted * deltanext +
1159                         (minnext + deltanext) * maxcount - minnext * mincount;
1160                     if (mincount != maxcount) {
1161                          /* Cannot extend fixed substrings found inside
1162                             the group.  */
1163                         scan_commit(pRExC_state,data);
1164                         if (mincount && last_str) {
1165                             sv_setsv(data->last_found, last_str);
1166                             data->last_end = data->pos_min;
1167                             data->last_start_min = 
1168                                 data->pos_min - CHR_SVLEN(last_str);
1169                             data->last_start_max = is_inf 
1170                                 ? I32_MAX 
1171                                 : data->pos_min + data->pos_delta
1172                                 - CHR_SVLEN(last_str);
1173                         }
1174                         data->longest = &(data->longest_float);
1175                     }
1176                     SvREFCNT_dec(last_str);
1177                 }
1178                 if (data && (fl & SF_HAS_EVAL))
1179                     data->flags |= SF_HAS_EVAL;
1180               optimize_curly_tail:
1181                 if (OP(oscan) != CURLYX) {
1182                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1183                            && NEXT_OFF(next))
1184                         NEXT_OFF(oscan) += NEXT_OFF(next);
1185                 }
1186                 continue;
1187             default:                    /* REF and CLUMP only? */
1188                 if (flags & SCF_DO_SUBSTR) {
1189                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1190                     data->longest = &(data->longest_float);
1191                 }
1192                 is_inf = is_inf_internal = 1;
1193                 if (flags & SCF_DO_STCLASS_OR)
1194                     cl_anything(pRExC_state, data->start_class);
1195                 flags &= ~SCF_DO_STCLASS;
1196                 break;
1197             }
1198         }
1199         else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
1200             int value;
1201
1202             if (flags & SCF_DO_SUBSTR) {
1203                 scan_commit(pRExC_state,data);
1204                 data->pos_min++;
1205             }
1206             min++;
1207             if (flags & SCF_DO_STCLASS) {
1208                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1209
1210                 /* Some of the logic below assumes that switching
1211                    locale on will only add false positives. */
1212                 switch (PL_regkind[(U8)OP(scan)]) {
1213                 case ANYUTF8:
1214                 case SANY:
1215                 case SANYUTF8:
1216                 case ALNUMUTF8:
1217                 case ANYOFUTF8:
1218                 case ALNUMLUTF8:
1219                 case NALNUMUTF8:
1220                 case NALNUMLUTF8:
1221                 case SPACEUTF8:
1222                 case NSPACEUTF8:
1223                 case SPACELUTF8:
1224                 case NSPACELUTF8:
1225                 case DIGITUTF8:
1226                 case NDIGITUTF8:
1227                 default:
1228                   do_default:
1229                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1230                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1231                         cl_anything(pRExC_state, data->start_class);
1232                     break;
1233                 case REG_ANY:
1234                     if (OP(scan) == SANY)
1235                         goto do_default;
1236                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1237                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1238                                  || (data->start_class->flags & ANYOF_CLASS));
1239                         cl_anything(pRExC_state, data->start_class);
1240                     }
1241                     if (flags & SCF_DO_STCLASS_AND || !value)
1242                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1243                     break;
1244                 case ANYOF:
1245                     if (flags & SCF_DO_STCLASS_AND)
1246                         cl_and(data->start_class,
1247                                (struct regnode_charclass_class*)scan);
1248                     else
1249                         cl_or(pRExC_state, data->start_class,
1250                               (struct regnode_charclass_class*)scan);
1251                     break;
1252                 case ALNUM:
1253                     if (flags & SCF_DO_STCLASS_AND) {
1254                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1255                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1256                             for (value = 0; value < 256; value++)
1257                                 if (!isALNUM(value))
1258                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1259                         }
1260                     }
1261                     else {
1262                         if (data->start_class->flags & ANYOF_LOCALE)
1263                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1264                         else {
1265                             for (value = 0; value < 256; value++)
1266                                 if (isALNUM(value))
1267                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1268                         }
1269                     }
1270                     break;
1271                 case ALNUML:
1272                     if (flags & SCF_DO_STCLASS_AND) {
1273                         if (data->start_class->flags & ANYOF_LOCALE)
1274                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1275                     }
1276                     else {
1277                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1278                         data->start_class->flags |= ANYOF_LOCALE;
1279                     }
1280                     break;
1281                 case NALNUM:
1282                     if (flags & SCF_DO_STCLASS_AND) {
1283                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1284                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1285                             for (value = 0; value < 256; value++)
1286                                 if (isALNUM(value))
1287                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1288                         }
1289                     }
1290                     else {
1291                         if (data->start_class->flags & ANYOF_LOCALE)
1292                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1293                         else {
1294                             for (value = 0; value < 256; value++)
1295                                 if (!isALNUM(value))
1296                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1297                         }
1298                     }
1299                     break;
1300                 case NALNUML:
1301                     if (flags & SCF_DO_STCLASS_AND) {
1302                         if (data->start_class->flags & ANYOF_LOCALE)
1303                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1304                     }
1305                     else {
1306                         data->start_class->flags |= ANYOF_LOCALE;
1307                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1308                     }
1309                     break;
1310                 case SPACE:
1311                     if (flags & SCF_DO_STCLASS_AND) {
1312                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1313                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1314                             for (value = 0; value < 256; value++)
1315                                 if (!isSPACE(value))
1316                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1317                         }
1318                     }
1319                     else {
1320                         if (data->start_class->flags & ANYOF_LOCALE)
1321                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1322                         else {
1323                             for (value = 0; value < 256; value++)
1324                                 if (isSPACE(value))
1325                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1326                         }
1327                     }
1328                     break;
1329                 case SPACEL:
1330                     if (flags & SCF_DO_STCLASS_AND) {
1331                         if (data->start_class->flags & ANYOF_LOCALE)
1332                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1333                     }
1334                     else {
1335                         data->start_class->flags |= ANYOF_LOCALE;
1336                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1337                     }
1338                     break;
1339                 case NSPACE:
1340                     if (flags & SCF_DO_STCLASS_AND) {
1341                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1342                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1343                             for (value = 0; value < 256; value++)
1344                                 if (isSPACE(value))
1345                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1346                         }
1347                     }
1348                     else {
1349                         if (data->start_class->flags & ANYOF_LOCALE)
1350                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1351                         else {
1352                             for (value = 0; value < 256; value++)
1353                                 if (!isSPACE(value))
1354                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1355                         }
1356                     }
1357                     break;
1358                 case NSPACEL:
1359                     if (flags & SCF_DO_STCLASS_AND) {
1360                         if (data->start_class->flags & ANYOF_LOCALE) {
1361                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1362                             for (value = 0; value < 256; value++)
1363                                 if (!isSPACE(value))
1364                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1365                         }
1366                     }
1367                     else {
1368                         data->start_class->flags |= ANYOF_LOCALE;
1369                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1370                     }
1371                     break;
1372                 case DIGIT:
1373                     if (flags & SCF_DO_STCLASS_AND) {
1374                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1375                         for (value = 0; value < 256; value++)
1376                             if (!isDIGIT(value))
1377                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1378                     }
1379                     else {
1380                         if (data->start_class->flags & ANYOF_LOCALE)
1381                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1382                         else {
1383                             for (value = 0; value < 256; value++)
1384                                 if (isDIGIT(value))
1385                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1386                         }
1387                     }
1388                     break;
1389                 case NDIGIT:
1390                     if (flags & SCF_DO_STCLASS_AND) {
1391                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1392                         for (value = 0; value < 256; value++)
1393                             if (isDIGIT(value))
1394                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1395                     }
1396                     else {
1397                         if (data->start_class->flags & ANYOF_LOCALE)
1398                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1399                         else {
1400                             for (value = 0; value < 256; value++)
1401                                 if (!isDIGIT(value))
1402                                     ANYOF_BITMAP_SET(data->start_class, value);                     
1403                         }
1404                     }
1405                     break;
1406                 }
1407                 if (flags & SCF_DO_STCLASS_OR)
1408                     cl_and(data->start_class, &and_with);
1409                 flags &= ~SCF_DO_STCLASS;
1410             }
1411         }
1412         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1413             data->flags |= (OP(scan) == MEOL
1414                             ? SF_BEFORE_MEOL
1415                             : SF_BEFORE_SEOL);
1416         }
1417         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1418                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1419                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1420                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1421             /* Lookahead/lookbehind */
1422             I32 deltanext, minnext, fake = 0;
1423             regnode *nscan;
1424             struct regnode_charclass_class intrnl;
1425             int f = 0;
1426
1427             data_fake.flags = 0;
1428             if (data) {             
1429                 data_fake.whilem_c = data->whilem_c;
1430                 data_fake.last_closep = data->last_closep;
1431             }
1432             else
1433                 data_fake.last_closep = &fake;
1434             if ( flags & SCF_DO_STCLASS && !scan->flags
1435                  && OP(scan) == IFMATCH ) { /* Lookahead */
1436                 cl_init(pRExC_state, &intrnl);
1437                 data_fake.start_class = &intrnl;
1438                 f |= SCF_DO_STCLASS_AND;
1439             }
1440             if (flags & SCF_WHILEM_VISITED_POS)
1441                 f |= SCF_WHILEM_VISITED_POS;
1442             next = regnext(scan);
1443             nscan = NEXTOPER(NEXTOPER(scan));
1444             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1445             if (scan->flags) {
1446                 if (deltanext) {
1447                     vFAIL("Variable length lookbehind not implemented");
1448                 }
1449                 else if (minnext > U8_MAX) {
1450                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1451                 }
1452                 scan->flags = minnext;
1453             }
1454             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1455                 pars++;
1456             if (data && (data_fake.flags & SF_HAS_EVAL))
1457                 data->flags |= SF_HAS_EVAL;
1458             if (data)
1459                 data->whilem_c = data_fake.whilem_c;
1460             if (f & SCF_DO_STCLASS_AND) {
1461                 int was = (data->start_class->flags & ANYOF_EOS);
1462
1463                 cl_and(data->start_class, &intrnl);
1464                 if (was)
1465                     data->start_class->flags |= ANYOF_EOS;
1466             }
1467         }
1468         else if (OP(scan) == OPEN) {
1469             pars++;
1470         }
1471         else if (OP(scan) == CLOSE) {
1472             if (ARG(scan) == is_par) {
1473                 next = regnext(scan);
1474
1475                 if ( next && (OP(next) != WHILEM) && next < last)
1476                     is_par = 0;         /* Disable optimization */
1477             }
1478             if (data)
1479                 *(data->last_closep) = ARG(scan);
1480         }
1481         else if (OP(scan) == EVAL) {
1482                 if (data)
1483                     data->flags |= SF_HAS_EVAL;
1484         }
1485         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1486                 if (flags & SCF_DO_SUBSTR) {
1487                     scan_commit(pRExC_state,data);
1488                     data->longest = &(data->longest_float);
1489                 }
1490                 is_inf = is_inf_internal = 1;
1491                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1492                     cl_anything(pRExC_state, data->start_class);
1493                 flags &= ~SCF_DO_STCLASS;
1494         }
1495         /* Else: zero-length, ignore. */
1496         scan = regnext(scan);
1497     }
1498
1499   finish:
1500     *scanp = scan;
1501     *deltap = is_inf_internal ? I32_MAX : delta;
1502     if (flags & SCF_DO_SUBSTR && is_inf) 
1503         data->pos_delta = I32_MAX - data->pos_min;
1504     if (is_par > U8_MAX)
1505         is_par = 0;
1506     if (is_par && pars==1 && data) {
1507         data->flags |= SF_IN_PAR;
1508         data->flags &= ~SF_HAS_PAR;
1509     }
1510     else if (pars && data) {
1511         data->flags |= SF_HAS_PAR;
1512         data->flags &= ~SF_IN_PAR;
1513     }
1514     if (flags & SCF_DO_STCLASS_OR)
1515         cl_and(data->start_class, &and_with);
1516     return min;
1517 }
1518
1519 STATIC I32
1520 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1521 {
1522     if (RExC_rx->data) {
1523         Renewc(RExC_rx->data, 
1524                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), 
1525                char, struct reg_data);
1526         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1527         RExC_rx->data->count += n;
1528     }
1529     else {
1530         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1531              char, struct reg_data);
1532         New(1208, RExC_rx->data->what, n, U8);
1533         RExC_rx->data->count = n;
1534     }
1535     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1536     return RExC_rx->data->count - n;
1537 }
1538
1539 void
1540 Perl_reginitcolors(pTHX)
1541 {
1542     int i = 0;
1543     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1544             
1545     if (s) {
1546         PL_colors[0] = s = savepv(s);
1547         while (++i < 6) {
1548             s = strchr(s, '\t');
1549             if (s) {
1550                 *s = '\0';
1551                 PL_colors[i] = ++s;
1552             }
1553             else
1554                 PL_colors[i] = s = "";
1555         }
1556     } else {
1557         while (i < 6) 
1558             PL_colors[i++] = "";
1559     }
1560     PL_colorset = 1;
1561 }
1562
1563
1564 /*
1565  - pregcomp - compile a regular expression into internal code
1566  *
1567  * We can't allocate space until we know how big the compiled form will be,
1568  * but we can't compile it (and thus know how big it is) until we've got a
1569  * place to put the code.  So we cheat:  we compile it twice, once with code
1570  * generation turned off and size counting turned on, and once "for real".
1571  * This also means that we don't allocate space until we are sure that the
1572  * thing really will compile successfully, and we never have to move the
1573  * code and thus invalidate pointers into it.  (Note that it has to be in
1574  * one piece because free() must be able to free it all.) [NB: not true in perl]
1575  *
1576  * Beware that the optimization-preparation code in here knows about some
1577  * of the structure of the compiled regexp.  [I'll say.]
      *
      * exp and xend delimit the pattern text: both passes start with
      * RExC_parse = exp and RExC_end = xend, and a copy of that range is
      * kept as the precomp string of the result.
      * pm supplies the pattern flags (op_pmflags, op_pmdynflags); its
      * op_pmflags is overwritten with RExC_flags16 after the second pass.
      *
      * Returns the newly allocated regexp (refcnt set to 1), or NULL when
      * reg() returns NULL in either pass.
1578  */
1579 regexp *
1580 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1581 {
1582     register regexp *r;
1583     regnode *scan;
1584     regnode *first;
1585     I32 flags;
1586     I32 minlen = 0;
1587     I32 sawplus = 0;
1588     I32 sawopen = 0;
1589     scan_data_t data;
1590     RExC_state_t RExC_state;
1591     RExC_state_t *pRExC_state = &RExC_state;
1592
1593     if (exp == NULL)
1594         FAIL("NULL regexp argument");
1595
1596     /* XXXX This looks very suspicious... */
         /* Note the asymmetry: the UTF8 case ORs RF_utf8 into PL_reg_flags,
          * while the other case resets PL_reg_flags to 0 altogether. */
1597     if (pm->op_pmdynflags & PMdf_UTF8) {
1598         PL_reg_flags |= RF_utf8;
1599     }
1600     else
1601         PL_reg_flags = 0;
1602
         /* Private copy of the pattern text; it later becomes r->precomp. */
1603     RExC_precomp = savepvn(exp, xend - exp);
1604     DEBUG_r(if (!PL_colorset) reginitcolors());
1605     DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1606                       PL_colors[4],PL_colors[5],PL_colors[0],
1607                       (int)(xend - exp), RExC_precomp, PL_colors[1]));
1608     RExC_flags16 = pm->op_pmflags;
1609     RExC_sawback = 0;
1610
1611     RExC_seen = 0;
         /* Starts at -1 when the pattern text begins with '^', else at 0. */
1612     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1613     RExC_seen_evals = 0;
1614     RExC_extralen = 0;
1615
1616     /* First pass: determine size, legality. */
1617     RExC_parse = exp;
1618     RExC_end = xend;
1619     RExC_naughty = 0;
1620     RExC_npar = 1;
1621     RExC_size = 0L;
         /* No program buffer exists yet, so emit into the dummy node. */
1622     RExC_emit = &PL_regdummy;
1623     RExC_whilem_seen = 0;
1624 #if 0 /* REGC() is (currently) a NOP at the first pass.
1625        * Clever compilers notice this and complain. --jhi */
1626     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1627 #endif
         /* If the sizing pass fails, release the pattern copy and give up. */
1628     if (reg(pRExC_state, 0, &flags) == NULL) {
1629         Safefree(RExC_precomp);
1630         RExC_precomp = Nullch;
1631         return(NULL);
1632     }
1633     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1634
1635     /* Small enough for pointer-storage convention?
1636        If extralen==0, this means that we will not need long jumps. */
1637     if (RExC_size >= 0x10000L && RExC_extralen)
1638         RExC_size += RExC_extralen;
1639     else
1640         RExC_extralen = 0;
         /* The recorded WHILEM count is capped at 15. */
1641     if (RExC_whilem_seen > 15)
1642         RExC_whilem_seen = 15;
1643
1644     /* Allocate space and initialize. */
         /* One block holds the regexp header plus RExC_size regnodes. */
1645     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1646          char, regexp);
1647     if (r == NULL)
1648         FAIL("Regexp out of space");
1649
1650 #ifdef DEBUGGING
1651     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1652     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1653 #endif
1654     r->refcnt = 1;
1655     r->prelen = xend - exp;
1656     r->precomp = RExC_precomp;
1657     r->subbeg = NULL;
1658     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1659     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1660
1661     r->substrs = 0;                     /* Useful during FAIL. */
1662     r->startp = 0;                      /* Useful during FAIL. */
1663     r->endp = 0;                        /* Useful during FAIL. */
1664
1665     RExC_rx = r;
1666
1667     /* Second pass: emit code. */
1668     RExC_parse = exp;
1669     RExC_end = xend;
1670     RExC_naughty = 0;
1671     RExC_npar = 1;
1672     RExC_emit = r->program;
1673     /* Store the count of eval-groups for security checks: */
         /* (clamped to U16_MAX, kept in the next_off field of the first node) */
1674     RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1675     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1676     r->data = 0;
         /* NOTE(review): neither r nor the precomp copy is released on this
          * return path - confirm whether reg() can really return NULL here. */
1677     if (reg(pRExC_state, 0, &flags) == NULL)
1678         return(NULL);
1679
1680     /* Dig out information for optimizations. */
1681     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
         /* Hand the flags held in RExC_flags16 back to the PMOP. */
1682     pm->op_pmflags = RExC_flags16;
1683     if (UTF)
1684         r->reganch |= ROPT_UTF8;
1685     r->regstclass = NULL;
1686     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1687         r->reganch |= ROPT_NAUGHTY;
1688     scan = r->program + 1;              /* First BRANCH. */
1689
1690     /* XXXX To minimize changes to RE engine we always allocate
1691        3-units-long substrs field. */
1692     Newz(1004, r->substrs, 1, struct reg_substr_data);
1693
1694     StructCopy(&zero_scan_data, &data, scan_data_t);
1695     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1696     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1697         I32 fake;
1698         STRLEN longest_float_length, longest_fixed_length;
1699         struct regnode_charclass_class ch_class;
1700         int stclass_flag;
1701         I32 last_close = 0;
1702
1703         first = scan;
1704         /* Skip introductions and multiplicators >= 1. */
             /* Side effects of the condition/body: sawopen is set when an OPEN
              * is stepped over, sawplus when a PLUS is.  Every node except
              * PLUS also has its argument area (regarglen) skipped before
              * moving on to the operand. */
1705         while ((OP(first) == OPEN && (sawopen = 1)) ||
1706                /* An OR of *one* alternative - should not happen now. */
1707             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1708             (OP(first) == PLUS) ||
1709             (OP(first) == MINMOD) ||
1710                /* An {n,m} with n>0 */
1711             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1712                 if (OP(first) == PLUS)
1713                     sawplus = 1;
1714                 else
1715                     first += regarglen[(U8)OP(first)];
1716                 first = NEXTOPER(first);
1717         }
1718
1719         /* Starting-point info. */
             /* Each anchor-like node found below records a flag in r->reganch,
              * steps past the node and re-enters at "again". */
1720       again:
1721         if (PL_regkind[(U8)OP(first)] == EXACT) {
1722             if (OP(first) == EXACT);    /* Empty, get anchored substr later. */
1723             else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
1724                      && !UTF)
1725                 r->regstclass = first;
1726         }
1727         else if (strchr((char*)PL_simple,OP(first)))
1728             r->regstclass = first;
1729         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1730                  PL_regkind[(U8)OP(first)] == NBOUND)
1731             r->regstclass = first;
1732         else if (PL_regkind[(U8)OP(first)] == BOL) {
1733             r->reganch |= (OP(first) == MBOL
1734                            ? ROPT_ANCH_MBOL
1735                            : (OP(first) == SBOL
1736                               ? ROPT_ANCH_SBOL
1737                               : ROPT_ANCH_BOL));
1738             first = NEXTOPER(first);
1739             goto again;
1740         }
1741         else if (OP(first) == GPOS) {
1742             r->reganch |= ROPT_ANCH_GPOS;
1743             first = NEXTOPER(first);
1744             goto again;
1745         }
1746         else if ((OP(first) == STAR &&
1747             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1748             !(r->reganch & ROPT_ANCH) )
1749         {
1750             /* turn .* into ^.* with an implied $*=1 */
1751             int type = OP(NEXTOPER(first));
1752
                 /* REG_ANY/ANYUTF8 give a multi-line anchor, the remaining
                  * nodes of the REG_ANY kind a single-line one. */
1753             if (type == REG_ANY || type == ANYUTF8)
1754                 type = ROPT_ANCH_MBOL;
1755             else
1756                 type = ROPT_ANCH_SBOL;
1757
1758             r->reganch |= type | ROPT_IMPLICIT;
1759             first = NEXTOPER(first);
1760             goto again;
1761         }
1762         if (sawplus && (!sawopen || !RExC_sawback) 
1763             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1764             /* x+ must match at the 1st pos of run of x's */
1765             r->reganch |= ROPT_SKIP;
1766
1767         /* Scan is after the zeroth branch, first is atomic matcher. */
1768         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", 
1769                               (IV)(first - scan + 1)));
1770         /*
1771         * If there's something expensive in the r.e., find the
1772         * longest literal string that must appear and make it the
1773         * regmust.  Resolve ties in favor of later strings, since
1774         * the regstart check works with the beginning of the r.e.
1775         * and avoiding duplication strengthens checking.  Not a
1776         * strong reason, but sufficient in the absence of others.
1777         * [Now we resolve ties in favor of the earlier string if
1778         * it happens that c_offset_min has been invalidated, since the
1779         * earlier string may buy us something the later one won't.]
1780         */
1781         minlen = 0;
1782
1783         data.longest_fixed = newSVpvn("",0);
1784         data.longest_float = newSVpvn("",0);
1785         data.last_found = newSVpvn("",0);
1786         data.longest = &(data.longest_fixed);
             /* Rewind: study_chunk() is started from scan again, not from the
              * atomic matcher located above. */
1787         first = scan;
             /* Only when the inspection above found no start class is
              * study_chunk() asked to compute one (into ch_class). */
1788         if (!r->regstclass) {
1789             cl_init(pRExC_state, &ch_class);
1790             data.start_class = &ch_class;
1791             stclass_flag = SCF_DO_STCLASS_AND;
1792         } else                          /* XXXX Check for BOUND? */
1793             stclass_flag = 0;
1794         data.last_closep = &last_close;
1795
1796         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1797                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1798         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1799              && data.last_start_min == 0 && data.last_end > 0 
1800              && !RExC_seen_zerolen
1801              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1802             r->reganch |= ROPT_CHECK_ALL;
1803         scan_commit(pRExC_state, &data);
1804         SvREFCNT_dec(data.last_found);
1805
             /* Floating substring: kept when it is non-empty, or when it is
              * flagged as sitting before an end-of-line (the MEOL variant
              * only counts if PMf_MULTILINE is set). */
1806         longest_float_length = CHR_SVLEN(data.longest_float);
1807         if (longest_float_length
1808             || (data.flags & SF_FL_BEFORE_EOL
1809                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1810                     || (RExC_flags16 & PMf_MULTILINE)))) {
1811             int t;
1812
                 /* Same offset and same length as the fixed substring:
                  * the float adds nothing, so drop it. */
1813             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1814                 && data.offset_fixed == data.offset_float_min
1815                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1816                     goto remove_float;          /* As in (a)+. */
1817
1818             r->float_substr = data.longest_float;
1819             r->float_min_offset = data.offset_float_min;
1820             r->float_max_offset = data.offset_float_max;
1821             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1822                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1823                            || (RExC_flags16 & PMf_MULTILINE)));
1824             fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
1825         }
1826         else {
1827           remove_float:
1828             r->float_substr = Nullsv;
1829             SvREFCNT_dec(data.longest_float);
1830             longest_float_length = 0;
1831         }
1832
             /* Fixed (anchored) substring: same keep/discard rule as for the
              * floating one, using the SF_FIX_* flags. */
1833         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1834         if (longest_fixed_length
1835             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1836                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1837                     || (RExC_flags16 & PMf_MULTILINE)))) {
1838             int t;
1839
1840             r->anchored_substr = data.longest_fixed;
1841             r->anchored_offset = data.offset_fixed;
1842             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1843                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
1844                      || (RExC_flags16 & PMf_MULTILINE)));
1845             fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
1846         }
1847         else {
1848             r->anchored_substr = Nullsv;
1849             SvREFCNT_dec(data.longest_fixed);
1850             longest_fixed_length = 0;
1851         }
             /* A start class that is just REG_ANY, ANYUTF8, SANYUTF8 or SANY
              * is discarded. */
1852         if (r->regstclass 
1853             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
1854                 || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
1855             r->regstclass = NULL;
             /* Publish the class computed by study_chunk() as a synthetic
              * start class: a heap copy is registered through add_data()
              * under type code 'f' and r->regstclass is pointed at it.
              * Skipped when there is an anchored substring at offset 0. */
1856         if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
1857             && !(data.start_class->flags & ANYOF_EOS)
1858             && !cl_is_anything(data.start_class)) {
1859             SV *sv;
1860             I32 n = add_data(pRExC_state, 1, "f");
1861
1862             New(1006, RExC_rx->data->data[n], 1, 
1863                 struct regnode_charclass_class);
1864             StructCopy(data.start_class,
1865                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
1866                        struct regnode_charclass_class);
1867             r->regstclass = (regnode*)RExC_rx->data->data[n];
1868             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
1869             DEBUG_r((sv = sv_newmortal(),
1870                      regprop(sv, (regnode*)data.start_class),
1871                      PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
1872                                    SvPVX(sv))));
1873         }
1874
1875         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
             /* (the fixed substring wins only when it is strictly longer) */
1876         if (longest_fixed_length > longest_float_length) {
1877             r->check_substr = r->anchored_substr;
1878             r->check_offset_min = r->check_offset_max = r->anchored_offset;
1879             if (r->reganch & ROPT_ANCH_SINGLE)
1880                 r->reganch |= ROPT_NOSCAN;
1881         }
1882         else {
1883             r->check_substr = r->float_substr;
1884             r->check_offset_min = data.offset_float_min;
1885             r->check_offset_max = data.offset_float_max;
1886         }
1887         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
1888            This should be changed ASAP!  */
1889         if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
1890             r->reganch |= RE_USE_INTUIT;
1891             if (SvTAIL(r->check_substr))
1892                 r->reganch |= RE_INTUIT_TAIL;
1893         }
1894     }
1895     else {
1896         /* Several toplevels. Best we can is to set minlen. */
             /* No substrings are collected here (SCF_DO_SUBSTR is not passed);
              * only minlen and, possibly, a synthetic start class result. */
1897         I32 fake;
1898         struct regnode_charclass_class ch_class;
1899         I32 last_close = 0;
1900         
1901         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
1902         scan = r->program + 1;
1903         cl_init(pRExC_state, &ch_class);
1904         data.start_class = &ch_class;
1905         data.last_closep = &last_close;
1906         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
1907         r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
             /* Same publication of the synthetic start class as in the
              * single-branch case. */
1908         if (!(data.start_class->flags & ANYOF_EOS)
1909             && !cl_is_anything(data.start_class)) {
1910             SV *sv;
1911             I32 n = add_data(pRExC_state, 1, "f");
1912
1913             New(1006, RExC_rx->data->data[n], 1, 
1914                 struct regnode_charclass_class);
1915             StructCopy(data.start_class,
1916                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
1917                        struct regnode_charclass_class);
1918             r->regstclass = (regnode*)RExC_rx->data->data[n];
1919             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
1920             DEBUG_r((sv = sv_newmortal(),
1921                      regprop(sv, (regnode*)data.start_class),
1922                      PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
1923                                    SvPVX(sv))));
1924         }
1925     }
1926
         /* Copy what the parser saw into the flags of the finished regexp. */
1927     r->minlen = minlen;
1928     if (RExC_seen & REG_SEEN_GPOS) 
1929         r->reganch |= ROPT_GPOS_SEEN;
1930     if (RExC_seen & REG_SEEN_LOOKBEHIND)
1931         r->reganch |= ROPT_LOOKBEHIND_SEEN;
1932     if (RExC_seen & REG_SEEN_EVAL)
1933         r->reganch |= ROPT_EVAL_SEEN;
         /* One startp/endp slot per RExC_npar, allocated with Newz. */
1934     Newz(1002, r->startp, RExC_npar, I32);
1935     Newz(1002, r->endp, RExC_npar, I32);
1936     PL_regdata = r->data; /* for regprop() ANYOFUTF8 */
1937     DEBUG_r(regdump(r));
1938     return(r);
1939 }
1940
1941 /*
1942  - reg - regular expression, i.e. main body or parenthesized thing
1943  *
1944  * Caller must absorb opening parenthesis.
1945  *
1946  * Combining parenthesis handling with the base level of regular expression
1947  * is a trifle forced, but the need to tie the tails of the branches to what
1948  * follows makes it hard to avoid.
1949  */
1950 STATIC regnode *
1951 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
1952     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
1953 {
1954     register regnode *ret;              /* Will be the head of the group. */
1955     register regnode *br;
1956     register regnode *lastbr;
1957     register regnode *ender = 0;
1958     register I32 parno = 0;
1959     I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
1960     char *oregcomp_parse = RExC_parse;
1961     char c;
1962
1963     *flagp = 0;                         /* Tentatively. */
1964
1965     /* Make an OPEN node, if parenthesized. */
1966     if (paren) {
1967         if (*RExC_parse == '?') {
1968             U16 posflags = 0, negflags = 0;
1969             U16 *flagsp = &posflags;
1970             int logical = 0;
1971             char *seqstart = RExC_parse;
1972
1973             RExC_parse++;
1974             paren = *RExC_parse++;
1975             ret = NULL;                 /* For look-ahead/behind. */
1976             switch (paren) {
1977             case '<':
1978                 RExC_seen |= REG_SEEN_LOOKBEHIND;
1979                 if (*RExC_parse == '!') 
1980                     paren = ',';
1981                 if (*RExC_parse != '=' && *RExC_parse != '!') 
1982                     goto unknown;
1983                 RExC_parse++;
1984             case '=':
1985             case '!':
1986                 RExC_seen_zerolen++;
1987             case ':':
1988             case '>':
1989                 break;
1990             case '$':
1991             case '@':
1992                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
1993                 break;
1994             case '#':
1995                 while (*RExC_parse && *RExC_parse != ')')
1996                     RExC_parse++;
1997                 if (*RExC_parse != ')')
1998                     FAIL("Sequence (?#... not terminated");
1999                 nextchar(pRExC_state);
2000                 *flagp = TRYAGAIN;
2001                 return NULL;
2002             case 'p':
2003                 if (SIZE_ONLY)
2004                     vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})");
2005                 /* FALL THROUGH*/
2006             case '?':
2007                 logical = 1;
2008                 paren = *RExC_parse++;
2009                 /* FALL THROUGH */
2010             case '{':
2011             {
2012                 I32 count = 1, n = 0;
2013                 char c;
2014                 char *s = RExC_parse;
2015                 SV *sv;
2016                 OP_4tree *sop, *rop;
2017
2018                 RExC_seen_zerolen++;
2019                 RExC_seen |= REG_SEEN_EVAL;
2020                 while (count && (c = *RExC_parse)) {
2021                     if (c == '\\' && RExC_parse[1])
2022                         RExC_parse++;
2023                     else if (c == '{') 
2024                         count++;
2025                     else if (c == '}') 
2026                         count--;
2027                     RExC_parse++;
2028                 }
2029                 if (*RExC_parse != ')')
2030                 {
2031                     RExC_parse = s;                 
2032                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2033                 }
2034                 if (!SIZE_ONLY) {
2035                     AV *av;
2036                     
2037                     if (RExC_parse - 1 - s) 
2038                         sv = newSVpvn(s, RExC_parse - 1 - s);
2039                     else
2040                         sv = newSVpvn("", 0);
2041
2042                     ENTER;
2043                     Perl_save_re_context(aTHX);
2044                     rop = sv_compile_2op(sv, &sop, "re", &av);
2045                     LEAVE;
2046
2047                     n = add_data(pRExC_state, 3, "nop");
2048                     RExC_rx->data->data[n] = (void*)rop;
2049                     RExC_rx->data->data[n+1] = (void*)sop;
2050                     RExC_rx->data->data[n+2] = (void*)av;
2051                     SvREFCNT_dec(sv);
2052                 }
2053                 else {                                          /* First pass */
2054                     if (PL_reginterp_cnt < ++RExC_seen_evals
2055                         && PL_curcop != &PL_compiling)
2056                         /* No compiled RE interpolated, has runtime
2057                            components ===> unsafe.  */
2058                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2059                     if (PL_tainted)
2060                         FAIL("Eval-group in insecure regular expression");
2061                 }
2062                 
2063                 nextchar(pRExC_state);
2064                 if (logical) {
2065                     ret = reg_node(pRExC_state, LOGICAL);
2066                     if (!SIZE_ONLY)
2067                         ret->flags = 2;
2068                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2069                     return ret;
2070                 }
2071                 return reganode(pRExC_state, EVAL, n);
2072             }
2073             case '(':
2074             {
2075                 if (RExC_parse[0] == '?') {
2076                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!' 
2077                         || RExC_parse[1] == '<' 
2078                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2079                         I32 flag;
2080                         
2081                         ret = reg_node(pRExC_state, LOGICAL);
2082                         if (!SIZE_ONLY)
2083                             ret->flags = 1;
2084                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2085                         goto insert_if;
2086                     } 
2087                 }
2088                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2089                     parno = atoi(RExC_parse++);
2090
2091                     while (isDIGIT(*RExC_parse))
2092                         RExC_parse++;
2093                     ret = reganode(pRExC_state, GROUPP, parno);
2094                     if ((c = *nextchar(pRExC_state)) != ')')
2095                         vFAIL("Switch condition not recognized");
2096                   insert_if:
2097                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2098                     br = regbranch(pRExC_state, &flags, 1);
2099                     if (br == NULL)
2100                         br = reganode(pRExC_state, LONGJMP, 0);
2101                     else
2102                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2103                     c = *nextchar(pRExC_state);
2104                     if (flags&HASWIDTH)
2105                         *flagp |= HASWIDTH;
2106                     if (c == '|') {
2107                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2108                         regbranch(pRExC_state, &flags, 1);
2109                         regtail(pRExC_state, ret, lastbr);
2110                         if (flags&HASWIDTH)
2111                             *flagp |= HASWIDTH;
2112                         c = *nextchar(pRExC_state);
2113                     }
2114                     else
2115                         lastbr = NULL;
2116                     if (c != ')')
2117                         vFAIL("Switch (?(condition)... contains too many branches");
2118                     ender = reg_node(pRExC_state, TAIL);
2119                     regtail(pRExC_state, br, ender);
2120                     if (lastbr) {
2121                         regtail(pRExC_state, lastbr, ender);
2122                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2123                     }
2124                     else
2125                         regtail(pRExC_state, ret, ender);
2126                     return ret;
2127                 }
2128                 else {
2129                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2130                 }
2131             }
2132             case 0:
2133                 RExC_parse--; /* for vFAIL to print correctly */
2134                 vFAIL("Sequence (? incomplete");
2135                 break;
2136             default:
2137                 --RExC_parse;
2138               parse_flags:
2139                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2140                     if (*RExC_parse != 'o')
2141                         pmflag(flagsp, *RExC_parse);
2142                     ++RExC_parse;
2143                 }
2144                 if (*RExC_parse == '-') {
2145                     flagsp = &negflags;
2146                     ++RExC_parse;
2147                     goto parse_flags;
2148                 }
2149                 RExC_flags16 |= posflags;
2150                 RExC_flags16 &= ~negflags;
2151                 if (*RExC_parse == ':') {
2152                     RExC_parse++;
2153                     paren = ':';
2154                     break;
2155                 }               
2156               unknown:
2157                 if (*RExC_parse != ')') {
2158                     RExC_parse++;
2159                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2160                 }
2161                 nextchar(pRExC_state);
2162                 *flagp = TRYAGAIN;
2163                 return NULL;
2164             }
2165         }
2166         else {
2167             parno = RExC_npar;
2168             RExC_npar++;
2169             ret = reganode(pRExC_state, OPEN, parno);
2170             open = 1;
2171         }
2172     }
2173     else
2174         ret = NULL;
2175
2176     /* Pick up the branches, linking them together. */
2177     br = regbranch(pRExC_state, &flags, 1);
2178     if (br == NULL)
2179         return(NULL);
2180     if (*RExC_parse == '|') {
2181         if (!SIZE_ONLY && RExC_extralen) {
2182             reginsert(pRExC_state, BRANCHJ, br);
2183         }
2184         else
2185             reginsert(pRExC_state, BRANCH, br);
2186         have_branch = 1;
2187         if (SIZE_ONLY)
2188             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2189     }
2190     else if (paren == ':') {
2191         *flagp |= flags&SIMPLE;
2192     }
2193     if (open) {                         /* Starts with OPEN. */
2194         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2195     }
2196     else if (paren != '?')              /* Not Conditional */
2197         ret = br;
2198     if (flags&HASWIDTH)
2199         *flagp |= HASWIDTH;
2200     *flagp |= flags&SPSTART;
2201     lastbr = br;
2202     while (*RExC_parse == '|') {
2203         if (!SIZE_ONLY && RExC_extralen) {
2204             ender = reganode(pRExC_state, LONGJMP,0);
2205             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2206         }
2207         if (SIZE_ONLY)
2208             RExC_extralen += 2;         /* Account for LONGJMP. */
2209         nextchar(pRExC_state);
2210         br = regbranch(pRExC_state, &flags, 0);
2211         if (br == NULL)
2212             return(NULL);
2213         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2214         lastbr = br;
2215         if (flags&HASWIDTH)
2216             *flagp |= HASWIDTH;
2217         *flagp |= flags&SPSTART;
2218     }
2219
2220     if (have_branch || paren != ':') {
2221         /* Make a closing node, and hook it on the end. */
2222         switch (paren) {
2223         case ':':
2224             ender = reg_node(pRExC_state, TAIL);
2225             break;
2226         case 1:
2227             ender = reganode(pRExC_state, CLOSE, parno);
2228             break;
2229         case '<':
2230         case ',':
2231         case '=':
2232         case '!':
2233             *flagp &= ~HASWIDTH;
2234             /* FALL THROUGH */
2235         case '>':
2236             ender = reg_node(pRExC_state, SUCCEED);
2237             break;
2238         case 0:
2239             ender = reg_node(pRExC_state, END);
2240             break;
2241         }
2242         regtail(pRExC_state, lastbr, ender);
2243
2244         if (have_branch) {
2245             /* Hook the tails of the branches to the closing node. */
2246             for (br = ret; br != NULL; br = regnext(br)) {
2247                 regoptail(pRExC_state, br, ender);
2248             }
2249         }
2250     }
2251
2252     {
2253         char *p;
2254         static char parens[] = "=!<,>";
2255
2256         if (paren && (p = strchr(parens, paren))) {
2257             int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2258             int flag = (p - parens) > 1;
2259
2260             if (paren == '>')
2261                 node = SUSPEND, flag = 0;
2262             reginsert(pRExC_state, node,ret);
2263             ret->flags = flag;
2264             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2265         }
2266     }
2267
2268     /* Check for proper termination. */
2269     if (paren) {
2270         RExC_flags16 = oregflags;
2271         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2272             RExC_parse = oregcomp_parse;
2273             vFAIL("Unmatched (");
2274         }
2275     }
2276     else if (!paren && RExC_parse < RExC_end) {
2277         if (*RExC_parse == ')') {
2278             RExC_parse++;
2279             vFAIL("Unmatched )");
2280         }
2281         else
2282             FAIL("Junk on end of regexp");      /* "Can't happen". */
2283         /* NOTREACHED */
2284     }
2285
2286     return(ret);
2287 }
2288
2289 /*
2290  - regbranch - one alternative of an | operator
2291  *
2292  * Implements the concatenation operator.
2293  */
2294 STATIC regnode *
2295 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2296 {
2297     register regnode *ret;
2298     register regnode *chain = NULL;
2299     register regnode *latest;
2300     I32 flags = 0, c = 0;
2301
2302     if (first) 
2303         ret = NULL;
2304     else {
2305         if (!SIZE_ONLY && RExC_extralen) 
2306             ret = reganode(pRExC_state, BRANCHJ,0);
2307         else
2308             ret = reg_node(pRExC_state, BRANCH);
2309     }
2310         
2311     if (!first && SIZE_ONLY) 
2312         RExC_extralen += 1;                     /* BRANCHJ */
2313     
2314     *flagp = WORST;                     /* Tentatively. */
2315
2316     RExC_parse--;
2317     nextchar(pRExC_state);
2318     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2319         flags &= ~TRYAGAIN;
2320         latest = regpiece(pRExC_state, &flags);
2321         if (latest == NULL) {
2322             if (flags & TRYAGAIN)
2323                 continue;
2324             return(NULL);
2325         }
2326         else if (ret == NULL)
2327             ret = latest;
2328         *flagp |= flags&HASWIDTH;
2329         if (chain == NULL)      /* First piece. */
2330             *flagp |= flags&SPSTART;
2331         else {
2332             RExC_naughty++;
2333             regtail(pRExC_state, chain, latest);
2334         }
2335         chain = latest;
2336         c++;
2337     }
2338     if (chain == NULL) {        /* Loop ran zero times. */
2339         chain = reg_node(pRExC_state, NOTHING);
2340         if (ret == NULL)
2341             ret = chain;
2342     }
2343     if (c == 1) {
2344         *flagp |= flags&SIMPLE;
2345     }
2346
2347     return(ret);
2348 }
2349
2350 /*
2351  - regpiece - something followed by possible [*+?]
2352  *
2353  * Note that the branching code sequences used for ? and the general cases
2354  * of * and + are somewhat optimized:  they use the same NOTHING node as
2355  * both the endmarker for their branch list and the body of the last branch.
2356  * It might seem that this node could be dispensed with entirely, but the
2357  * endmarker role is not redundant.
      *
      * Parses one atom with regatom() and then an optional quantifier:
      * a {n}, {n,} or {n,m} group, or one of the characters accepted by
      * ISMULT1().  A '?' directly after the quantifier puts a MINMOD node
      * in front of the quantified construct.
      *
      * flagp receives: TRYAGAIN (OR-ed in) when regatom() returned NULL
      * with TRYAGAIN set; the atom's own flags when no quantifier follows;
      * otherwise a combination of WORST, SPSTART and HASWIDTH computed
      * below.
      *
      * Returns the first node of the piece, or NULL when regatom() returned
      * NULL.
2358  */
2359 STATIC regnode *
2360 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2361 {
2362     register regnode *ret;
2363     register char op;
2364     register char *next;
2365     I32 flags;
         /* Start of this piece; only used for the text of the warning at
          * nest_check. */
2366     char *origparse = RExC_parse;
2367     char *maxpos;
2368     I32 min;
         /* Upper bound defaults to "unbounded"; the '*' and '+' paths below
          * never change it. */
2369     I32 max = REG_INFTY;
2370
2371     ret = regatom(pRExC_state, &flags);
2372     if (ret == NULL) {
             /* Pass a retry request from regatom() up to the caller. */
2373         if (flags & TRYAGAIN)
2374             *flagp |= TRYAGAIN;
2375         return(NULL);
2376     }
2377
2378     op = *RExC_parse;
2379
2380     if (op == '{' && regcurly(RExC_parse)) {
             /* Scan digits and at most one ','; maxpos remembers where the
              * comma was (a second comma ends the scan). */
2381         next = RExC_parse + 1;
2382         maxpos = Nullch;
2383         while (isDIGIT(*next) || *next == ',') {
2384             if (*next == ',') {
2385                 if (maxpos)
2386                     break;
2387                 else
2388                     maxpos = next;
2389             }
2390             next++;
2391         }
2392         if (*next == '}') {             /* got one */
                 /* No comma seen: point maxpos at the '}' so that the test
                  * below takes the branch that re-reads the same digits. */
2393             if (!maxpos)
2394                 maxpos = next;
2395             RExC_parse++;
2396             min = atoi(RExC_parse);
                 /* With a comma, max is read from the text after it; without
                  * one ({n}), max is read from the same digits as min. */
2397             if (*maxpos == ',')
2398                 maxpos++;
2399             else
2400                 maxpos = RExC_parse;
2401             max = atoi(maxpos);
                 /* atoi() gave 0 but the text is not a literal '0': there
                  * were no digits after the comma ({n,}). */
2402             if (!max && *maxpos != '0')
2403                 max = REG_INFTY;                /* meaning "infinity" */
2404             else if (max >= REG_INFTY)
2405                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
                 /* NOTE(review): only max is compared with REG_INFTY here;
                  * min is stored below without a range check -- confirm that
                  * ARG1_SET can hold any value atoi() may return. */
2406             RExC_parse = next;
2407             nextchar(pRExC_state);
2408
                 /* Shared emitter.  Besides falling in from the {n,m} parse
                  * above, this label is the target of the gotos in the
                  * '*', '+' and '?' branches further down, which set min
                  * (and for '?', max) first. */
2409         do_curly:
2410             if ((flags&SIMPLE)) {
                     /* SIMPLE atom: a single CURLY node is inserted in front
                      * of it. */
2411                 RExC_naughty += 2 + RExC_naughty / 2;
2412                 reginsert(pRExC_state, CURLY, ret);
2413             }
2414             else {
                     /* Otherwise the atom is wrapped: CURLYX is inserted in
                      * front, WHILEM is appended after it, and a NOTHING
                      * node is appended after that. */
2415                 regnode *w = reg_node(pRExC_state, WHILEM);
2416
2417                 w->flags = 0;
2418                 regtail(pRExC_state, ret, w);
                     /* Outside the SIZE_ONLY pass, with RExC_extralen set,
                      * NOTHING and LONGJMP nodes are inserted as well. */
2419                 if (!SIZE_ONLY && RExC_extralen) {
2420                     reginsert(pRExC_state, LONGJMP,ret);
2421                     reginsert(pRExC_state, NOTHING,ret);
2422                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2423                 }
2424                 reginsert(pRExC_state, CURLYX,ret);
2425                 if (!SIZE_ONLY && RExC_extralen)
2426                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2427                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
                     /* In the SIZE_ONLY pass, count this WHILEM and add 3 to
                      * RExC_extralen. */
2428                 if (SIZE_ONLY)
2429                     RExC_whilem_seen++, RExC_extralen += 3;
2430                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2431             }
2432             ret->flags = 0;
2433
                 /* Flag bookkeeping: a positive min resets *flagp to WORST,
                  * then a positive max ORs HASWIDTH in. */
2434             if (min > 0)
2435                 *flagp = WORST;
2436             if (max > 0)
2437                 *flagp |= HASWIDTH;
                 /* max == 0 is exempt from the n > m check. */
2438             if (max && max < min)
2439                 vFAIL("Can't do {n,m} with n > m");
                 /* The bounds are only stored outside the SIZE_ONLY pass. */
2440             if (!SIZE_ONLY) {
2441                 ARG1_SET(ret, min);
2442                 ARG2_SET(ret, max);
2443             }
2444
2445             goto nest_check;
2446         }
2447     }
2448
         /* No quantifier character follows: hand the atom's flags to the
          * caller unchanged. */
2449     if (!ISMULT1(op)) {
2450         *flagp = flags;
2451         return(ret);
2452     }
2453
2454 #if 0                           /* Now runtime fix should be reliable. */
2455
2456     /* if this is reinstated, don't forget to put this back into perldiag:
2457
2458             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2459
2460            (F) The part of the regexp subject to either the * or + quantifier
2461            could match an empty string. The {#} shows in the regular
2462            expression about where the problem was discovered.
2463
2464     */
2465
2466     if (!(flags&HASWIDTH) && op != '?')
2467       vFAIL("Regexp *+ operand could be empty");
2468 #endif 
2469
         /* Step past the quantifier character. */
2470     nextchar(pRExC_state);
2471
         /* SPSTART is included for every quantifier except '+'. */
2472     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2473
         /* SIMPLE atoms under '*' or '+' get a dedicated STAR or PLUS node;
          * every other case sets min (and max for '?') and jumps back into
          * the do_curly emitter above. */
2474     if (op == '*' && (flags&SIMPLE)) {
2475         reginsert(pRExC_state, STAR, ret);
2476         ret->flags = 0;
2477         RExC_naughty += 4;
2478     }
2479     else if (op == '*') {
2480         min = 0;
2481         goto do_curly;
2482     }
2483     else if (op == '+' && (flags&SIMPLE)) {
2484         reginsert(pRExC_state, PLUS, ret);
2485         ret->flags = 0;
2486         RExC_naughty += 3;
2487     }
2488     else if (op == '+') {
2489         min = 1;
2490         goto do_curly;
2491     }
2492     else if (op == '?') {
2493         min = 0; max = 1;
2494         goto do_curly;
2495     }
         /* Common tail for every quantified piece. */
2496   nest_check:
         /* Outside the SIZE_ONLY pass, warn when the atom lacks HASWIDTH
          * and the upper bound exceeds REG_INFTY/3. */
2497     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2498         vWARN3(RExC_parse,
2499                "%.*s matches null string many times",
2500                RExC_parse - origparse,
2501                origparse);
2502     }
2503
         /* A '?' right after the quantifier: insert MINMOD in front and
          * link it to the node that now follows it. */
2504     if (*RExC_parse == '?') {
2505         nextchar(pRExC_state);
2506         reginsert(pRExC_state, MINMOD, ret);
2507         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2508     }
         /* Anything ISMULT2() still accepts at this point is reported as a
          * nested quantifier. */
2509     if (ISMULT2(RExC_parse)) {
2510         RExC_parse++;
2511         vFAIL("Nested quantifiers");
2512     }
2513
2514     return(ret);
2515 }
2516
2517 /*
2518  - regatom - the lowest level
2519  *
2520  * Optimization:  gobbles an entire sequence of ordinary characters so that
2521  * it can turn them into a single node, which is smaller to store and
2522  * faster to run.  Backslashed characters are exceptions, each becoming a
2523  * separate node; the code is simpler that way and it's not worth fixing.
2524  *
2525  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2526 STATIC regnode *
2527 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2528 {
2529     register regnode *ret = 0;
2530     I32 flags;
2531
2532     *flagp = WORST;             /* Tentatively. */
2533
2534 tryagain:
2535     switch (*RExC_parse) {
2536     case '^':
2537         RExC_seen_zerolen++;
2538         nextchar(pRExC_state);
2539         if (RExC_flags16 & PMf_MULTILINE)
2540             ret = reg_node(pRExC_state, MBOL);
2541         else if (RExC_flags16 & PMf_SINGLELINE)
2542             ret = reg_node(pRExC_state, SBOL);
2543         else
2544             ret = reg_node(pRExC_state, BOL);
2545         break;
2546     case '$':
2547         nextchar(pRExC_state);
2548         if (*RExC_parse) 
2549             RExC_seen_zerolen++;
2550         if (RExC_flags16 & PMf_MULTILINE)
2551             ret = reg_node(pRExC_state, MEOL);
2552         else if (RExC_flags16 & PMf_SINGLELINE)
2553             ret = reg_node(pRExC_state, SEOL);
2554         else
2555             ret = reg_node(pRExC_state, EOL);
2556         break;
2557     case '.':
2558         nextchar(pRExC_state);
2559         if (UTF) {
2560             if (RExC_flags16 & PMf_SINGLELINE)
2561                 ret = reg_node(pRExC_state, SANYUTF8);
2562             else
2563                 ret = reg_node(pRExC_state, ANYUTF8);
2564             *flagp |= HASWIDTH;
2565         }
2566         else {
2567             if (RExC_flags16 & PMf_SINGLELINE)
2568                 ret = reg_node(pRExC_state, SANY);
2569             else
2570                 ret = reg_node(pRExC_state, REG_ANY);
2571             *flagp |= HASWIDTH|SIMPLE;
2572         }
2573         RExC_naughty++;
2574         break;
2575     case '[':
2576     {
2577         char *oregcomp_parse = ++RExC_parse;
2578         ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state));
2579         if (*RExC_parse != ']') {
2580             RExC_parse = oregcomp_parse;
2581             vFAIL("Unmatched [");
2582         }
2583         nextchar(pRExC_state);
2584         *flagp |= HASWIDTH|SIMPLE;
2585         break;
2586     }
2587     case '(':
2588         nextchar(pRExC_state);
2589         ret = reg(pRExC_state, 1, &flags);
2590         if (ret == NULL) {
2591                 if (flags & TRYAGAIN) {
2592                     if (RExC_parse == RExC_end) {
2593                          /* Make parent create an empty node if needed. */
2594                         *flagp |= TRYAGAIN;
2595                         return(NULL);
2596                     }
2597                     goto tryagain;
2598                 }
2599                 return(NULL);
2600         }
2601         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2602         break;
2603     case '|':
2604     case ')':
2605         if (flags & TRYAGAIN) {
2606             *flagp |= TRYAGAIN;
2607             return NULL;
2608         }
2609         vFAIL("Internal urp");
2610                                 /* Supposed to be caught earlier. */
2611         break;
2612     case '{':
2613         if (!regcurly(RExC_parse)) {
2614             RExC_parse++;
2615             goto defchar;
2616         }
2617         /* FALL THROUGH */
2618     case '?':
2619     case '+':
2620     case '*':
2621         RExC_parse++;
2622         vFAIL("Quantifier follows nothing");
2623         break;
2624     case '\\':
2625         switch (*++RExC_parse) {
2626         case 'A':
2627             RExC_seen_zerolen++;
2628             ret = reg_node(pRExC_state, SBOL);
2629             *flagp |= SIMPLE;
2630             nextchar(pRExC_state);
2631             break;
2632         case 'G':
2633             ret = reg_node(pRExC_state, GPOS);
2634             RExC_seen |= REG_SEEN_GPOS;
2635             *flagp |= SIMPLE;
2636             nextchar(pRExC_state);
2637             break;
2638         case 'Z':
2639             ret = reg_node(pRExC_state, SEOL);
2640             *flagp |= SIMPLE;
2641             nextchar(pRExC_state);
2642             break;
2643         case 'z':
2644             ret = reg_node(pRExC_state, EOS);
2645             *flagp |= SIMPLE;
2646             RExC_seen_zerolen++;                /* Do not optimize RE away */
2647             nextchar(pRExC_state);
2648             break;
2649         case 'C':
2650             ret = reg_node(pRExC_state, SANY);
2651             *flagp |= HASWIDTH|SIMPLE;
2652             nextchar(pRExC_state);
2653             break;
2654         case 'X':
2655             ret = reg_node(pRExC_state, CLUMP);
2656             *flagp |= HASWIDTH;
2657             nextchar(pRExC_state);
2658             if (UTF && !PL_utf8_mark)
2659                 is_utf8_mark((U8*)"~");         /* preload table */
2660             break;
2661         case 'w':
2662             ret = reg_node(pRExC_state, 
2663                 UTF
2664                     ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
2665                     : (LOC ? ALNUML     : ALNUM));
2666             *flagp |= HASWIDTH|SIMPLE;
2667             nextchar(pRExC_state);
2668             if (UTF && !PL_utf8_alnum)
2669                 is_utf8_alnum((U8*)"a");        /* preload table */
2670             break;
2671         case 'W':
2672             ret = reg_node(pRExC_state, 
2673                 UTF
2674                     ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
2675                     : (LOC ? NALNUML     : NALNUM));
2676             *flagp |= HASWIDTH|SIMPLE;
2677             nextchar(pRExC_state);
2678             if (UTF && !PL_utf8_alnum)
2679                 is_utf8_alnum((U8*)"a");        /* preload table */
2680             break;
2681         case 'b':
2682             RExC_seen_zerolen++;
2683             RExC_seen |= REG_SEEN_LOOKBEHIND;
2684             ret = reg_node(pRExC_state, 
2685                 UTF
2686                     ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
2687                     : (LOC ? BOUNDL     : BOUND));
2688             *flagp |= SIMPLE;
2689             nextchar(pRExC_state);
2690             if (UTF && !PL_utf8_alnum)
2691                 is_utf8_alnum((U8*)"a");        /* preload table */
2692             break;
2693         case 'B':
2694             RExC_seen_zerolen++;
2695             RExC_seen |= REG_SEEN_LOOKBEHIND;
2696             ret = reg_node(pRExC_state, 
2697                 UTF
2698                     ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
2699                     : (LOC ? NBOUNDL     : NBOUND));
2700             *flagp |= SIMPLE;
2701             nextchar(pRExC_state);
2702             if (UTF && !PL_utf8_alnum)
2703                 is_utf8_alnum((U8*)"a");        /* preload table */
2704             break;
2705         case 's':
2706             ret = reg_node(pRExC_state, 
2707                 UTF
2708                     ? (LOC ? SPACELUTF8 : SPACEUTF8)
2709                     : (LOC ? SPACEL     : SPACE));
2710             *flagp |= HASWIDTH|SIMPLE;
2711             nextchar(pRExC_state);
2712             if (UTF && !PL_utf8_space)
2713                 is_utf8_space((U8*)" ");        /* preload table */
2714             break;
2715         case 'S':
2716             ret = reg_node(pRExC_state, 
2717                 UTF
2718                     ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
2719                     : (LOC ? NSPACEL     : NSPACE));
2720             *flagp |= HASWIDTH|SIMPLE;
2721             nextchar(pRExC_state);
2722             if (UTF && !PL_utf8_space)
2723                 is_utf8_space((U8*)" ");        /* preload table */
2724             break;
2725         case 'd':
2726             ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT);
2727             *flagp |= HASWIDTH|SIMPLE;
2728             nextchar(pRExC_state);
2729             if (UTF && !PL_utf8_digit)
2730                 is_utf8_digit((U8*)"1");        /* preload table */
2731             break;
2732         case 'D':
2733             ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT);
2734             *flagp |= HASWIDTH|SIMPLE;
2735             nextchar(pRExC_state);
2736             if (UTF && !PL_utf8_digit)
2737                 is_utf8_digit((U8*)"1");        /* preload table */
2738             break;
2739         case 'p':
2740         case 'P':
2741             {   /* a lovely hack--pretend we saw [\pX] instead */
2742                 char* oldregxend = RExC_end;
2743
2744                 if (RExC_parse[1] == '{') {
2745                     RExC_end = strchr(RExC_parse, '}');
2746                     if (!RExC_end) {
2747                         RExC_parse += 2;
2748                         RExC_end = oldregxend;
2749                         vFAIL("Missing right brace on \\p{}");
2750                     }
2751                     RExC_end++;
2752                 }
2753                 else
2754                     RExC_end = RExC_parse + 2;
2755                 RExC_parse--;
2756
2757                 ret = regclassutf8(pRExC_state);
2758
2759                 RExC_end = oldregxend;
2760                 RExC_parse--;
2761                 nextchar(pRExC_state);
2762                 *flagp |= HASWIDTH|SIMPLE;
2763             }
2764             break;
2765         case 'n':
2766         case 'r':
2767         case 't':
2768         case 'f':
2769         case 'e':
2770         case 'a':
2771         case 'x':
2772         case 'c':
2773         case '0':
2774             goto defchar;
2775         case '1': case '2': case '3': case '4':
2776         case '5': case '6': case '7': case '8': case '9':
2777             {
2778                 I32 num = atoi(RExC_parse);
2779
2780                 if (num > 9 && num >= RExC_npar)
2781                     goto defchar;
2782                 else {
2783                     while (isDIGIT(*RExC_parse))
2784                         RExC_parse++;
2785
2786                     if (!SIZE_ONLY && num > RExC_rx->nparens)
2787                         vFAIL("Reference to nonexistent group");
2788                     RExC_sawback = 1;
2789                     ret = reganode(pRExC_state, FOLD
2790                                    ? (LOC ? REFFL : REFF)
2791                                    : REF, num);
2792                     *flagp |= HASWIDTH;
2793                     RExC_parse--;
2794                     nextchar(pRExC_state);
2795                 }
2796             }
2797             break;
2798         case '\0':
2799             if (RExC_parse >= RExC_end)
2800                 FAIL("Trailing \\");
2801             /* FALL THROUGH */
2802         default:
2803             /* Do not generate `unrecognized' warnings here, we fall
2804                back into the quick-grab loop below */
2805             goto defchar;
2806         }
2807         break;
2808
2809     case '#':
2810         if (RExC_flags16 & PMf_EXTENDED) {
2811             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
2812             if (RExC_parse < RExC_end)
2813                 goto tryagain;
2814         }
2815         /* FALL THROUGH */
2816
2817     default: {
2818             register STRLEN len;
2819             register UV ender;
2820             register char *p;
2821             char *oldp, *s;
2822             STRLEN numlen;
2823
2824             RExC_parse++;
2825
2826         defchar:
2827             ret = reg_node(pRExC_state, FOLD
2828                           ? (LOC ? EXACTFL : EXACTF)
2829                           : EXACT);
2830             s = STRING(ret);
2831             for (len = 0, p = RExC_parse - 1;
2832               len < 127 && p < RExC_end;
2833               len++)
2834             {
2835                 oldp = p;
2836
2837                 if (RExC_flags16 & PMf_EXTENDED)
2838                     p = regwhite(p, RExC_end);
2839                 switch (*p) {
2840                 case '^':
2841                 case '$':
2842                 case '.':
2843                 case '[':
2844                 case '(':
2845                 case ')':
2846                 case '|':
2847                     goto loopdone;
2848                 case '\\':
2849                     switch (*++p) {
2850                     case 'A':
2851                     case 'G':
2852                     case 'Z':
2853                     case 'z':
2854                     case 'w':
2855                     case 'W':
2856                     case 'b':
2857                     case 'B':
2858                     case 's':
2859                     case 'S':
2860                     case 'd':
2861                     case 'D':
2862                     case 'p':
2863                     case 'P':
2864                         --p;
2865                         goto loopdone;
2866                     case 'n':
2867                         ender = '\n';
2868                         p++;
2869                         break;
2870                     case 'r':
2871                         ender = '\r';
2872                         p++;
2873                         break;
2874                     case 't':
2875                         ender = '\t';
2876                         p++;
2877                         break;
2878                     case 'f':
2879                         ender = '\f';
2880                         p++;
2881                         break;
2882                     case 'e':
2883 #ifdef ASCIIish
2884                           ender = '\033';
2885 #else
2886                           ender = '\047';
2887 #endif
2888                         p++;
2889                         break;
2890                     case 'a':
2891 #ifdef ASCIIish
2892                           ender = '\007';
2893 #else
2894                           ender = '\057';
2895 #endif
2896                         p++;
2897                         break;
2898                     case 'x':
2899                         if (*++p == '{') {
2900                             char* e = strchr(p, '}');
2901          
2902                             if (!e) {
2903                                 RExC_parse = p + 1;
2904                                 vFAIL("Missing right brace on \\x{}");
2905                             }
2906                             else {
2907                                 numlen = 1;     /* allow underscores */
2908                                 ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
2909                                 /* numlen is generous */
2910                                 if (numlen + len >= 127) {
2911                                     p--;
2912                                     goto loopdone;
2913                                 }
2914                                 p = e + 1;
2915                             }
2916                         }
2917                         else {
2918                             numlen = 0;         /* disallow underscores */
2919                             ender = (UV)scan_hex(p, 2, &numlen);
2920                             p += numlen;
2921                         }
2922                         break;
2923                     case 'c':
2924                         p++;
2925                         ender = UCHARAT(p++);
2926                         ender = toCTRL(ender);
2927                         break;
2928                     case '0': case '1': case '2': case '3':case '4':
2929                     case '5': case '6': case '7': case '8':case '9':
2930                         if (*p == '0' ||
2931                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
2932                             numlen = 0;         /* disallow underscores */
2933                             ender = (UV)scan_oct(p, 3, &numlen);
2934                             p += numlen;
2935                         }
2936                         else {
2937                             --p;
2938                             goto loopdone;
2939                         }
2940                         break;
2941                     case '\0':
2942                         if (p >= RExC_end)
2943                             FAIL("Trailing \\");
2944                         /* FALL THROUGH */
2945                     default:
2946                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
2947                             vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
2948                         goto normal_default;
2949                     }
2950                     break;
2951                 default:
2952                   normal_default:
2953                     if ((*p & 0xc0) == 0xc0 && UTF) {
2954                         ender = utf8_to_uv((U8*)p, RExC_end - p,
2955                                                &numlen, 0);
2956                         p += numlen;
2957                     }
2958                     else
2959                         ender = *p++;
2960                     break;
2961                 }
2962                 if (RExC_flags16 & PMf_EXTENDED)
2963                     p = regwhite(p, RExC_end);
2964                 if (UTF && FOLD) {
2965                     if (LOC)
2966                         ender = toLOWER_LC_uni(ender);
2967                     else
2968                         ender = toLOWER_uni(ender);
2969                 }
2970                 if (ISMULT2(p)) { /* Back off on ?+*. */
2971                     if (len)
2972                         p = oldp;
2973                     else if (ender >= 0x80 && UTF) {
2974                         reguni(pRExC_state, ender, s, &numlen);
2975                         s += numlen;
2976                         len += numlen;
2977                     }
2978                     else {
2979                         len++;
2980                         REGC(ender, s++);
2981                     }
2982                     break;
2983                 }
2984                 if (ender >= 0x80 && UTF) {
2985                     reguni(pRExC_state, ender, s, &numlen);
2986                     s += numlen;
2987                     len += numlen - 1;
2988                 }
2989                 else
2990                     REGC(ender, s++);
2991             }
2992         loopdone:
2993             RExC_parse = p - 1;
2994             nextchar(pRExC_state);
2995             {
2996                 /* len is STRLEN which is unsigned, need to copy to signed */
2997                 IV iv = len;
2998                 if (iv < 0)
2999                     vFAIL("Internal disaster");
3000             }
3001             if (len > 0)
3002                 *flagp |= HASWIDTH;
3003             if (len == 1)
3004                 *flagp |= SIMPLE;
3005             if (!SIZE_ONLY)
3006                 STR_LEN(ret) = len;
3007             if (SIZE_ONLY)
3008                 RExC_size += STR_SZ(len);
3009             else
3010                 RExC_emit += STR_SZ(len);
3011         }
3012         break;
3013     }
3014
3015     return(ret);
3016 }
3017
3018 STATIC char *
3019 S_regwhite(pTHX_ char *p, char *e)
3020 {
3021     while (p < e) {
3022         if (isSPACE(*p))
3023             ++p;
3024         else if (*p == '#') {
3025             do {
3026                 p++;
3027             } while (p < e && *p != '\n');
3028         }
3029         else
3030             break;
3031     }
3032     return p;
3033 }
3034
3035 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3036    Character classes ([:foo:]) can also be negated ([:^foo:]).
3037    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3038    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3039    but trigger warnings because they are currently unimplemented. */
3040 STATIC I32
3041 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3042 {
3043     char *posixcc = 0;
3044     I32 namedclass = OOB_NAMEDCLASS;
3045
3046     if (value == '[' && RExC_parse + 1 < RExC_end &&
3047         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3048         (*RExC_parse == ':' ||
3049          *RExC_parse == '=' ||
3050          *RExC_parse == '.')) {
3051         char  c = *RExC_parse;
3052         char* s = RExC_parse++;
3053             
3054         while (RExC_parse < RExC_end && *RExC_parse != c)
3055             RExC_parse++;
3056         if (RExC_parse == RExC_end)
3057             /* Grandfather lone [:, [=, [. */
3058             RExC_parse = s;
3059         else {
3060             char* t = RExC_parse++; /* skip over the c */
3061
3062             if (*RExC_parse == ']') {
3063                 RExC_parse++; /* skip over the ending ] */
3064                 posixcc = s + 1;
3065                 if (*s == ':') {
3066                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3067                     I32 skip = 5; /* the most common skip */
3068
3069                     switch (*posixcc) {
3070                     case 'a':
3071                         if (strnEQ(posixcc, "alnum", 5))
3072                             namedclass =
3073                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3074                         else if (strnEQ(posixcc, "alpha", 5))
3075                             namedclass =
3076                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3077                         else if (strnEQ(posixcc, "ascii", 5))
3078                             namedclass =
3079                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3080                         break;
3081                     case 'b':
3082                         if (strnEQ(posixcc, "blank", 5))
3083                             namedclass =
3084                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3085                         break;
3086                     case 'c':
3087                         if (strnEQ(posixcc, "cntrl", 5))
3088                             namedclass =
3089                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3090                         break;
3091                     case 'd':
3092                         if (strnEQ(posixcc, "digit", 5))
3093                             namedclass =
3094                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3095                         break;
3096                     case 'g':
3097                         if (strnEQ(posixcc, "graph", 5))
3098                             namedclass =
3099                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3100                         break;
3101                     case 'l':
3102                         if (strnEQ(posixcc, "lower", 5))
3103                             namedclass =
3104                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3105                         break;
3106                     case 'p':
3107                         if (strnEQ(posixcc, "print", 5))
3108                             namedclass =
3109                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3110                         else if (strnEQ(posixcc, "punct", 5))
3111                             namedclass =
3112                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3113                         break;
3114                     case 's':
3115                         if (strnEQ(posixcc, "space", 5))
3116                             namedclass =
3117                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3118                         break;
3119                     case 'u':
3120                         if (strnEQ(posixcc, "upper", 5))
3121                             namedclass =
3122                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3123                         break;
3124                     case 'w': /* this is not POSIX, this is the Perl \w */
3125                         if (strnEQ(posixcc, "word", 4)) {
3126                             namedclass =
3127                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3128                             skip = 4;
3129                         }
3130                         break;
3131                     case 'x':
3132                         if (strnEQ(posixcc, "xdigit", 6)) {
3133                             namedclass =
3134                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3135                             skip = 6;
3136                         }
3137                         break;
3138                     }
3139                     if (namedclass == OOB_NAMEDCLASS ||
3140                         posixcc[skip] != ':' ||
3141                         posixcc[skip+1] != ']')
3142                     {
3143                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3144                                       t - s - 1, s + 1);
3145                     }
3146                 } else if (!SIZE_ONLY) {
3147                     /* [[=foo=]] and [[.foo.]] are still future. */
3148
3149                     /* adjust RExC_parse so the warning shows after
3150                        the class closes */
3151                     while (*RExC_parse && *RExC_parse != ']')
3152                         RExC_parse++;
3153                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3154                 }
3155             } else {
3156                 /* Maternal grandfather:
3157                  * "[:" ending in ":" but not in ":]" */
3158                 RExC_parse = s;
3159             }
3160         }
3161     }
3162
3163     return namedclass;
3164 }
3165
3166 STATIC void
3167 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3168 {
3169     if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
3170         (*RExC_parse == ':' ||
3171          *RExC_parse == '=' ||
3172          *RExC_parse == '.')) {
3173         char *s = RExC_parse;
3174         char  c = *s++;
3175
3176         while(*s && isALNUM(*s))
3177             s++;
3178         if (*s && c == *s && s[1] == ']') {
3179             vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3180
3181             /* [[=foo=]] and [[.foo.]] are still future. */
3182             if (c == '=' || c == '.')
3183             {
3184                 /* adjust RExC_parse so the error shows after
3185                    the class closes */
3186                 while (*RExC_parse && *RExC_parse++ != ']')
3187                     ;
3188                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3189             }
3190         }
3191     }
3192 }
3193
3194 STATIC regnode *
3195 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3196 {
3197     register U32 value;
3198     register I32 lastvalue = OOB_CHAR8;
3199     register I32 range = 0;
3200     register regnode *ret;
3201     STRLEN numlen;
3202     I32 namedclass;
3203     char *rangebegin;
3204     bool need_class = 0;
3205
3206     ret = reg_node(pRExC_state, ANYOF);
3207     if (SIZE_ONLY)
3208         RExC_size += ANYOF_SKIP;
3209     else {
3210         ret->flags = 0;
3211         ANYOF_BITMAP_ZERO(ret);
3212         RExC_emit += ANYOF_SKIP;
3213         if (FOLD)
3214             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3215         if (LOC)
3216             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3217     }
3218     if (*RExC_parse == '^') {   /* Complement of range. */
3219         RExC_naughty++;
3220         RExC_parse++;
3221         if (!SIZE_ONLY)
3222             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3223     }
3224
3225     if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
3226         checkposixcc(pRExC_state);
3227
3228     if (*RExC_parse == ']' || *RExC_parse == '-')
3229         goto skipcond;          /* allow 1st char to be ] or - */
3230     while (RExC_parse < RExC_end && *RExC_parse != ']') {
3231        skipcond:
3232         namedclass = OOB_NAMEDCLASS;
3233         if (!range)
3234             rangebegin = RExC_parse;
3235         value = UCHARAT(RExC_parse++);
3236         if (value == '[')
3237             namedclass = regpposixcc(pRExC_state, value);
3238         else if (value == '\\') {
3239             value = UCHARAT(RExC_parse++);
3240             /* Some compilers cannot handle switching on 64-bit integer
3241              * values, therefore the 'value' cannot be an UV. --jhi */
3242             switch (value) {
3243             case 'w':   namedclass = ANYOF_ALNUM;       break;
3244             case 'W':   namedclass = ANYOF_NALNUM;      break;
3245             case 's':   namedclass = ANYOF_SPACE;       break;
3246             case 'S':   namedclass = ANYOF_NSPACE;      break;
3247             case 'd':   namedclass = ANYOF_DIGIT;       break;
3248             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3249             case 'n':   value = '\n';                   break;
3250             case 'r':   value = '\r';                   break;
3251             case 't':   value = '\t';                   break;
3252             case 'f':   value = '\f';                   break;
3253             case 'b':   value = '\b';                   break;
3254 #ifdef ASCIIish
3255             case 'e':   value = '\033';                 break;
3256             case 'a':   value = '\007';                 break;
3257 #else
3258             case 'e':   value = '\047';                 break;
3259             case 'a':   value = '\057';                 break;
3260 #endif
3261             case 'x':
3262                 numlen = 0;             /* disallow underscores */
3263                 value = (UV)scan_hex(RExC_parse, 2, &numlen);
3264                 RExC_parse += numlen;
3265                 break;
3266             case 'c':
3267                 value = UCHARAT(RExC_parse++);
3268                 value = toCTRL(value);
3269                 break;
3270             case '0': case '1': case '2': case '3': case '4':
3271             case '5': case '6': case '7': case '8': case '9':
3272                 numlen = 0;             /* disallow underscores */
3273                 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3274                 RExC_parse += numlen;
3275                 break;
3276             default:
3277                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3278
3279                     vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
3280                 break;
3281             }
3282         }
3283         if (namedclass > OOB_NAMEDCLASS) {
3284             if (!need_class && !SIZE_ONLY)
3285                 ANYOF_CLASS_ZERO(ret);
3286             need_class = 1;
3287             if (range) { /* a-\d, a-[:digit:] */
3288                 if (!SIZE_ONLY) {
3289                     if (ckWARN(WARN_REGEXP))
3290                         vWARN4(RExC_parse,
3291                                "False [] range \"%*.*s\"",
3292                                RExC_parse - rangebegin,
3293                                RExC_parse - rangebegin,
3294                                rangebegin);
3295                     ANYOF_BITMAP_SET(ret, lastvalue);
3296                     ANYOF_BITMAP_SET(ret, '-');
3297                 }
3298                 range = 0; /* this is not a true range */
3299             }
3300             if (!SIZE_ONLY) {
3301                 switch (namedclass) {
3302                 case ANYOF_ALNUM:
3303                     if (LOC)
3304                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3305                     else {
3306                         for (value = 0; value < 256; value++)
3307                             if (isALNUM(value))
3308                                 ANYOF_BITMAP_SET(ret, value);
3309                     }
3310                     break;
3311                 case ANYOF_NALNUM:
3312                     if (LOC)
3313                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3314                     else {
3315                         for (value = 0; value < 256; value++)
3316                             if (!isALNUM(value))
3317                                 ANYOF_BITMAP_SET(ret, value);
3318                     }
3319                     break;
3320                 case ANYOF_SPACE:
3321                     if (LOC)
3322                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3323                     else {
3324                         for (value = 0; value < 256; value++)
3325                             if (isSPACE(value))
3326                                 ANYOF_BITMAP_SET(ret, value);
3327                     }
3328                     break;
3329                 case ANYOF_NSPACE:
3330                     if (LOC)
3331                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3332                     else {
3333                         for (value = 0; value < 256; value++)
3334                             if (!isSPACE(value))
3335                                 ANYOF_BITMAP_SET(ret, value);
3336                     }
3337                     break;
3338                 case ANYOF_DIGIT:
3339                     if (LOC)
3340                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3341                     else {
3342                         for (value = '0'; value <= '9'; value++)
3343                             ANYOF_BITMAP_SET(ret, value);
3344                     }
3345                     break;
3346                 case ANYOF_NDIGIT:
3347                     if (LOC)
3348                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3349                     else {
3350                         for (value = 0; value < '0'; value++)
3351                             ANYOF_BITMAP_SET(ret, value);
3352                         for (value = '9' + 1; value < 256; value++)
3353                             ANYOF_BITMAP_SET(ret, value);
3354                     }
3355                     break;
3356                 case ANYOF_NALNUMC:
3357                     if (LOC)
3358                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3359                     else {
3360                         for (value = 0; value < 256; value++)
3361                             if (!isALNUMC(value))
3362                                 ANYOF_BITMAP_SET(ret, value);
3363                     }
3364                     break;
3365                 case ANYOF_ALNUMC:
3366                     if (LOC)
3367                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3368                     else {
3369                         for (value = 0; value < 256; value++)
3370                             if (isALNUMC(value))
3371                                 ANYOF_BITMAP_SET(ret, value);
3372                     }
3373                     break;
3374                 case ANYOF_ALPHA:
3375                     if (LOC)
3376                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3377                     else {
3378                         for (value = 0; value < 256; value++)
3379                             if (isALPHA(value))
3380                                 ANYOF_BITMAP_SET(ret, value);
3381                     }
3382                     break;
3383                 case ANYOF_NALPHA:
3384                     if (LOC)
3385                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3386                     else {
3387                         for (value = 0; value < 256; value++)
3388                             if (!isALPHA(value))
3389                                 ANYOF_BITMAP_SET(ret, value);
3390                     }
3391                     break;
3392                 case ANYOF_ASCII:
3393                     if (LOC)
3394                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3395                     else {
3396 #ifdef ASCIIish
3397                         for (value = 0; value < 128; value++)
3398                             ANYOF_BITMAP_SET(ret, value);
3399 #else  /* EBCDIC */
3400                         for (value = 0; value < 256; value++)
3401                             if (isASCII(value))
3402                                 ANYOF_BITMAP_SET(ret, value);
3403 #endif /* EBCDIC */
3404                     }
3405                     break;
3406                 case ANYOF_NASCII:
3407                     if (LOC)
3408                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3409                     else {
3410 #ifdef ASCIIish
3411                         for (value = 128; value < 256; value++)
3412                             ANYOF_BITMAP_SET(ret, value);
3413 #else  /* EBCDIC */
3414                         for (value = 0; value < 256; value++)
3415                             if (!isASCII(value))
3416                                 ANYOF_BITMAP_SET(ret, value);
3417 #endif /* EBCDIC */
3418                     }
3419                     break;
3420                 case ANYOF_BLANK:
3421                     if (LOC)
3422                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3423                     else {
3424                         for (value = 0; value < 256; value++)
3425                             if (isBLANK(value))
3426                                 ANYOF_BITMAP_SET(ret, value);
3427                     }
3428                     break;
3429                 case ANYOF_NBLANK:
3430                     if (LOC)
3431                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3432                     else {
3433                         for (value = 0; value < 256; value++)
3434                             if (!isBLANK(value))
3435                                 ANYOF_BITMAP_SET(ret, value);
3436                     }
3437                     break;
3438                 case ANYOF_CNTRL:
3439                     if (LOC)
3440                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3441                     else {
3442                         for (value = 0; value < 256; value++)
3443                             if (isCNTRL(value))
3444                                 ANYOF_BITMAP_SET(ret, value);
3445                     }
3446                     lastvalue = OOB_CHAR8;
3447                     break;
3448                 case ANYOF_NCNTRL:
3449                     if (LOC)
3450                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3451                     else {
3452                         for (value = 0; value < 256; value++)
3453                             if (!isCNTRL(value))
3454                                 ANYOF_BITMAP_SET(ret, value);
3455                     }
3456                     break;
3457                 case ANYOF_GRAPH:
3458                     if (LOC)
3459                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3460                     else {
3461                         for (value = 0; value < 256; value++)
3462                             if (isGRAPH(value))
3463                                 ANYOF_BITMAP_SET(ret, value);
3464                     }
3465                     break;
3466                 case ANYOF_NGRAPH:
3467                     if (LOC)
3468                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3469                     else {
3470                         for (value = 0; value < 256; value++)
3471                             if (!isGRAPH(value))
3472                                 ANYOF_BITMAP_SET(ret, value);
3473                     }
3474                     break;
3475                 case ANYOF_LOWER:
3476                     if (LOC)
3477                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3478                     else {
3479                         for (value = 0; value < 256; value++)
3480                             if (isLOWER(value))
3481                                 ANYOF_BITMAP_SET(ret, value);
3482                     }
3483                     break;
3484                 case ANYOF_NLOWER:
3485                     if (LOC)
3486                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3487                     else {
3488                         for (value = 0; value < 256; value++)
3489                             if (!isLOWER(value))
3490                                 ANYOF_BITMAP_SET(ret, value);
3491                     }
3492                     break;
3493                 case ANYOF_PRINT:
3494                     if (LOC)
3495                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3496                     else {
3497                         for (value = 0; value < 256; value++)
3498                             if (isPRINT(value))
3499                                 ANYOF_BITMAP_SET(ret, value);
3500                     }
3501                     break;
3502                 case ANYOF_NPRINT:
3503                     if (LOC)
3504                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3505                     else {
3506                         for (value = 0; value < 256; value++)
3507                             if (!isPRINT(value))
3508                                 ANYOF_BITMAP_SET(ret, value);
3509                     }
3510                     break;
3511                 case ANYOF_PSXSPC:
3512                     if (LOC)
3513                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3514                     else {
3515                         for (value = 0; value < 256; value++)
3516                             if (isPSXSPC(value))
3517                                 ANYOF_BITMAP_SET(ret, value);
3518                     }
3519                     break;
3520                 case ANYOF_NPSXSPC:
3521                     if (LOC)
3522                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3523                     else {
3524                         for (value = 0; value < 256; value++)
3525                             if (!isPSXSPC(value))
3526                                 ANYOF_BITMAP_SET(ret, value);
3527                     }
3528                     break;
3529                 case ANYOF_PUNCT:
3530                     if (LOC)
3531                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3532                     else {
3533                         for (value = 0; value < 256; value++)
3534                             if (isPUNCT(value))
3535                                 ANYOF_BITMAP_SET(ret, value);
3536                     }
3537                     break;
3538                 case ANYOF_NPUNCT:
3539                     if (LOC)
3540                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3541                     else {
3542                         for (value = 0; value < 256; value++)
3543                             if (!isPUNCT(value))
3544                                 ANYOF_BITMAP_SET(ret, value);
3545                     }
3546                     break;
3547                 case ANYOF_UPPER:
3548                     if (LOC)
3549                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
3550                     else {
3551                         for (value = 0; value < 256; value++)
3552                             if (isUPPER(value))
3553                                 ANYOF_BITMAP_SET(ret, value);
3554                     }
3555                     break;
3556                 case ANYOF_NUPPER:
3557                     if (LOC)
3558                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
3559                     else {
3560                         for (value = 0; value < 256; value++)
3561                             if (!isUPPER(value))
3562                                 ANYOF_BITMAP_SET(ret, value);
3563                     }
3564                     break;
3565                 case ANYOF_XDIGIT:
3566                     if (LOC)
3567                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
3568                     else {
3569                         for (value = 0; value < 256; value++)
3570                             if (isXDIGIT(value))
3571                                 ANYOF_BITMAP_SET(ret, value);
3572                     }
3573                     break;
3574                 case ANYOF_NXDIGIT:
3575                     if (LOC)
3576                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
3577                     else {
3578                         for (value = 0; value < 256; value++)
3579                             if (!isXDIGIT(value))
3580                                 ANYOF_BITMAP_SET(ret, value);
3581                     }
3582                     break;
3583                 default:
3584                     vFAIL("Invalid [::] class");
3585                     break;
3586                 }
3587                 if (LOC)
3588                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
3589                 continue;
3590             }
3591         }
3592         if (range) {
3593             if (lastvalue > value) /* b-a */ {
3594                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
3595                               RExC_parse - rangebegin,
3596                               RExC_parse - rangebegin,
3597                               rangebegin);
3598             }
3599             range = 0;
3600         }
3601         else {
3602             lastvalue = value;
3603             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
3604                 RExC_parse[1] != ']') {
3605                 RExC_parse++;
3606                 if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
3607                     if (ckWARN(WARN_REGEXP))
3608                         vWARN4(RExC_parse,
3609                                "False [] range \"%*.*s\"",
3610                                RExC_parse - rangebegin,
3611                                RExC_parse - rangebegin,
3612                                rangebegin);
3613                     if (!SIZE_ONLY)
3614                         ANYOF_BITMAP_SET(ret, '-');
3615                 } else
3616                     range = 1;
3617                 continue;       /* do it next time */
3618             }
3619         }
3620         /* now is the next time */
3621         if (!SIZE_ONLY) {
3622 #ifndef ASCIIish /* EBCDIC, for example. */
3623             if ((isLOWER(lastvalue) && isLOWER(value)) ||
3624                 (isUPPER(lastvalue) && isUPPER(value)))
3625             {
3626                 I32 i;
3627                 if (isLOWER(lastvalue)) {
3628                     for (i = lastvalue; i <= value; i++)
3629                         if (isLOWER(i))
3630                             ANYOF_BITMAP_SET(ret, i);
3631                 } else {
3632                     for (i = lastvalue; i <= value; i++)
3633                         if (isUPPER(i))
3634                             ANYOF_BITMAP_SET(ret, i);
3635                 }
3636             }
3637             else
3638 #endif
3639                 for ( ; lastvalue <= value; lastvalue++)
3640                     ANYOF_BITMAP_SET(ret, lastvalue);
3641         }
3642         range = 0;
3643     }
3644     if (need_class) {
3645         if (SIZE_ONLY)
3646             RExC_size += ANYOF_CLASS_ADD_SKIP;
3647         else
3648             RExC_emit += ANYOF_CLASS_ADD_SKIP;
3649     }
3650     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
3651     if (!SIZE_ONLY &&
3652         (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
3653         for (value = 0; value < 256; ++value) {
3654             if (ANYOF_BITMAP_TEST(ret, value)) {
3655                 I32 cf = PL_fold[value];
3656                 ANYOF_BITMAP_SET(ret, cf);
3657             }
3658         }
3659         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
3660     }
3661     /* optimize inverted simple patterns (e.g. [^a-z]) */
3662     if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
3663         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
3664             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
3665         ANYOF_FLAGS(ret) = 0;
3666     }
3667     return ret;
3668 }
3669
3670 STATIC regnode *
3671 S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
3672 {
3673     register char *e;
3674     register U32 value;
3675     register U32 lastvalue = OOB_UTF8;
3676     register I32 range = 0;
3677     register regnode *ret;
3678     STRLEN numlen;
3679     I32 n;
3680     SV *listsv;
3681     U8 flags = 0;
3682     I32 namedclass;
3683     char *rangebegin;
3684
3685     if (*RExC_parse == '^') {   /* Complement of range. */
3686         RExC_naughty++;
3687         RExC_parse++;
3688         if (!SIZE_ONLY)
3689             flags |= ANYOF_INVERT;
3690     }
3691     if (!SIZE_ONLY) {
3692         if (FOLD)
3693             flags |= ANYOF_FOLD;
3694         if (LOC)
3695             flags |= ANYOF_LOCALE;
3696         listsv = newSVpvn("# comment\n", 10);
3697     }
3698
3699     if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
3700         checkposixcc(pRExC_state);
3701
3702     if (*RExC_parse == ']' || *RExC_parse == '-')
3703         goto skipcond;          /* allow 1st char to be ] or - */
3704
3705     while (RExC_parse < RExC_end && *RExC_parse != ']') {
3706        skipcond:
3707         namedclass = OOB_NAMEDCLASS;
3708         if (!range)
3709             rangebegin = RExC_parse;
3710         value = utf8_to_uv((U8*)RExC_parse,
3711                                RExC_end - RExC_parse,
3712                                &numlen, 0);
3713         RExC_parse += numlen;
3714         if (value == '[')
3715             namedclass = regpposixcc(pRExC_state, value);
3716         else if (value == '\\') {
3717             value = (U32)utf8_to_uv((U8*)RExC_parse,
3718                                         RExC_end - RExC_parse,
3719                                         &numlen, 0);
3720             RExC_parse += numlen;
3721             /* Some compilers cannot handle switching on 64-bit integer
3722              * values, therefore value cannot be a UV.  Yes, this will
3723              * be a problem later if we want to switch on Unicode.  --jhi */
3724             switch (value) {
3725             case 'w':           namedclass = ANYOF_ALNUM;               break;
3726             case 'W':           namedclass = ANYOF_NALNUM;              break;
3727             case 's':           namedclass = ANYOF_SPACE;               break;
3728             case 'S':           namedclass = ANYOF_NSPACE;              break;
3729             case 'd':           namedclass = ANYOF_DIGIT;               break;
3730             case 'D':           namedclass = ANYOF_NDIGIT;              break;
3731             case 'p':
3732             case 'P':
3733                 if (*RExC_parse == '{') {
3734                     e = strchr(RExC_parse++, '}');
3735                     if (!e)
3736                         vFAIL("Missing right brace on \\p{}");
3737                     n = e - RExC_parse;
3738                 }
3739                 else {
3740                     e = RExC_parse;
3741                     n = 1;
3742                 }
3743                 if (!SIZE_ONLY) {
3744                     if (value == 'p')
3745                         Perl_sv_catpvf(aTHX_ listsv,
3746                                        "+utf8::%.*s\n", (int)n, RExC_parse);
3747                     else
3748                         Perl_sv_catpvf(aTHX_ listsv,
3749                                        "!utf8::%.*s\n", (int)n, RExC_parse);
3750                 }
3751                 RExC_parse = e + 1;
3752                 lastvalue = OOB_UTF8;
3753                 continue;
3754             case 'n':           value = '\n';           break;
3755             case 'r':           value = '\r';           break;
3756             case 't':           value = '\t';           break;
3757             case 'f':           value = '\f';           break;
3758             case 'b':           value = '\b';           break;
3759 #ifdef ASCIIish
3760             case 'e':           value = '\033';         break;
3761             case 'a':           value = '\007';         break;
3762 #else
3763             case 'e':           value = '\047';         break;
3764             case 'a':           value = '\057';         break;
3765 #endif
3766             case 'x':
3767                 if (*RExC_parse == '{') {
3768                     e = strchr(RExC_parse++, '}');
3769                     if (!e) 
3770                         vFAIL("Missing right brace on \\x{}");
3771                     numlen = 1;         /* allow underscores */
3772                     value = (UV)scan_hex(RExC_parse,
3773                                      e - RExC_parse,
3774                                      &numlen);
3775                     RExC_parse = e + 1;
3776                 }
3777                 else {
3778                     numlen = 0;         /* disallow underscores */
3779                     value = (UV)scan_hex(RExC_parse, 2, &numlen);
3780                     RExC_parse += numlen;
3781                 }
3782                 break;
3783             case 'c':
3784                 value = UCHARAT(RExC_parse++);
3785                 value = toCTRL(value);
3786                 break;
3787             case '0': case '1': case '2': case '3': case '4':
3788             case '5': case '6': case '7': case '8': case '9':
3789                 numlen = 0;             /* disallow underscores */
3790                 value = (UV)scan_oct(--RExC_parse, 3, &numlen);
3791                 RExC_parse += numlen;
3792                 break;
3793             default:
3794                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3795                     vWARN2(RExC_parse,
3796                            "Unrecognized escape \\%c in character class passed through",
3797                            (int)value);
3798                 break;
3799             }
3800         }
3801         if (namedclass > OOB_NAMEDCLASS) {
3802             if (range) { /* a-\d, a-[:digit:] */
3803                 if (!SIZE_ONLY) {
3804                     if (ckWARN(WARN_REGEXP))
3805                         vWARN4(RExC_parse,
3806                                "False [] range \"%*.*s\"",
3807                                RExC_parse - rangebegin,
3808                                RExC_parse - rangebegin,
3809                                rangebegin);
3810                     Perl_sv_catpvf(aTHX_ listsv,
3811                                    /* 0x002D is Unicode for '-' */
3812                                    "%04"UVxf"\n002D\n", (UV)lastvalue);
3813                 }
3814                 range = 0;
3815             }
3816             if (!SIZE_ONLY) {
3817                 switch (namedclass) {
3818                 case ANYOF_ALNUM:
3819                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    break;
3820                 case ANYOF_NALNUM:
3821                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");    break;
3822                 case ANYOF_ALNUMC:
3823                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");   break;
3824                 case ANYOF_NALNUMC:
3825                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");   break;
3826                 case ANYOF_ALPHA:
3827                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");   break;
3828                 case ANYOF_NALPHA:
3829                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");   break;
3830                 case ANYOF_ASCII:
3831                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");   break;
3832                 case ANYOF_NASCII:
3833                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");   break;
3834                 case ANYOF_CNTRL:
3835                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");   break;
3836                 case ANYOF_NCNTRL:
3837                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");   break;
3838                 case ANYOF_GRAPH:
3839                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");   break;
3840                 case ANYOF_NGRAPH:
3841                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");   break;
3842                 case ANYOF_DIGIT:
3843                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");   break;
3844                 case ANYOF_NDIGIT:
3845                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");   break;
3846                 case ANYOF_LOWER:
3847                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");   break;
3848                 case ANYOF_NLOWER:
3849                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");   break;
3850                 case ANYOF_PRINT:
3851                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");   break;
3852                 case ANYOF_NPRINT:
3853                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");   break;
3854                 case ANYOF_PUNCT:
3855                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");   break;
3856                 case ANYOF_NPUNCT:
3857                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");   break;
3858                 case ANYOF_SPACE:
3859                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break;
3860                 case ANYOF_NSPACE:
3861                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break;
3862                 case ANYOF_BLANK:
3863                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");   break;
3864                 case ANYOF_NBLANK:
3865                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");   break;
3866                 case ANYOF_PSXSPC:
3867                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");   break;
3868                 case ANYOF_NPSXSPC:
3869                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");   break;
3870                 case ANYOF_UPPER:
3871                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");   break;
3872                 case ANYOF_NUPPER:
3873                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");   break;
3874                 case ANYOF_XDIGIT:
3875                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");  break;
3876                 case ANYOF_NXDIGIT:
3877                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");  break;
3878                 }
3879                 continue;
3880             }
3881         }
3882         if (range) {