This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  *
12  * This file is also copied at build time to ext/re/re_comp.c, where
13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14  * This causes the main functions to be compiled under new names and with
15  * debugging support added, which makes "use re 'debug'" work.
16  */
17
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19  * confused with the original package (see point 3 below).  Thanks, Henry!
20  */
21
22 /* Additional note: this code is very heavily munged from Henry's version
23  * in places.  In some spots I've traded clarity for efficiency, so don't
24  * blame Henry for some of the lack of readability.
25  */
26
27 /* The names of the functions have been changed from regcomp and
28  * regexec to  pregcomp and pregexec in order to avoid conflicts
29  * with the POSIX routines of the same names.
30 */
31
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 #  ifndef PERL_IN_XSUB_RE
35 #    define PERL_IN_XSUB_RE
36 #  endif
37 /* need access to debugger hooks */
38 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39 #    define DEBUGGING
40 #  endif
41 #endif
42
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 #  define Perl_pregcomp my_regcomp
46 #  define Perl_regdump my_regdump
47 #  define Perl_regprop my_regprop
48 #  define Perl_pregfree my_regfree
49 #  define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 #  define Perl_regnext my_regnext
52 #  define Perl_save_re_context my_save_re_context
53 #  define Perl_reginitcolors my_reginitcolors
54
55 #  define PERL_NO_GET_CONTEXT
56 #endif
57
58 /*
59  * pregcomp and pregexec -- regsub and regerror are not used in perl
60  *
61  *      Copyright (c) 1986 by University of Toronto.
62  *      Written by Henry Spencer.  Not derived from licensed software.
63  *
64  *      Permission is granted to anyone to use this software for any
65  *      purpose on any computer system, and to redistribute it freely,
66  *      subject to the following restrictions:
67  *
68  *      1. The author is not responsible for the consequences of use of
69  *              this software, no matter how awful, even if they arise
70  *              from defects in it.
71  *
72  *      2. The origin of this software must not be misrepresented, either
73  *              by explicit claim or by omission.
74  *
75  *      3. Altered versions must be plainly marked as such, and must not
76  *              be misrepresented as being the original software.
77  *
78  *
79  ****    Alterations to Henry's code are...
80  ****
81  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82  ****    2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
83  ****
84  ****    You may distribute under the terms of either the GNU General Public
85  ****    License or the Artistic License, as specified in the README file.
86
87  *
88  * Beware that some of this code is subtly aware of the way operator
89  * precedence is structured in regular expressions.  Serious changes in
90  * regular-expression syntax might require a total rethink.
91  */
92 #include "EXTERN.h"
93 #define PERL_IN_REGCOMP_C
94 #include "perl.h"
95
96 #ifndef PERL_IN_XSUB_RE
97 #  include "INTERN.h"
98 #endif
99
100 #define REG_COMP_C
101 #include "regcomp.h"
102
103 #ifdef op
104 #undef op
105 #endif /* op */
106
107 #ifdef MSDOS
108 #  if defined(BUGGY_MSC6)
109  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 #    pragma optimize("a",off)
111  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 #    pragma optimize("w",on )
113 #  endif /* BUGGY_MSC6 */
114 #endif /* MSDOS */
115
116 #ifndef STATIC
117 #define STATIC  static
118 #endif
119
120 typedef struct RExC_state_t {
121     U32         flags;                  /* are we folding, multilining? */
122     char        *precomp;               /* uncompiled string. */
123     regexp      *rx;
124     char        *start;                 /* Start of input for compile */
125     char        *end;                   /* End of input for compile */
126     char        *parse;                 /* Input-scan pointer. */
127     I32         whilem_seen;            /* number of WHILEM in this expr */
128     regnode     *emit_start;            /* Start of emitted-code area */
129     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
130     I32         naughty;                /* How bad is this pattern? */
131     I32         sawback;                /* Did we see \1, ...? */
132     U32         seen;
133     I32         size;                   /* Code size. */
134     I32         npar;                   /* () count. */
135     I32         extralen;
136     I32         seen_zerolen;
137     I32         seen_evals;
138     I32         utf8;
139 #if ADD_TO_REGEXEC
140     char        *starttry;              /* -Dr: where regtry was called. */
141 #define RExC_starttry   (pRExC_state->starttry)
142 #endif
143 } RExC_state_t;
144
145 #define RExC_flags      (pRExC_state->flags)
146 #define RExC_precomp    (pRExC_state->precomp)
147 #define RExC_rx         (pRExC_state->rx)
148 #define RExC_start      (pRExC_state->start)
149 #define RExC_end        (pRExC_state->end)
150 #define RExC_parse      (pRExC_state->parse)
151 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
152 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit       (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty    (pRExC_state->naughty)
156 #define RExC_sawback    (pRExC_state->sawback)
157 #define RExC_seen       (pRExC_state->seen)
158 #define RExC_size       (pRExC_state->size)
159 #define RExC_npar       (pRExC_state->npar)
160 #define RExC_extralen   (pRExC_state->extralen)
161 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8       (pRExC_state->utf8)
164
165 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167         ((*s) == '{' && regcurly(s)))
168
169 #ifdef SPSTART
170 #undef SPSTART          /* dratted cpp namespace... */
171 #endif
172 /*
173  * Flags to be passed up and down.
174  */
175 #define WORST           0       /* Worst case. */
176 #define HASWIDTH        0x1     /* Known to match non-null strings. */
177 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART         0x4     /* Starts with * or +. */
179 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
180
181 /* Length of a variant. */
182
183 typedef struct scan_data_t {
184     I32 len_min;
185     I32 len_delta;
186     I32 pos_min;
187     I32 pos_delta;
188     SV *last_found;
189     I32 last_end;                       /* min value, <0 unless valid. */
190     I32 last_start_min;
191     I32 last_start_max;
192     SV **longest;                       /* Either &l_fixed, or &l_float. */
193     SV *longest_fixed;
194     I32 offset_fixed;
195     SV *longest_float;
196     I32 offset_float_min;
197     I32 offset_float_max;
198     I32 flags;
199     I32 whilem_c;
200     I32 *last_closep;
201     struct regnode_charclass_class *start_class;
202 } scan_data_t;
203
204 /*
205  * Forward declarations for pregcomp()'s friends.
206  */
207
208 static const scan_data_t zero_scan_data =
209   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
210
211 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL          0x1
213 #define SF_BEFORE_MEOL          0x2
214 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
216
217 #ifdef NO_UNARY_PLUS
218 #  define SF_FIX_SHIFT_EOL      (0+2)
219 #  define SF_FL_SHIFT_EOL               (0+4)
220 #else
221 #  define SF_FIX_SHIFT_EOL      (+2)
222 #  define SF_FL_SHIFT_EOL               (+4)
223 #endif
224
225 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
227
228 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF               0x40
231 #define SF_HAS_PAR              0x80
232 #define SF_IN_PAR               0x100
233 #define SF_HAS_EVAL             0x200
234 #define SCF_DO_SUBSTR           0x400
235 #define SCF_DO_STCLASS_AND      0x0800
236 #define SCF_DO_STCLASS_OR       0x1000
237 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS  0x2000
239
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
243
244 #define OOB_UNICODE             12345678
245 #define OOB_NAMEDCLASS          -1
246
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
249
250
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
253
254 /*
255  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257  * op/pragma/warn/regcomp.
258  */
259 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
261
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
263
264 /*
265  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266  * arg. Show regex, up to a maximum length. If it's too long, chop and add
267  * "...".
268  */
269 #define FAIL(msg) STMT_START {                                          \
270     const char *ellipses = "";                                          \
271     IV len = RExC_end - RExC_precomp;                                   \
272                                                                         \
273     if (!SIZE_ONLY)                                                     \
274         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
275     if (len > RegexLengthToShowInErrorMessages) {                       \
276         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
277         len = RegexLengthToShowInErrorMessages - 10;                    \
278         ellipses = "...";                                               \
279     }                                                                   \
280     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
281             msg, (int)len, RExC_precomp, ellipses);                     \
282 } STMT_END
283
284 /*
285  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286  * args. Show regex, up to a maximum length. If it's too long, chop and add
287  * "...".
288  */
289 #define FAIL2(pat,msg) STMT_START {                                     \
290     const char *ellipses = "";                                          \
291     IV len = RExC_end - RExC_precomp;                                   \
292                                                                         \
293     if (!SIZE_ONLY)                                                     \
294         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
295     if (len > RegexLengthToShowInErrorMessages) {                       \
296         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
297         len = RegexLengthToShowInErrorMessages - 10;                    \
298         ellipses = "...";                                               \
299     }                                                                   \
300     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
301             msg, (int)len, RExC_precomp, ellipses);                     \
302 } STMT_END
303
304
305 /*
306  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
307  */
308 #define Simple_vFAIL(m) STMT_START {                                    \
309     const IV offset = RExC_parse - RExC_precomp;                        \
310     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
311             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
312 } STMT_END
313
314 /*
315  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
316  */
317 #define vFAIL(m) STMT_START {                           \
318     if (!SIZE_ONLY)                                     \
319         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
320     Simple_vFAIL(m);                                    \
321 } STMT_END
322
323 /*
324  * Like Simple_vFAIL(), but accepts two arguments.
325  */
326 #define Simple_vFAIL2(m,a1) STMT_START {                        \
327     const IV offset = RExC_parse - RExC_precomp;                        \
328     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
329             (int)offset, RExC_precomp, RExC_precomp + offset);  \
330 } STMT_END
331
332 /*
333  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
334  */
335 #define vFAIL2(m,a1) STMT_START {                       \
336     if (!SIZE_ONLY)                                     \
337         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
338     Simple_vFAIL2(m, a1);                               \
339 } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
346     const IV offset = RExC_parse - RExC_precomp;                \
347     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
348             (int)offset, RExC_precomp, RExC_precomp + offset);  \
349 } STMT_END
350
351 /*
352  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
353  */
354 #define vFAIL3(m,a1,a2) STMT_START {                    \
355     if (!SIZE_ONLY)                                     \
356         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
357     Simple_vFAIL3(m, a1, a2);                           \
358 } STMT_END
359
360 /*
361  * Like Simple_vFAIL(), but accepts four arguments.
362  */
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
364     const IV offset = RExC_parse - RExC_precomp;                \
365     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
366             (int)offset, RExC_precomp, RExC_precomp + offset);  \
367 } STMT_END
368
369 #define vWARN(loc,m) STMT_START {                                       \
370     const IV offset = loc - RExC_precomp;                               \
371     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
372             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
373 } STMT_END
374
375 #define vWARNdep(loc,m) STMT_START {                                    \
376     const IV offset = loc - RExC_precomp;                               \
377     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
378             "%s" REPORT_LOCATION,                                       \
379             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
380 } STMT_END
381
382
383 #define vWARN2(loc, m, a1) STMT_START {                                 \
384     const IV offset = loc - RExC_precomp;                               \
385     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
386             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
387 } STMT_END
388
389 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
390     const IV offset = loc - RExC_precomp;                               \
391     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
392             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
393 } STMT_END
394
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
396     const IV offset = loc - RExC_precomp;                               \
397     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
398             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
399 } STMT_END
400
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
402     const IV offset = loc - RExC_precomp;                               \
403     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
404             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
405 } STMT_END
406
407
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START {                  \
410     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
411 } STMT_END
412
413 /* Macros for recording node offsets.   20001227 mjd@plover.com 
414  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
415  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
416  * Element 0 holds the number n.
417  */
418
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
421
422
423 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
424     if (! SIZE_ONLY) {                                                  \
425         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
426                 __LINE__, (node), (byte)));                             \
427         if((node) < 0) {                                                \
428             Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
429         } else {                                                        \
430             RExC_offsets[2*(node)-1] = (byte);                          \
431         }                                                               \
432     }                                                                   \
433 } STMT_END
434
435 #define Set_Node_Offset(node,byte) \
436     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
438
439 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
440     if (! SIZE_ONLY) {                                                  \
441         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
442                 __LINE__, (int)(node), (int)(len)));                    \
443         if((node) < 0) {                                                \
444             Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
445         } else {                                                        \
446             RExC_offsets[2*(node)] = (len);                             \
447         }                                                               \
448     }                                                                   \
449 } STMT_END
450
451 #define Set_Node_Length(node,len) \
452     Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455     Set_Node_Length(node, RExC_parse - parse_start)
456
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
460
461 static void clear_re(pTHX_ void *r);
462
463 /* Mark that we cannot extend a found fixed substring at this point.
464    Updata the longest found anchored substring and the longest found
465    floating substrings if needed. */
466
467 STATIC void
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
469 {
470     const STRLEN l = CHR_SVLEN(data->last_found);
471     const STRLEN old_l = CHR_SVLEN(*data->longest);
472
473     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474         SvSetMagicSV(*data->longest, data->last_found);
475         if (*data->longest == data->longest_fixed) {
476             data->offset_fixed = l ? data->last_start_min : data->pos_min;
477             if (data->flags & SF_BEFORE_EOL)
478                 data->flags
479                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
480             else
481                 data->flags &= ~SF_FIX_BEFORE_EOL;
482         }
483         else {
484             data->offset_float_min = l ? data->last_start_min : data->pos_min;
485             data->offset_float_max = (l
486                                       ? data->last_start_max
487                                       : data->pos_min + data->pos_delta);
488             if ((U32)data->offset_float_max > (U32)I32_MAX)
489                 data->offset_float_max = I32_MAX;
490             if (data->flags & SF_BEFORE_EOL)
491                 data->flags
492                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
493             else
494                 data->flags &= ~SF_FL_BEFORE_EOL;
495         }
496     }
497     SvCUR_set(data->last_found, 0);
498     {
499         SV * const sv = data->last_found;
500         MAGIC * const mg =
501             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502         if (mg)
503             mg->mg_len = 0;
504     }
505     data->last_end = -1;
506     data->flags &= ~SF_BEFORE_EOL;
507 }
508
509 /* Can match anything (initialization) */
510 STATIC void
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
512 {
513     ANYOF_CLASS_ZERO(cl);
514     ANYOF_BITMAP_SETALL(cl);
515     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
516     if (LOC)
517         cl->flags |= ANYOF_LOCALE;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC int
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
523 {
524     int value;
525
526     for (value = 0; value <= ANYOF_MAX; value += 2)
527         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
528             return 1;
529     if (!(cl->flags & ANYOF_UNICODE_ALL))
530         return 0;
531     if (!ANYOF_BITMAP_TESTALLSET(cl))
532         return 0;
533     return 1;
534 }
535
536 /* Can match anything (initialization) */
537 STATIC void
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
539 {
540     Zero(cl, 1, struct regnode_charclass_class);
541     cl->type = ANYOF;
542     cl_anything(pRExC_state, cl);
543 }
544
545 STATIC void
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
547 {
548     Zero(cl, 1, struct regnode_charclass_class);
549     cl->type = ANYOF;
550     cl_anything(pRExC_state, cl);
551     if (LOC)
552         cl->flags |= ANYOF_LOCALE;
553 }
554
555 /* 'And' a given class with another one.  Can create false positives */
556 /* We assume that cl is not inverted */
557 STATIC void
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559         const struct regnode_charclass_class *and_with)
560 {
561     if (!(and_with->flags & ANYOF_CLASS)
562         && !(cl->flags & ANYOF_CLASS)
563         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564         && !(and_with->flags & ANYOF_FOLD)
565         && !(cl->flags & ANYOF_FOLD)) {
566         int i;
567
568         if (and_with->flags & ANYOF_INVERT)
569             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570                 cl->bitmap[i] &= ~and_with->bitmap[i];
571         else
572             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573                 cl->bitmap[i] &= and_with->bitmap[i];
574     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575     if (!(and_with->flags & ANYOF_EOS))
576         cl->flags &= ~ANYOF_EOS;
577
578     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579         !(and_with->flags & ANYOF_INVERT)) {
580         cl->flags &= ~ANYOF_UNICODE_ALL;
581         cl->flags |= ANYOF_UNICODE;
582         ARG_SET(cl, ARG(and_with));
583     }
584     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585         !(and_with->flags & ANYOF_INVERT))
586         cl->flags &= ~ANYOF_UNICODE_ALL;
587     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588         !(and_with->flags & ANYOF_INVERT))
589         cl->flags &= ~ANYOF_UNICODE;
590 }
591
592 /* 'OR' a given class with another one.  Can create false positives */
593 /* We assume that cl is not inverted */
594 STATIC void
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
596 {
597     if (or_with->flags & ANYOF_INVERT) {
598         /* We do not use
599          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600          *   <= (B1 | !B2) | (CL1 | !CL2)
601          * which is wasteful if CL2 is small, but we ignore CL2:
602          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603          * XXXX Can we handle case-fold?  Unclear:
604          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
606          */
607         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608              && !(or_with->flags & ANYOF_FOLD)
609              && !(cl->flags & ANYOF_FOLD) ) {
610             int i;
611
612             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613                 cl->bitmap[i] |= ~or_with->bitmap[i];
614         } /* XXXX: logic is complicated otherwise */
615         else {
616             cl_anything(pRExC_state, cl);
617         }
618     } else {
619         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621              && (!(or_with->flags & ANYOF_FOLD)
622                  || (cl->flags & ANYOF_FOLD)) ) {
623             int i;
624
625             /* OR char bitmap and class bitmap separately */
626             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627                 cl->bitmap[i] |= or_with->bitmap[i];
628             if (or_with->flags & ANYOF_CLASS) {
629                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630                     cl->classflags[i] |= or_with->classflags[i];
631                 cl->flags |= ANYOF_CLASS;
632             }
633         }
634         else { /* XXXX: logic is complicated, leave it along for a moment. */
635             cl_anything(pRExC_state, cl);
636         }
637     }
638     if (or_with->flags & ANYOF_EOS)
639         cl->flags |= ANYOF_EOS;
640
641     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642         ARG(cl) != ARG(or_with)) {
643         cl->flags |= ANYOF_UNICODE_ALL;
644         cl->flags &= ~ANYOF_UNICODE;
645     }
646     if (or_with->flags & ANYOF_UNICODE_ALL) {
647         cl->flags |= ANYOF_UNICODE_ALL;
648         cl->flags &= ~ANYOF_UNICODE;
649     }
650 }
651
652 /*
653  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
654  * These need to be revisited when a newer toolchain becomes available.
655  */
656 #if defined(__sparc64__) && defined(__GNUC__)
657 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
658 #       undef  SPARC64_GCC_WORKAROUND
659 #       define SPARC64_GCC_WORKAROUND 1
660 #   endif
661 #endif
662
663 /* REx optimizer.  Converts nodes into quickier variants "in place".
664    Finds fixed substrings.  */
665
666 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
667    to the position after last scanned or to NULL. */
668
669 STATIC I32
670 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
671                         /* scanp: Start here (read-write). */
672                         /* deltap: Write maxlen-minlen here. */
673                         /* last: Stop before this one. */
674 {
675     I32 min = 0, pars = 0, code;
676     regnode *scan = *scanp, *next;
677     I32 delta = 0;
678     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
679     int is_inf_internal = 0;            /* The studied chunk is infinite */
680     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
681     scan_data_t data_fake;
682     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
683
684     while (scan && OP(scan) != END && scan < last) {
685         /* Peephole optimizer: */
686
687         if (PL_regkind[(U8)OP(scan)] == EXACT) {
688             /* Merge several consecutive EXACTish nodes into one. */
689             regnode *n = regnext(scan);
690             U32 stringok = 1;
691 #ifdef DEBUGGING
692             regnode *stop = scan;
693 #endif
694
695             next = scan + NODE_SZ_STR(scan);
696             /* Skip NOTHING, merge EXACT*. */
697             while (n &&
698                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
699                      (stringok && (OP(n) == OP(scan))))
700                    && NEXT_OFF(n)
701                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
702                 if (OP(n) == TAIL || n > next)
703                     stringok = 0;
704                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
705                     NEXT_OFF(scan) += NEXT_OFF(n);
706                     next = n + NODE_STEP_REGNODE;
707 #ifdef DEBUGGING
708                     if (stringok)
709                         stop = n;
710 #endif
711                     n = regnext(n);
712                 }
713                 else if (stringok) {
714                     const int oldl = STR_LEN(scan);
715                     regnode *nnext = regnext(n);
716
717                     if (oldl + STR_LEN(n) > U8_MAX)
718                         break;
719                     NEXT_OFF(scan) += NEXT_OFF(n);
720                     STR_LEN(scan) += STR_LEN(n);
721                     next = n + NODE_SZ_STR(n);
722                     /* Now we can overwrite *n : */
723                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
724 #ifdef DEBUGGING
725                     stop = next - 1;
726 #endif
727                     n = nnext;
728                 }
729             }
730
731             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
732 /*
733   Two problematic code points in Unicode casefolding of EXACT nodes:
734
735    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
736    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
737
738    which casefold to
739
740    Unicode                      UTF-8
741
742    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
743    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
744
745    This means that in case-insensitive matching (or "loose matching",
746    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
747    length of the above casefolded versions) can match a target string
748    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
749    This would rather mess up the minimum length computation.
750
751    What we'll do is to look for the tail four bytes, and then peek
752    at the preceding two bytes to see whether we need to decrease
753    the minimum length by four (six minus two).
754
755    Thanks to the design of UTF-8, there cannot be false matches:
756    A sequence of valid UTF-8 bytes cannot be a subsequence of
757    another valid sequence of UTF-8 bytes.
758
759 */
760                  char *s0 = STRING(scan), *s, *t;
761                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
762                  const char * const t0 = "\xcc\x88\xcc\x81";
763                  const char * const t1 = t0 + 3;
764
765                  for (s = s0 + 2;
766                       s < s2 && (t = ninstr(s, s1, t0, t1));
767                       s = t + 4) {
768                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
769                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
770                            min -= 4;
771                  }
772             }
773
774 #ifdef DEBUGGING
775             /* Allow dumping */
776             n = scan + NODE_SZ_STR(scan);
777             while (n <= stop) {
778                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
779                     OP(n) = OPTIMIZED;
780                     NEXT_OFF(n) = 0;
781                 }
782                 n++;
783             }
784 #endif
785         }
786         /* Follow the next-chain of the current node and optimize
787            away all the NOTHINGs from it.  */
788         if (OP(scan) != CURLYX) {
789             const int max = (reg_off_by_arg[OP(scan)]
790                        ? I32_MAX
791                        /* I32 may be smaller than U16 on CRAYs! */
792                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
793             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
794             int noff;
795             regnode *n = scan;
796         
797             /* Skip NOTHING and LONGJMP. */
798             while ((n = regnext(n))
799                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
800                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
801                    && off + noff < max)
802                 off += noff;
803             if (reg_off_by_arg[OP(scan)])
804                 ARG(scan) = off;
805             else
806                 NEXT_OFF(scan) = off;
807         }
808         /* The principal pseudo-switch.  Cannot be a switch, since we
809            look into several different things.  */
810         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
811                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
812             next = regnext(scan);
813             code = OP(scan);
814         
815             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
816                 I32 max1 = 0, min1 = I32_MAX, num = 0;
817                 struct regnode_charclass_class accum;
818                 
819                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
820                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
821                 if (flags & SCF_DO_STCLASS)
822                     cl_init_zero(pRExC_state, &accum);
823                 while (OP(scan) == code) {
824                     I32 deltanext, minnext, f = 0, fake;
825                     struct regnode_charclass_class this_class;
826
827                     num++;
828                     data_fake.flags = 0;
829                     if (data) {         
830                         data_fake.whilem_c = data->whilem_c;
831                         data_fake.last_closep = data->last_closep;
832                     }
833                     else
834                         data_fake.last_closep = &fake;
835                     next = regnext(scan);
836                     scan = NEXTOPER(scan);
837                     if (code != BRANCH)
838                         scan = NEXTOPER(scan);
839                     if (flags & SCF_DO_STCLASS) {
840                         cl_init(pRExC_state, &this_class);
841                         data_fake.start_class = &this_class;
842                         f = SCF_DO_STCLASS_AND;
843                     }           
844                     if (flags & SCF_WHILEM_VISITED_POS)
845                         f |= SCF_WHILEM_VISITED_POS;
846                     /* we suppose the run is continuous, last=next...*/
847                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
848                                           next, &data_fake, f);
849                     if (min1 > minnext)
850                         min1 = minnext;
851                     if (max1 < minnext + deltanext)
852                         max1 = minnext + deltanext;
853                     if (deltanext == I32_MAX)
854                         is_inf = is_inf_internal = 1;
855                     scan = next;
856                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
857                         pars++;
858                     if (data && (data_fake.flags & SF_HAS_EVAL))
859                         data->flags |= SF_HAS_EVAL;
860                     if (data)
861                         data->whilem_c = data_fake.whilem_c;
862                     if (flags & SCF_DO_STCLASS)
863                         cl_or(pRExC_state, &accum, &this_class);
864                     if (code == SUSPEND)
865                         break;
866                 }
867                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
868                     min1 = 0;
869                 if (flags & SCF_DO_SUBSTR) {
870                     data->pos_min += min1;
871                     data->pos_delta += max1 - min1;
872                     if (max1 != min1 || is_inf)
873                         data->longest = &(data->longest_float);
874                 }
875                 min += min1;
876                 delta += max1 - min1;
877                 if (flags & SCF_DO_STCLASS_OR) {
878                     cl_or(pRExC_state, data->start_class, &accum);
879                     if (min1) {
880                         cl_and(data->start_class, &and_with);
881                         flags &= ~SCF_DO_STCLASS;
882                     }
883                 }
884                 else if (flags & SCF_DO_STCLASS_AND) {
885                     if (min1) {
886                         cl_and(data->start_class, &accum);
887                         flags &= ~SCF_DO_STCLASS;
888                     }
889                     else {
890                         /* Switch to OR mode: cache the old value of
891                          * data->start_class */
892                         StructCopy(data->start_class, &and_with,
893                                    struct regnode_charclass_class);
894                         flags &= ~SCF_DO_STCLASS_AND;
895                         StructCopy(&accum, data->start_class,
896                                    struct regnode_charclass_class);
897                         flags |= SCF_DO_STCLASS_OR;
898                         data->start_class->flags |= ANYOF_EOS;
899                     }
900                 }
901
902             }
903             else if (code == BRANCHJ)   /* single branch is optimized. */
904                 scan = NEXTOPER(NEXTOPER(scan));
905             else                        /* single branch is optimized. */
906                 scan = NEXTOPER(scan);
907             continue;
908         }
909         else if (OP(scan) == EXACT) {
910             I32 l = STR_LEN(scan);
911             UV uc = *((U8*)STRING(scan));
912             if (UTF) {
913                 const U8 * const s = (U8*)STRING(scan);
914                 l = utf8_length((U8 *)s, (U8 *)s + l);
915                 uc = utf8_to_uvchr((U8 *)s, NULL);
916             }
917             min += l;
918             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919                 /* The code below prefers earlier match for fixed
920                    offset, later match for variable offset.  */
921                 if (data->last_end == -1) { /* Update the start info. */
922                     data->last_start_min = data->pos_min;
923                     data->last_start_max = is_inf
924                         ? I32_MAX : data->pos_min + data->pos_delta;
925                 }
926                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
927                 if (UTF)
928                     SvUTF8_on(data->last_found);
929                 {
930                     SV * const sv = data->last_found;
931                     MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
932                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
933                     if (mg && mg->mg_len >= 0)
934                         mg->mg_len += utf8_length((U8*)STRING(scan),
935                                                   (U8*)STRING(scan)+STR_LEN(scan));
936                 }
937                 data->last_end = data->pos_min + l;
938                 data->pos_min += l; /* As in the first entry. */
939                 data->flags &= ~SF_BEFORE_EOL;
940             }
941             if (flags & SCF_DO_STCLASS_AND) {
942                 /* Check whether it is compatible with what we know already! */
943                 int compat = 1;
944
945                 if (uc >= 0x100 ||
946                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
948                     && (!(data->start_class->flags & ANYOF_FOLD)
949                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
950                     )
951                     compat = 0;
952                 ANYOF_CLASS_ZERO(data->start_class);
953                 ANYOF_BITMAP_ZERO(data->start_class);
954                 if (compat)
955                     ANYOF_BITMAP_SET(data->start_class, uc);
956                 data->start_class->flags &= ~ANYOF_EOS;
957                 if (uc < 0x100)
958                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959             }
960             else if (flags & SCF_DO_STCLASS_OR) {
961                 /* false positive possible if the class is case-folded */
962                 if (uc < 0x100)
963                     ANYOF_BITMAP_SET(data->start_class, uc);
964                 else
965                     data->start_class->flags |= ANYOF_UNICODE_ALL;
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 cl_and(data->start_class, &and_with);
968             }
969             flags &= ~SCF_DO_STCLASS;
970         }
971         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972             I32 l = STR_LEN(scan);
973             UV uc = *((U8*)STRING(scan));
974
975             /* Search for fixed substrings supports EXACT only. */
976             if (flags & SCF_DO_SUBSTR)
977                 scan_commit(pRExC_state, data);
978             if (UTF) {
979                 U8 *s = (U8 *)STRING(scan);
980                 l = utf8_length(s, s + l);
981                 uc = utf8_to_uvchr(s, NULL);
982             }
983             min += l;
984             if (data && (flags & SCF_DO_SUBSTR))
985                 data->pos_min += l;
986             if (flags & SCF_DO_STCLASS_AND) {
987                 /* Check whether it is compatible with what we know already! */
988                 int compat = 1;
989
990                 if (uc >= 0x100 ||
991                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
993                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994                     compat = 0;
995                 ANYOF_CLASS_ZERO(data->start_class);
996                 ANYOF_BITMAP_ZERO(data->start_class);
997                 if (compat) {
998                     ANYOF_BITMAP_SET(data->start_class, uc);
999                     data->start_class->flags &= ~ANYOF_EOS;
1000                     data->start_class->flags |= ANYOF_FOLD;
1001                     if (OP(scan) == EXACTFL)
1002                         data->start_class->flags |= ANYOF_LOCALE;
1003                 }
1004             }
1005             else if (flags & SCF_DO_STCLASS_OR) {
1006                 if (data->start_class->flags & ANYOF_FOLD) {
1007                     /* false positive possible if the class is case-folded.
1008                        Assume that the locale settings are the same... */
1009                     if (uc < 0x100)
1010                         ANYOF_BITMAP_SET(data->start_class, uc);
1011                     data->start_class->flags &= ~ANYOF_EOS;
1012                 }
1013                 cl_and(data->start_class, &and_with);
1014             }
1015             flags &= ~SCF_DO_STCLASS;
1016         }
1017         else if (strchr((const char*)PL_varies,OP(scan))) {
1018             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019             I32 f = flags, pos_before = 0;
1020             regnode *oscan = scan;
1021             struct regnode_charclass_class this_class;
1022             struct regnode_charclass_class *oclass = NULL;
1023             I32 next_is_eval = 0;
1024
1025             switch (PL_regkind[(U8)OP(scan)]) {
1026             case WHILEM:                /* End of (?:...)* . */
1027                 scan = NEXTOPER(scan);
1028                 goto finish;
1029             case PLUS:
1030                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031                     next = NEXTOPER(scan);
1032                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033                         mincount = 1;
1034                         maxcount = REG_INFTY;
1035                         next = regnext(scan);
1036                         scan = NEXTOPER(scan);
1037                         goto do_curly;
1038                     }
1039                 }
1040                 if (flags & SCF_DO_SUBSTR)
1041                     data->pos_min++;
1042                 min++;
1043                 /* Fall through. */
1044             case STAR:
1045                 if (flags & SCF_DO_STCLASS) {
1046                     mincount = 0;
1047                     maxcount = REG_INFTY;
1048                     next = regnext(scan);
1049                     scan = NEXTOPER(scan);
1050                     goto do_curly;
1051                 }
1052                 is_inf = is_inf_internal = 1;
1053                 scan = regnext(scan);
1054                 if (flags & SCF_DO_SUBSTR) {
1055                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056                     data->longest = &(data->longest_float);
1057                 }
1058                 goto optimize_curly_tail;
1059             case CURLY:
1060                 mincount = ARG1(scan);
1061                 maxcount = ARG2(scan);
1062                 next = regnext(scan);
1063                 if (OP(scan) == CURLYX) {
1064                     I32 lp = (data ? *(data->last_closep) : 0);
1065
1066                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067                 }
1068                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069                 next_is_eval = (OP(scan) == EVAL);
1070               do_curly:
1071                 if (flags & SCF_DO_SUBSTR) {
1072                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073                     pos_before = data->pos_min;
1074                 }
1075                 if (data) {
1076                     fl = data->flags;
1077                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078                     if (is_inf)
1079                         data->flags |= SF_IS_INF;
1080                 }
1081                 if (flags & SCF_DO_STCLASS) {
1082                     cl_init(pRExC_state, &this_class);
1083                     oclass = data->start_class;
1084                     data->start_class = &this_class;
1085                     f |= SCF_DO_STCLASS_AND;
1086                     f &= ~SCF_DO_STCLASS_OR;
1087                 }
1088                 /* These are the cases when once a subexpression
1089                    fails at a particular position, it cannot succeed
1090                    even after backtracking at the enclosing scope.
1091                 
1092                    XXXX what if minimal match and we are at the
1093                         initial run of {n,m}? */
1094                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095                     f &= ~SCF_WHILEM_VISITED_POS;
1096
1097                 /* This will finish on WHILEM, setting scan, or on NULL: */
1098                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099                                       mincount == 0
1100                                         ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102                 if (flags & SCF_DO_STCLASS)
1103                     data->start_class = oclass;
1104                 if (mincount == 0 || minnext == 0) {
1105                     if (flags & SCF_DO_STCLASS_OR) {
1106                         cl_or(pRExC_state, data->start_class, &this_class);
1107                     }
1108                     else if (flags & SCF_DO_STCLASS_AND) {
1109                         /* Switch to OR mode: cache the old value of
1110                          * data->start_class */
1111                         StructCopy(data->start_class, &and_with,
1112                                    struct regnode_charclass_class);
1113                         flags &= ~SCF_DO_STCLASS_AND;
1114                         StructCopy(&this_class, data->start_class,
1115                                    struct regnode_charclass_class);
1116                         flags |= SCF_DO_STCLASS_OR;
1117                         data->start_class->flags |= ANYOF_EOS;
1118                     }
1119                 } else {                /* Non-zero len */
1120                     if (flags & SCF_DO_STCLASS_OR) {
1121                         cl_or(pRExC_state, data->start_class, &this_class);
1122                         cl_and(data->start_class, &and_with);
1123                     }
1124                     else if (flags & SCF_DO_STCLASS_AND)
1125                         cl_and(data->start_class, &this_class);
1126                     flags &= ~SCF_DO_STCLASS;
1127                 }
1128                 if (!scan)              /* It was not CURLYX, but CURLY. */
1129                     scan = next;
1130                 if ( /* ? quantifier ok, except for (?{ ... }) */
1131                     (next_is_eval || !(mincount == 0 && maxcount == 1))
1132                     && (minnext == 0) && (deltanext == 0)
1133                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134                     && maxcount <= REG_INFTY/3 /* Complement check for big count */
1135                     && ckWARN(WARN_REGEXP))
1136                 {
1137                     vWARN(RExC_parse,
1138                           "Quantifier unexpected on zero-length expression");
1139                 }
1140
1141                 min += minnext * mincount;
1142                 is_inf_internal |= ((maxcount == REG_INFTY
1143                                      && (minnext + deltanext) > 0)
1144                                     || deltanext == I32_MAX);
1145                 is_inf |= is_inf_internal;
1146                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148                 /* Try powerful optimization CURLYX => CURLYN. */
1149                 if (  OP(oscan) == CURLYX && data
1150                       && data->flags & SF_IN_PAR
1151                       && !(data->flags & SF_HAS_EVAL)
1152                       && !deltanext && minnext == 1 ) {
1153                     /* Try to optimize to CURLYN.  */
1154                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155                     regnode *nxt1 = nxt;
1156 #ifdef DEBUGGING
1157                     regnode *nxt2;
1158 #endif
1159
1160                     /* Skip open. */
1161                     nxt = regnext(nxt);
1162                     if (!strchr((const char*)PL_simple,OP(nxt))
1163                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164                              && STR_LEN(nxt) == 1))
1165                         goto nogo;
1166 #ifdef DEBUGGING
1167                     nxt2 = nxt;
1168 #endif
1169                     nxt = regnext(nxt);
1170                     if (OP(nxt) != CLOSE)
1171                         goto nogo;
1172                     /* Now we know that nxt2 is the only contents: */
1173                     oscan->flags = (U8)ARG(nxt);
1174                     OP(oscan) = CURLYN;
1175                     OP(nxt1) = NOTHING; /* was OPEN. */
1176 #ifdef DEBUGGING
1177                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1181                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1182                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183 #endif
1184                 }
1185               nogo:
1186
1187                 /* Try optimization CURLYX => CURLYM. */
1188                 if (  OP(oscan) == CURLYX && data
1189                       && !(data->flags & SF_HAS_PAR)
1190                       && !(data->flags & SF_HAS_EVAL)
1191                       && !deltanext     /* atom is fixed width */
1192                       && minnext != 0   /* CURLYM can't handle zero width */
1193                 ) {
1194                     /* XXXX How to optimize if data == 0? */
1195                     /* Optimize to a simpler form.  */
1196                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197                     regnode *nxt2;
1198
1199                     OP(oscan) = CURLYM;
1200                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1201                             && (OP(nxt2) != WHILEM))
1202                         nxt = nxt2;
1203                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1204                     /* Need to optimize away parenths. */
1205                     if (data->flags & SF_IN_PAR) {
1206                         /* Set the parenth number.  */
1207                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1208
1209                         if (OP(nxt) != CLOSE)
1210                             FAIL("Panic opt close");
1211                         oscan->flags = (U8)ARG(nxt);
1212                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1213                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1214 #ifdef DEBUGGING
1215                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1217                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1219 #endif
1220 #if 0
1221                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222                             regnode *nnxt = regnext(nxt1);
1223                         
1224                             if (nnxt == nxt) {
1225                                 if (reg_off_by_arg[OP(nxt1)])
1226                                     ARG_SET(nxt1, nxt2 - nxt1);
1227                                 else if (nxt2 - nxt1 < U16_MAX)
1228                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1229                                 else
1230                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1231                             }
1232                             nxt1 = nnxt;
1233                         }
1234 #endif
1235                         /* Optimize again: */
1236                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1237                                     NULL, 0);
1238                     }
1239                     else
1240                         oscan->flags = 0;
1241                 }
1242                 else if ((OP(oscan) == CURLYX)
1243                          && (flags & SCF_WHILEM_VISITED_POS)
1244                          /* See the comment on a similar expression above.
1245                             However, this time it not a subexpression
1246                             we care about, but the expression itself. */
1247                          && (maxcount == REG_INFTY)
1248                          && data && ++data->whilem_c < 16) {
1249                     /* This stays as CURLYX, we can put the count/of pair. */
1250                     /* Find WHILEM (as in regexec.c) */
1251                     regnode *nxt = oscan + NEXT_OFF(oscan);
1252
1253                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1254                         nxt += ARG(nxt);
1255                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1257                 }
1258                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1259                     pars++;
1260                 if (flags & SCF_DO_SUBSTR) {
1261                     SV *last_str = Nullsv;
1262                     int counted = mincount != 0;
1263
1264                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1265 #if defined(SPARC64_GCC_WORKAROUND)
1266                         I32 b = 0;
1267                         STRLEN l = 0;
1268                         const char *s = NULL;
1269                         I32 old = 0;
1270
1271                         if (pos_before >= data->last_start_min)
1272                             b = pos_before;
1273                         else
1274                             b = data->last_start_min;
1275
1276                         l = 0;
1277                         s = SvPV_const(data->last_found, l);
1278                         old = b - data->last_start_min;
1279
1280 #else
1281                         I32 b = pos_before >= data->last_start_min
1282                             ? pos_before : data->last_start_min;
1283                         STRLEN l;
1284                         const char *s = SvPV_const(data->last_found, l);
1285                         I32 old = b - data->last_start_min;
1286 #endif
1287
1288                         if (UTF)
1289                             old = utf8_hop((U8*)s, old) - (U8*)s;
1290                         
1291                         l -= old;
1292                         /* Get the added string: */
1293                         last_str = newSVpvn(s  + old, l);
1294                         if (UTF)
1295                             SvUTF8_on(last_str);
1296                         if (deltanext == 0 && pos_before == b) {
1297                             /* What was added is a constant string */
1298                             if (mincount > 1) {
1299                                 SvGROW(last_str, (mincount * l) + 1);
1300                                 repeatcpy(SvPVX(last_str) + l,
1301                                           SvPVX_const(last_str), l, mincount - 1);
1302                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1303                                 /* Add additional parts. */
1304                                 SvCUR_set(data->last_found,
1305                                           SvCUR(data->last_found) - l);
1306                                 sv_catsv(data->last_found, last_str);
1307                                 {
1308                                     SV * sv = data->last_found;
1309                                     MAGIC *mg =
1310                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1311                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312                                     if (mg && mg->mg_len >= 0)
1313                                         mg->mg_len += CHR_SVLEN(last_str);
1314                                 }
1315                                 data->last_end += l * (mincount - 1);
1316                             }
1317                         } else {
1318                             /* start offset must point into the last copy */
1319                             data->last_start_min += minnext * (mincount - 1);
1320                             data->last_start_max += is_inf ? I32_MAX
1321                                 : (maxcount - 1) * (minnext + data->pos_delta);
1322                         }
1323                     }
1324                     /* It is counted once already... */
1325                     data->pos_min += minnext * (mincount - counted);
1326                     data->pos_delta += - counted * deltanext +
1327                         (minnext + deltanext) * maxcount - minnext * mincount;
1328                     if (mincount != maxcount) {
1329                          /* Cannot extend fixed substrings found inside
1330                             the group.  */
1331                         scan_commit(pRExC_state,data);
1332                         if (mincount && last_str) {
1333                             SV *sv = data->last_found;
1334                             MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1335                                 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1336
1337                             if (mg)
1338                                 mg->mg_len = -1;
1339                             sv_setsv(sv, last_str);
1340                             data->last_end = data->pos_min;
1341                             data->last_start_min =
1342                                 data->pos_min - CHR_SVLEN(last_str);
1343                             data->last_start_max = is_inf
1344                                 ? I32_MAX
1345                                 : data->pos_min + data->pos_delta
1346                                 - CHR_SVLEN(last_str);
1347                         }
1348                         data->longest = &(data->longest_float);
1349                     }
1350                     SvREFCNT_dec(last_str);
1351                 }
1352                 if (data && (fl & SF_HAS_EVAL))
1353                     data->flags |= SF_HAS_EVAL;
1354               optimize_curly_tail:
1355                 if (OP(oscan) != CURLYX) {
1356                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1357                            && NEXT_OFF(next))
1358                         NEXT_OFF(oscan) += NEXT_OFF(next);
1359                 }
1360                 continue;
1361             default:                    /* REF and CLUMP only? */
1362                 if (flags & SCF_DO_SUBSTR) {
1363                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1364                     data->longest = &(data->longest_float);
1365                 }
1366                 is_inf = is_inf_internal = 1;
1367                 if (flags & SCF_DO_STCLASS_OR)
1368                     cl_anything(pRExC_state, data->start_class);
1369                 flags &= ~SCF_DO_STCLASS;
1370                 break;
1371             }
1372         }
1373         else if (strchr((const char*)PL_simple,OP(scan))) {
1374             int value = 0;
1375
1376             if (flags & SCF_DO_SUBSTR) {
1377                 scan_commit(pRExC_state,data);
1378                 data->pos_min++;
1379             }
1380             min++;
1381             if (flags & SCF_DO_STCLASS) {
1382                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1383
1384                 /* Some of the logic below assumes that switching
1385                    locale on will only add false positives. */
1386                 switch (PL_regkind[(U8)OP(scan)]) {
1387                 case SANY:
1388                 default:
1389                   do_default:
1390                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1391                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1392                         cl_anything(pRExC_state, data->start_class);
1393                     break;
1394                 case REG_ANY:
1395                     if (OP(scan) == SANY)
1396                         goto do_default;
1397                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1398                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1399                                  || (data->start_class->flags & ANYOF_CLASS));
1400                         cl_anything(pRExC_state, data->start_class);
1401                     }
1402                     if (flags & SCF_DO_STCLASS_AND || !value)
1403                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1404                     break;
1405                 case ANYOF:
1406                     if (flags & SCF_DO_STCLASS_AND)
1407                         cl_and(data->start_class,
1408                                (struct regnode_charclass_class*)scan);
1409                     else
1410                         cl_or(pRExC_state, data->start_class,
1411                               (struct regnode_charclass_class*)scan);
1412                     break;
1413                 case ALNUM:
1414                     if (flags & SCF_DO_STCLASS_AND) {
1415                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1416                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417                             for (value = 0; value < 256; value++)
1418                                 if (!isALNUM(value))
1419                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1420                         }
1421                     }
1422                     else {
1423                         if (data->start_class->flags & ANYOF_LOCALE)
1424                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1425                         else {
1426                             for (value = 0; value < 256; value++)
1427                                 if (isALNUM(value))
1428                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1429                         }
1430                     }
1431                     break;
1432                 case ALNUML:
1433                     if (flags & SCF_DO_STCLASS_AND) {
1434                         if (data->start_class->flags & ANYOF_LOCALE)
1435                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1436                     }
1437                     else {
1438                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1439                         data->start_class->flags |= ANYOF_LOCALE;
1440                     }
1441                     break;
1442                 case NALNUM:
1443                     if (flags & SCF_DO_STCLASS_AND) {
1444                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1445                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446                             for (value = 0; value < 256; value++)
1447                                 if (isALNUM(value))
1448                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1449                         }
1450                     }
1451                     else {
1452                         if (data->start_class->flags & ANYOF_LOCALE)
1453                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1454                         else {
1455                             for (value = 0; value < 256; value++)
1456                                 if (!isALNUM(value))
1457                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1458                         }
1459                     }
1460                     break;
1461                 case NALNUML:
1462                     if (flags & SCF_DO_STCLASS_AND) {
1463                         if (data->start_class->flags & ANYOF_LOCALE)
1464                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1465                     }
1466                     else {
1467                         data->start_class->flags |= ANYOF_LOCALE;
1468                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1469                     }
1470                     break;
1471                 case SPACE:
1472                     if (flags & SCF_DO_STCLASS_AND) {
1473                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1474                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475                             for (value = 0; value < 256; value++)
1476                                 if (!isSPACE(value))
1477                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1478                         }
1479                     }
1480                     else {
1481                         if (data->start_class->flags & ANYOF_LOCALE)
1482                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1483                         else {
1484                             for (value = 0; value < 256; value++)
1485                                 if (isSPACE(value))
1486                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1487                         }
1488                     }
1489                     break;
1490                 case SPACEL:
1491                     if (flags & SCF_DO_STCLASS_AND) {
1492                         if (data->start_class->flags & ANYOF_LOCALE)
1493                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1494                     }
1495                     else {
1496                         data->start_class->flags |= ANYOF_LOCALE;
1497                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1498                     }
1499                     break;
1500                 case NSPACE:
1501                     if (flags & SCF_DO_STCLASS_AND) {
1502                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1503                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504                             for (value = 0; value < 256; value++)
1505                                 if (isSPACE(value))
1506                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1507                         }
1508                     }
1509                     else {
1510                         if (data->start_class->flags & ANYOF_LOCALE)
1511                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1512                         else {
1513                             for (value = 0; value < 256; value++)
1514                                 if (!isSPACE(value))
1515                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1516                         }
1517                     }
1518                     break;
1519                 case NSPACEL:
1520                     if (flags & SCF_DO_STCLASS_AND) {
1521                         if (data->start_class->flags & ANYOF_LOCALE) {
1522                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1523                             for (value = 0; value < 256; value++)
1524                                 if (!isSPACE(value))
1525                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1526                         }
1527                     }
1528                     else {
1529                         data->start_class->flags |= ANYOF_LOCALE;
1530                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1531                     }
1532                     break;
1533                 case DIGIT:
1534                     if (flags & SCF_DO_STCLASS_AND) {
1535                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1536                         for (value = 0; value < 256; value++)
1537                             if (!isDIGIT(value))
1538                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539                     }
1540                     else {
1541                         if (data->start_class->flags & ANYOF_LOCALE)
1542                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1543                         else {
1544                             for (value = 0; value < 256; value++)
1545                                 if (isDIGIT(value))
1546                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1547                         }
1548                     }
1549                     break;
1550                 case NDIGIT:
1551                     if (flags & SCF_DO_STCLASS_AND) {
1552                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1553                         for (value = 0; value < 256; value++)
1554                             if (isDIGIT(value))
1555                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1556                     }
1557                     else {
1558                         if (data->start_class->flags & ANYOF_LOCALE)
1559                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1560                         else {
1561                             for (value = 0; value < 256; value++)
1562                                 if (!isDIGIT(value))
1563                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1564                         }
1565                     }
1566                     break;
1567                 }
1568                 if (flags & SCF_DO_STCLASS_OR)
1569                     cl_and(data->start_class, &and_with);
1570                 flags &= ~SCF_DO_STCLASS;
1571             }
1572         }
1573         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1574             data->flags |= (OP(scan) == MEOL
1575                             ? SF_BEFORE_MEOL
1576                             : SF_BEFORE_SEOL);
1577         }
1578         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1579                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1580                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1581                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1582             /* Lookahead/lookbehind */
1583             I32 deltanext, minnext, fake = 0;
1584             regnode *nscan;
1585             struct regnode_charclass_class intrnl;
1586             int f = 0;
1587
1588             data_fake.flags = 0;
1589             if (data) {         
1590                 data_fake.whilem_c = data->whilem_c;
1591                 data_fake.last_closep = data->last_closep;
1592             }
1593             else
1594                 data_fake.last_closep = &fake;
1595             if ( flags & SCF_DO_STCLASS && !scan->flags
1596                  && OP(scan) == IFMATCH ) { /* Lookahead */
1597                 cl_init(pRExC_state, &intrnl);
1598                 data_fake.start_class = &intrnl;
1599                 f |= SCF_DO_STCLASS_AND;
1600             }
1601             if (flags & SCF_WHILEM_VISITED_POS)
1602                 f |= SCF_WHILEM_VISITED_POS;
1603             next = regnext(scan);
1604             nscan = NEXTOPER(NEXTOPER(scan));
1605             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1606             if (scan->flags) {
1607                 if (deltanext) {
1608                     vFAIL("Variable length lookbehind not implemented");
1609                 }
1610                 else if (minnext > U8_MAX) {
1611                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1612                 }
1613                 scan->flags = (U8)minnext;
1614             }
1615             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1616                 pars++;
1617             if (data && (data_fake.flags & SF_HAS_EVAL))
1618                 data->flags |= SF_HAS_EVAL;
1619             if (data)
1620                 data->whilem_c = data_fake.whilem_c;
1621             if (f & SCF_DO_STCLASS_AND) {
1622                 const int was = (data->start_class->flags & ANYOF_EOS);
1623
1624                 cl_and(data->start_class, &intrnl);
1625                 if (was)
1626                     data->start_class->flags |= ANYOF_EOS;
1627             }
1628         }
1629         else if (OP(scan) == OPEN) {
1630             pars++;
1631         }
1632         else if (OP(scan) == CLOSE) {
1633             if ((I32)ARG(scan) == is_par) {
1634                 next = regnext(scan);
1635
1636                 if ( next && (OP(next) != WHILEM) && next < last)
1637                     is_par = 0;         /* Disable optimization */
1638             }
1639             if (data)
1640                 *(data->last_closep) = ARG(scan);
1641         }
1642         else if (OP(scan) == EVAL) {
1643                 if (data)
1644                     data->flags |= SF_HAS_EVAL;
1645         }
1646         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1647                 if (flags & SCF_DO_SUBSTR) {
1648                     scan_commit(pRExC_state,data);
1649                     data->longest = &(data->longest_float);
1650                 }
1651                 is_inf = is_inf_internal = 1;
1652                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1653                     cl_anything(pRExC_state, data->start_class);
1654                 flags &= ~SCF_DO_STCLASS;
1655         }
1656         /* Else: zero-length, ignore. */
1657         scan = regnext(scan);
1658     }
1659
1660   finish:
1661     *scanp = scan;
1662     *deltap = is_inf_internal ? I32_MAX : delta;
1663     if (flags & SCF_DO_SUBSTR && is_inf)
1664         data->pos_delta = I32_MAX - data->pos_min;
1665     if (is_par > U8_MAX)
1666         is_par = 0;
1667     if (is_par && pars==1 && data) {
1668         data->flags |= SF_IN_PAR;
1669         data->flags &= ~SF_HAS_PAR;
1670     }
1671     else if (pars && data) {
1672         data->flags |= SF_HAS_PAR;
1673         data->flags &= ~SF_IN_PAR;
1674     }
1675     if (flags & SCF_DO_STCLASS_OR)
1676         cl_and(data->start_class, &and_with);
1677     return min;
1678 }
1679
1680 STATIC I32
1681 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
1682 {
1683     if (RExC_rx->data) {
1684         Renewc(RExC_rx->data,
1685                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1686                char, struct reg_data);
1687         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1688         RExC_rx->data->count += n;
1689     }
1690     else {
1691         Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1692              char, struct reg_data);
1693         Newx(RExC_rx->data->what, n, U8);
1694         RExC_rx->data->count = n;
1695     }
1696     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1697     return RExC_rx->data->count - n;
1698 }
1699
1700 void
1701 Perl_reginitcolors(pTHX)
1702 {
1703     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1704     if (s) {
1705         char *t = savepv(s);
1706         int i = 0;
1707         PL_colors[0] = t;
1708         while (++i < 6) {
1709             t = strchr(t, '\t');
1710             if (t) {
1711                 *t = '\0';
1712                 PL_colors[i] = ++t;
1713             }
1714             else
1715                 PL_colors[i] = t = (char *)"";
1716         }
1717     } else {
1718         int i = 0;
1719         while (i < 6)
1720             PL_colors[i++] = (char *)"";
1721     }
1722     PL_colorset = 1;
1723 }
1724
1725
1726 /*
1727  - pregcomp - compile a regular expression into internal code
1728  *
1729  * We can't allocate space until we know how big the compiled form will be,
1730  * but we can't compile it (and thus know how big it is) until we've got a
1731  * place to put the code.  So we cheat:  we compile it twice, once with code
1732  * generation turned off and size counting turned on, and once "for real".
1733  * This also means that we don't allocate space until we are sure that the
1734  * thing really will compile successfully, and we never have to move the
1735  * code and thus invalidate pointers into it.  (Note that it has to be in
1736  * one piece because free() must be able to free it all.) [NB: not true in perl]
1737  *
1738  * Beware that the optimization-preparation code in here knows about some
1739  * of the structure of the compiled regexp.  [I'll say.]
1740  */
1741 regexp *
1742 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1743 {
1744     register regexp *r;
1745     regnode *scan;
1746     regnode *first;
1747     I32 flags;
1748     I32 minlen = 0;
1749     I32 sawplus = 0;
1750     I32 sawopen = 0;
1751     scan_data_t data;
1752     RExC_state_t RExC_state;
1753     RExC_state_t *pRExC_state = &RExC_state;
1754
1755     if (exp == NULL)
1756         FAIL("NULL regexp argument");
1757
1758     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1759
1760     RExC_precomp = exp;
1761     DEBUG_r({
1762          if (!PL_colorset) reginitcolors();
1763          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1764                        PL_colors[4],PL_colors[5],PL_colors[0],
1765                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1766     });
1767     RExC_flags = pm->op_pmflags;
1768     RExC_sawback = 0;
1769
1770     RExC_seen = 0;
1771     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1772     RExC_seen_evals = 0;
1773     RExC_extralen = 0;
1774
1775     /* First pass: determine size, legality. */
1776     RExC_parse = exp;
1777     RExC_start = exp;
1778     RExC_end = xend;
1779     RExC_naughty = 0;
1780     RExC_npar = 1;
1781     RExC_size = 0L;
1782     RExC_emit = &PL_regdummy;
1783     RExC_whilem_seen = 0;
1784 #if 0 /* REGC() is (currently) a NOP at the first pass.
1785        * Clever compilers notice this and complain. --jhi */
1786     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1787 #endif
1788     if (reg(pRExC_state, 0, &flags) == NULL) {
1789         RExC_precomp = Nullch;
1790         return(NULL);
1791     }
1792     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1793
1794     /* Small enough for pointer-storage convention?
1795        If extralen==0, this means that we will not need long jumps. */
1796     if (RExC_size >= 0x10000L && RExC_extralen)
1797         RExC_size += RExC_extralen;
1798     else
1799         RExC_extralen = 0;
1800     if (RExC_whilem_seen > 15)
1801         RExC_whilem_seen = 15;
1802
1803     /* Allocate space and initialize. */
1804     Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1805          char, regexp);
1806     if (r == NULL)
1807         FAIL("Regexp out of space");
1808
1809 #ifdef DEBUGGING
1810     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1811     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1812 #endif
1813     r->refcnt = 1;
1814     r->prelen = xend - exp;
1815     r->precomp = savepvn(RExC_precomp, r->prelen);
1816     r->subbeg = NULL;
1817     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1818     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1819     r->lastparen = 0;                   /* mg.c reads this.  */
1820
1821     r->substrs = 0;                     /* Useful during FAIL. */
1822     r->startp = 0;                      /* Useful during FAIL. */
1823     r->endp = 0;                        /* Useful during FAIL. */
1824
1825     Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1826     if (r->offsets) {
1827         r->offsets[0] = RExC_size;
1828     }
1829     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1830                           "%s %"UVuf" bytes for offset annotations.\n",
1831                           r->offsets ? "Got" : "Couldn't get",
1832                           (UV)((2*RExC_size+1) * sizeof(U32))));
1833
1834     RExC_rx = r;
1835
1836     /* Second pass: emit code. */
1837     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1838     RExC_parse = exp;
1839     RExC_end = xend;
1840     RExC_naughty = 0;
1841     RExC_npar = 1;
1842     RExC_emit_start = r->program;
1843     RExC_emit = r->program;
1844     /* Store the count of eval-groups for security checks: */
1845     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1846     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1847     r->data = 0;
1848     if (reg(pRExC_state, 0, &flags) == NULL)
1849         return(NULL);
1850
1851     /* Dig out information for optimizations. */
1852     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1853     pm->op_pmflags = RExC_flags;
1854     if (UTF)
1855         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1856     r->regstclass = NULL;
1857     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1858         r->reganch |= ROPT_NAUGHTY;
1859     scan = r->program + 1;              /* First BRANCH. */
1860
1861     /* XXXX To minimize changes to RE engine we always allocate
1862        3-units-long substrs field. */
1863     Newxz(r->substrs, 1, struct reg_substr_data);
1864
1865     StructCopy(&zero_scan_data, &data, scan_data_t);
1866     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1867     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1868         I32 fake;
1869         STRLEN longest_float_length, longest_fixed_length;
1870         struct regnode_charclass_class ch_class;
1871         int stclass_flag;
1872         I32 last_close = 0;
1873
1874         first = scan;
1875         /* Skip introductions and multiplicators >= 1. */
1876         while ((OP(first) == OPEN && (sawopen = 1)) ||
1877                /* An OR of *one* alternative - should not happen now. */
1878             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1879             (OP(first) == PLUS) ||
1880             (OP(first) == MINMOD) ||
1881                /* An {n,m} with n>0 */
1882             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1883                 if (OP(first) == PLUS)
1884                     sawplus = 1;
1885                 else
1886                     first += regarglen[(U8)OP(first)];
1887                 first = NEXTOPER(first);
1888         }
1889
1890         /* Starting-point info. */
1891       again:
1892         if (PL_regkind[(U8)OP(first)] == EXACT) {
1893             if (OP(first) == EXACT)
1894                 ;       /* Empty, get anchored substr later. */
1895             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1896                 r->regstclass = first;
1897         }
1898         else if (strchr((const char*)PL_simple,OP(first)))
1899             r->regstclass = first;
1900         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1901                  PL_regkind[(U8)OP(first)] == NBOUND)
1902             r->regstclass = first;
1903         else if (PL_regkind[(U8)OP(first)] == BOL) {
1904             r->reganch |= (OP(first) == MBOL
1905                            ? ROPT_ANCH_MBOL
1906                            : (OP(first) == SBOL
1907                               ? ROPT_ANCH_SBOL
1908                               : ROPT_ANCH_BOL));
1909             first = NEXTOPER(first);
1910             goto again;
1911         }
1912         else if (OP(first) == GPOS) {
1913             r->reganch |= ROPT_ANCH_GPOS;
1914             first = NEXTOPER(first);
1915             goto again;
1916         }
1917         else if (!sawopen && (OP(first) == STAR &&
1918             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1919             !(r->reganch & ROPT_ANCH) )
1920         {
1921             /* turn .* into ^.* with an implied $*=1 */
1922             const int type =
1923                 (OP(NEXTOPER(first)) == REG_ANY)
1924                     ? ROPT_ANCH_MBOL
1925                     : ROPT_ANCH_SBOL;
1926             r->reganch |= type | ROPT_IMPLICIT;
1927             first = NEXTOPER(first);
1928             goto again;
1929         }
1930         if (sawplus && (!sawopen || !RExC_sawback)
1931             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1932             /* x+ must match at the 1st pos of run of x's */
1933             r->reganch |= ROPT_SKIP;
1934
1935         /* Scan is after the zeroth branch, first is atomic matcher. */
1936         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1937                               (IV)(first - scan + 1)));
1938         /*
1939         * If there's something expensive in the r.e., find the
1940         * longest literal string that must appear and make it the
1941         * regmust.  Resolve ties in favor of later strings, since
1942         * the regstart check works with the beginning of the r.e.
1943         * and avoiding duplication strengthens checking.  Not a
1944         * strong reason, but sufficient in the absence of others.
1945         * [Now we resolve ties in favor of the earlier string if
1946         * it happens that c_offset_min has been invalidated, since the
1947         * earlier string may buy us something the later one won't.]
1948         */
1949         minlen = 0;
1950
1951         data.longest_fixed = newSVpvn("",0);
1952         data.longest_float = newSVpvn("",0);
1953         data.last_found = newSVpvn("",0);
1954         data.longest = &(data.longest_fixed);
1955         first = scan;
1956         if (!r->regstclass) {
1957             cl_init(pRExC_state, &ch_class);
1958             data.start_class = &ch_class;
1959             stclass_flag = SCF_DO_STCLASS_AND;
1960         } else                          /* XXXX Check for BOUND? */
1961             stclass_flag = 0;
1962         data.last_closep = &last_close;
1963
1964         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1965                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1966         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1967              && data.last_start_min == 0 && data.last_end > 0
1968              && !RExC_seen_zerolen
1969              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1970             r->reganch |= ROPT_CHECK_ALL;
1971         scan_commit(pRExC_state, &data);
1972         SvREFCNT_dec(data.last_found);
1973
1974         longest_float_length = CHR_SVLEN(data.longest_float);
1975         if (longest_float_length
1976             || (data.flags & SF_FL_BEFORE_EOL
1977                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1978                     || (RExC_flags & PMf_MULTILINE)))) {
1979             int t;
1980
1981             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1982                 && data.offset_fixed == data.offset_float_min
1983                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1984                     goto remove_float;          /* As in (a)+. */
1985
1986             if (SvUTF8(data.longest_float)) {
1987                 r->float_utf8 = data.longest_float;
1988                 r->float_substr = Nullsv;
1989             } else {
1990                 r->float_substr = data.longest_float;
1991                 r->float_utf8 = Nullsv;
1992             }
1993             r->float_min_offset = data.offset_float_min;
1994             r->float_max_offset = data.offset_float_max;
1995             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1996                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1997                            || (RExC_flags & PMf_MULTILINE)));
1998             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1999         }
2000         else {
2001           remove_float:
2002             r->float_substr = r->float_utf8 = Nullsv;
2003             SvREFCNT_dec(data.longest_float);
2004             longest_float_length = 0;
2005         }
2006
2007         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2008         if (longest_fixed_length
2009             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2010                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2011                     || (RExC_flags & PMf_MULTILINE)))) {
2012             int t;
2013
2014             if (SvUTF8(data.longest_fixed)) {
2015                 r->anchored_utf8 = data.longest_fixed;
2016                 r->anchored_substr = Nullsv;
2017             } else {
2018                 r->anchored_substr = data.longest_fixed;
2019                 r->anchored_utf8 = Nullsv;
2020             }
2021             r->anchored_offset = data.offset_fixed;
2022             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2023                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2024                      || (RExC_flags & PMf_MULTILINE)));
2025             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2026         }
2027         else {
2028             r->anchored_substr = r->anchored_utf8 = Nullsv;
2029             SvREFCNT_dec(data.longest_fixed);
2030             longest_fixed_length = 0;
2031         }
2032         if (r->regstclass
2033             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2034             r->regstclass = NULL;
2035         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2036             && stclass_flag
2037             && !(data.start_class->flags & ANYOF_EOS)
2038             && !cl_is_anything(data.start_class))
2039         {
2040             const I32 n = add_data(pRExC_state, 1, "f");
2041
2042             Newx(RExC_rx->data->data[n], 1,
2043                 struct regnode_charclass_class);
2044             StructCopy(data.start_class,
2045                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2046                        struct regnode_charclass_class);
2047             r->regstclass = (regnode*)RExC_rx->data->data[n];
2048             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2049             PL_regdata = r->data; /* for regprop() */
2050             DEBUG_r({ SV *sv = sv_newmortal();
2051                       regprop(sv, (regnode*)data.start_class);
2052                       PerlIO_printf(Perl_debug_log,
2053                                     "synthetic stclass \"%s\".\n",
2054                                     SvPVX_const(sv));});
2055         }
2056
2057         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2058         if (longest_fixed_length > longest_float_length) {
2059             r->check_substr = r->anchored_substr;
2060             r->check_utf8 = r->anchored_utf8;
2061             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2062             if (r->reganch & ROPT_ANCH_SINGLE)
2063                 r->reganch |= ROPT_NOSCAN;
2064         }
2065         else {
2066             r->check_substr = r->float_substr;
2067             r->check_utf8 = r->float_utf8;
2068             r->check_offset_min = data.offset_float_min;
2069             r->check_offset_max = data.offset_float_max;
2070         }
2071         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2072            This should be changed ASAP!  */
2073         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2074             r->reganch |= RE_USE_INTUIT;
2075             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2076                 r->reganch |= RE_INTUIT_TAIL;
2077         }
2078     }
2079     else {
2080         /* Several toplevels. Best we can is to set minlen. */
2081         I32 fake;
2082         struct regnode_charclass_class ch_class;
2083         I32 last_close = 0;
2084         
2085         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2086         scan = r->program + 1;
2087         cl_init(pRExC_state, &ch_class);
2088         data.start_class = &ch_class;
2089         data.last_closep = &last_close;
2090         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2091         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2092                 = r->float_substr = r->float_utf8 = Nullsv;
2093         if (!(data.start_class->flags & ANYOF_EOS)
2094             && !cl_is_anything(data.start_class))
2095         {
2096             const I32 n = add_data(pRExC_state, 1, "f");
2097
2098             Newx(RExC_rx->data->data[n], 1,
2099                 struct regnode_charclass_class);
2100             StructCopy(data.start_class,
2101                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2102                        struct regnode_charclass_class);
2103             r->regstclass = (regnode*)RExC_rx->data->data[n];
2104             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2105             DEBUG_r({ SV* sv = sv_newmortal();
2106                       regprop(sv, (regnode*)data.start_class);
2107                       PerlIO_printf(Perl_debug_log,
2108                                     "synthetic stclass \"%s\".\n",
2109                                     SvPVX_const(sv));});
2110         }
2111     }
2112
2113     r->minlen = minlen;
2114     if (RExC_seen & REG_SEEN_GPOS)
2115         r->reganch |= ROPT_GPOS_SEEN;
2116     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2117         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2118     if (RExC_seen & REG_SEEN_EVAL)
2119         r->reganch |= ROPT_EVAL_SEEN;
2120     if (RExC_seen & REG_SEEN_CANY)
2121         r->reganch |= ROPT_CANY_SEEN;
2122     Newxz(r->startp, RExC_npar, I32);
2123     Newxz(r->endp, RExC_npar, I32);
2124     PL_regdata = r->data; /* for regprop() */
2125     DEBUG_r(regdump(r));
2126     return(r);
2127 }
2128
2129 /*
2130  - reg - regular expression, i.e. main body or parenthesized thing
2131  *
2132  * Caller must absorb opening parenthesis.
2133  *
2134  * Combining parenthesis handling with the base level of regular expression
2135  * is a trifle forced, but the need to tie the tails of the branches to what
2136  * follows makes it hard to avoid.
2137  */
2138 STATIC regnode *
2139 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2140     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2141 {
2142     register regnode *ret;              /* Will be the head of the group. */
2143     register regnode *br;
2144     register regnode *lastbr;
2145     register regnode *ender = 0;
2146     register I32 parno = 0;
2147     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2148
2149     /* for (?g), (?gc), and (?o) warnings; warning
2150        about (?c) will warn about (?g) -- japhy    */
2151
2152     I32 wastedflags = 0x00,
2153         wasted_o    = 0x01,
2154         wasted_g    = 0x02,
2155         wasted_gc   = 0x02 | 0x04,
2156         wasted_c    = 0x04;
2157
2158     char * parse_start = RExC_parse; /* MJD */
2159     char * const oregcomp_parse = RExC_parse;
2160     char c;
2161
2162     *flagp = 0;                         /* Tentatively. */
2163
2164
2165     /* Make an OPEN node, if parenthesized. */
2166     if (paren) {
2167         if (*RExC_parse == '?') { /* (?...) */
2168             U32 posflags = 0, negflags = 0;
2169             U32 *flagsp = &posflags;
2170             int logical = 0;
2171             const char * const seqstart = RExC_parse;
2172
2173             RExC_parse++;
2174             paren = *RExC_parse++;
2175             ret = NULL;                 /* For look-ahead/behind. */
2176             switch (paren) {
2177             case '<':           /* (?<...) */
2178                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2179                 if (*RExC_parse == '!')
2180                     paren = ',';
2181                 if (*RExC_parse != '=' && *RExC_parse != '!')
2182                     goto unknown;
2183                 RExC_parse++;
2184             case '=':           /* (?=...) */
2185             case '!':           /* (?!...) */
2186                 RExC_seen_zerolen++;
2187             case ':':           /* (?:...) */
2188             case '>':           /* (?>...) */
2189                 break;
2190             case '$':           /* (?$...) */
2191             case '@':           /* (?@...) */
2192                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2193                 break;
2194             case '#':           /* (?#...) */
2195                 while (*RExC_parse && *RExC_parse != ')')
2196                     RExC_parse++;
2197                 if (*RExC_parse != ')')
2198                     FAIL("Sequence (?#... not terminated");
2199                 nextchar(pRExC_state);
2200                 *flagp = TRYAGAIN;
2201                 return NULL;
2202             case 'p':           /* (?p...) */
2203                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2204                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2205                 /* FALL THROUGH*/
2206             case '?':           /* (??...) */
2207                 logical = 1;
2208                 if (*RExC_parse != '{')
2209                     goto unknown;
2210                 paren = *RExC_parse++;
2211                 /* FALL THROUGH */
2212             case '{':           /* (?{...}) */
2213             {
2214                 I32 count = 1, n = 0;
2215                 char c;
2216                 char *s = RExC_parse;
2217                 SV *sv;
2218                 OP_4tree *sop, *rop;
2219
2220                 RExC_seen_zerolen++;
2221                 RExC_seen |= REG_SEEN_EVAL;
2222                 while (count && (c = *RExC_parse)) {
2223                     if (c == '\\' && RExC_parse[1])
2224                         RExC_parse++;
2225                     else if (c == '{')
2226                         count++;
2227                     else if (c == '}')
2228                         count--;
2229                     RExC_parse++;
2230                 }
2231                 if (*RExC_parse != ')')
2232                 {
2233                     RExC_parse = s;             
2234                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2235                 }
2236                 if (!SIZE_ONLY) {
2237                     PAD *pad;
2238                 
2239                     if (RExC_parse - 1 - s)
2240                         sv = newSVpvn(s, RExC_parse - 1 - s);
2241                     else
2242                         sv = newSVpvn("", 0);
2243
2244                     ENTER;
2245                     Perl_save_re_context(aTHX);
2246                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2247                     sop->op_private |= OPpREFCOUNTED;
2248                     /* re_dup will OpREFCNT_inc */
2249                     OpREFCNT_set(sop, 1);
2250                     LEAVE;
2251
2252                     n = add_data(pRExC_state, 3, "nop");
2253                     RExC_rx->data->data[n] = (void*)rop;
2254                     RExC_rx->data->data[n+1] = (void*)sop;
2255                     RExC_rx->data->data[n+2] = (void*)pad;
2256                     SvREFCNT_dec(sv);
2257                 }
2258                 else {                                          /* First pass */
2259                     if (PL_reginterp_cnt < ++RExC_seen_evals
2260                         && IN_PERL_RUNTIME)
2261                         /* No compiled RE interpolated, has runtime
2262                            components ===> unsafe.  */
2263                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2264                     if (PL_tainting && PL_tainted)
2265                         FAIL("Eval-group in insecure regular expression");
2266                 }
2267                 
2268                 nextchar(pRExC_state);
2269                 if (logical) {
2270                     ret = reg_node(pRExC_state, LOGICAL);
2271                     if (!SIZE_ONLY)
2272                         ret->flags = 2;
2273                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2274                     /* deal with the length of this later - MJD */
2275                     return ret;
2276                 }
2277                 ret = reganode(pRExC_state, EVAL, n);
2278                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2279                 Set_Node_Offset(ret, parse_start);
2280                 return ret;
2281             }
2282             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2283             {
2284                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2285                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2286                         || RExC_parse[1] == '<'
2287                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2288                         I32 flag;
2289                         
2290                         ret = reg_node(pRExC_state, LOGICAL);
2291                         if (!SIZE_ONLY)
2292                             ret->flags = 1;
2293                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2294                         goto insert_if;
2295                     }
2296                 }
2297                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2298                     /* (?(1)...) */
2299                     parno = atoi(RExC_parse++);
2300
2301                     while (isDIGIT(*RExC_parse))
2302                         RExC_parse++;
2303                     ret = reganode(pRExC_state, GROUPP, parno);
2304
2305                     if ((c = *nextchar(pRExC_state)) != ')')
2306                         vFAIL("Switch condition not recognized");
2307                   insert_if:
2308                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2309                     br = regbranch(pRExC_state, &flags, 1);
2310                     if (br == NULL)
2311                         br = reganode(pRExC_state, LONGJMP, 0);
2312                     else
2313                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2314                     c = *nextchar(pRExC_state);
2315                     if (flags&HASWIDTH)
2316                         *flagp |= HASWIDTH;
2317                     if (c == '|') {
2318                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2319                         regbranch(pRExC_state, &flags, 1);
2320                         regtail(pRExC_state, ret, lastbr);
2321                         if (flags&HASWIDTH)
2322                             *flagp |= HASWIDTH;
2323                         c = *nextchar(pRExC_state);
2324                     }
2325                     else
2326                         lastbr = NULL;
2327                     if (c != ')')
2328                         vFAIL("Switch (?(condition)... contains too many branches");
2329                     ender = reg_node(pRExC_state, TAIL);
2330                     regtail(pRExC_state, br, ender);
2331                     if (lastbr) {
2332                         regtail(pRExC_state, lastbr, ender);
2333                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2334                     }
2335                     else
2336                         regtail(pRExC_state, ret, ender);
2337                     return ret;
2338                 }
2339                 else {
2340                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2341                 }
2342             }
2343             case 0:
2344                 RExC_parse--; /* for vFAIL to print correctly */
2345                 vFAIL("Sequence (? incomplete");
2346                 break;
2347             default:
2348                 --RExC_parse;
2349               parse_flags:      /* (?i) */
2350                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2351                     /* (?g), (?gc) and (?o) are useless here
2352                        and must be globally applied -- japhy */
2353
2354                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2355                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2356                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2357                             if (! (wastedflags & wflagbit) ) {
2358                                 wastedflags |= wflagbit;
2359                                 vWARN5(
2360                                     RExC_parse + 1,
2361                                     "Useless (%s%c) - %suse /%c modifier",
2362                                     flagsp == &negflags ? "?-" : "?",
2363                                     *RExC_parse,
2364                                     flagsp == &negflags ? "don't " : "",
2365                                     *RExC_parse
2366                                 );
2367                             }
2368                         }
2369                     }
2370                     else if (*RExC_parse == 'c') {
2371                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2372                             if (! (wastedflags & wasted_c) ) {
2373                                 wastedflags |= wasted_gc;
2374                                 vWARN3(
2375                                     RExC_parse + 1,
2376                                     "Useless (%sc) - %suse /gc modifier",
2377                                     flagsp == &negflags ? "?-" : "?",
2378                                     flagsp == &negflags ? "don't " : ""
2379                                 );
2380                             }
2381                         }
2382                     }
2383                     else { pmflag(flagsp, *RExC_parse); }
2384
2385                     ++RExC_parse;
2386                 }
2387                 if (*RExC_parse == '-') {
2388                     flagsp = &negflags;
2389                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2390                     ++RExC_parse;
2391                     goto parse_flags;
2392                 }
2393                 RExC_flags |= posflags;
2394                 RExC_flags &= ~negflags;
2395                 if (*RExC_parse == ':') {
2396                     RExC_parse++;
2397                     paren = ':';
2398                     break;
2399                 }               
2400               unknown:
2401                 if (*RExC_parse != ')') {
2402                     RExC_parse++;
2403                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2404                 }
2405                 nextchar(pRExC_state);
2406                 *flagp = TRYAGAIN;
2407                 return NULL;
2408             }
2409         }
2410         else {                  /* (...) */
2411             parno = RExC_npar;
2412             RExC_npar++;
2413             ret = reganode(pRExC_state, OPEN, parno);
2414             Set_Node_Length(ret, 1); /* MJD */
2415             Set_Node_Offset(ret, RExC_parse); /* MJD */
2416             open = 1;
2417         }
2418     }
2419     else                        /* ! paren */
2420         ret = NULL;
2421
2422     /* Pick up the branches, linking them together. */
2423     parse_start = RExC_parse;   /* MJD */
2424     br = regbranch(pRExC_state, &flags, 1);
2425     /*     branch_len = (paren != 0); */
2426
2427     if (br == NULL)
2428         return(NULL);
2429     if (*RExC_parse == '|') {
2430         if (!SIZE_ONLY && RExC_extralen) {
2431             reginsert(pRExC_state, BRANCHJ, br);
2432         }
2433         else {                  /* MJD */
2434             reginsert(pRExC_state, BRANCH, br);
2435             Set_Node_Length(br, paren != 0);
2436             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2437         }
2438         have_branch = 1;
2439         if (SIZE_ONLY)
2440             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2441     }
2442     else if (paren == ':') {
2443         *flagp |= flags&SIMPLE;
2444     }
2445     if (open) {                         /* Starts with OPEN. */
2446         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2447     }
2448     else if (paren != '?')              /* Not Conditional */
2449         ret = br;
2450     *flagp |= flags & (SPSTART | HASWIDTH);
2451     lastbr = br;
2452     while (*RExC_parse == '|') {
2453         if (!SIZE_ONLY && RExC_extralen) {
2454             ender = reganode(pRExC_state, LONGJMP,0);
2455             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2456         }
2457         if (SIZE_ONLY)
2458             RExC_extralen += 2;         /* Account for LONGJMP. */
2459         nextchar(pRExC_state);
2460         br = regbranch(pRExC_state, &flags, 0);
2461
2462         if (br == NULL)
2463             return(NULL);
2464         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2465         lastbr = br;
2466         if (flags&HASWIDTH)
2467             *flagp |= HASWIDTH;
2468         *flagp |= flags&SPSTART;
2469     }
2470
2471     if (have_branch || paren != ':') {
2472         /* Make a closing node, and hook it on the end. */
2473         switch (paren) {
2474         case ':':
2475             ender = reg_node(pRExC_state, TAIL);
2476             break;
2477         case 1:
2478             ender = reganode(pRExC_state, CLOSE, parno);
2479             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2480             Set_Node_Length(ender,1); /* MJD */
2481             break;
2482         case '<':
2483         case ',':
2484         case '=':
2485         case '!':
2486             *flagp &= ~HASWIDTH;
2487             /* FALL THROUGH */
2488         case '>':
2489             ender = reg_node(pRExC_state, SUCCEED);
2490             break;
2491         case 0:
2492             ender = reg_node(pRExC_state, END);
2493             break;
2494         }
2495         regtail(pRExC_state, lastbr, ender);
2496
2497         if (have_branch) {
2498             /* Hook the tails of the branches to the closing node. */
2499             for (br = ret; br != NULL; br = regnext(br)) {
2500                 regoptail(pRExC_state, br, ender);
2501             }
2502         }
2503     }
2504
2505     {
2506         const char *p;
2507         static const char parens[] = "=!<,>";
2508
2509         if (paren && (p = strchr(parens, paren))) {
2510             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2511             int flag = (p - parens) > 1;
2512
2513             if (paren == '>')
2514                 node = SUSPEND, flag = 0;
2515             reginsert(pRExC_state, node,ret);
2516             Set_Node_Cur_Length(ret);
2517             Set_Node_Offset(ret, parse_start + 1);
2518             ret->flags = flag;
2519             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2520         }
2521     }
2522
2523     /* Check for proper termination. */
2524     if (paren) {
2525         RExC_flags = oregflags;
2526         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2527             RExC_parse = oregcomp_parse;
2528             vFAIL("Unmatched (");
2529         }
2530     }
2531     else if (!paren && RExC_parse < RExC_end) {
2532         if (*RExC_parse == ')') {
2533             RExC_parse++;
2534             vFAIL("Unmatched )");
2535         }
2536         else
2537             FAIL("Junk on end of regexp");      /* "Can't happen". */
2538         /* NOTREACHED */
2539     }
2540
2541     return(ret);
2542 }
2543
2544 /*
2545  - regbranch - one alternative of an | operator
2546  *
2547  * Implements the concatenation operator.
2548  */
2549 STATIC regnode *
2550 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2551 {
2552     register regnode *ret;
2553     register regnode *chain = NULL;
2554     register regnode *latest;
2555     I32 flags = 0, c = 0;
2556
2557     if (first)
2558         ret = NULL;
2559     else {
2560         if (!SIZE_ONLY && RExC_extralen)
2561             ret = reganode(pRExC_state, BRANCHJ,0);
2562         else {
2563             ret = reg_node(pRExC_state, BRANCH);
2564             Set_Node_Length(ret, 1);
2565         }
2566     }
2567         
2568     if (!first && SIZE_ONLY)
2569         RExC_extralen += 1;                     /* BRANCHJ */
2570
2571     *flagp = WORST;                     /* Tentatively. */
2572
2573     RExC_parse--;
2574     nextchar(pRExC_state);
2575     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2576         flags &= ~TRYAGAIN;
2577         latest = regpiece(pRExC_state, &flags);
2578         if (latest == NULL) {
2579             if (flags & TRYAGAIN)
2580                 continue;
2581             return(NULL);
2582         }
2583         else if (ret == NULL)
2584             ret = latest;
2585         *flagp |= flags&HASWIDTH;
2586         if (chain == NULL)      /* First piece. */
2587             *flagp |= flags&SPSTART;
2588         else {
2589             RExC_naughty++;
2590             regtail(pRExC_state, chain, latest);
2591         }
2592         chain = latest;
2593         c++;
2594     }
2595     if (chain == NULL) {        /* Loop ran zero times. */
2596         chain = reg_node(pRExC_state, NOTHING);
2597         if (ret == NULL)
2598             ret = chain;
2599     }
2600     if (c == 1) {
2601         *flagp |= flags&SIMPLE;
2602     }
2603
2604     return(ret);
2605 }
2606
2607 /*
2608  - regpiece - something followed by possible [*+?]
2609  *
2610  * Note that the branching code sequences used for ? and the general cases
2611  * of * and + are somewhat optimized:  they use the same NOTHING node as
2612  * both the endmarker for their branch list and the body of the last branch.
2613  * It might seem that this node could be dispensed with entirely, but the
2614  * endmarker role is not redundant.
2615  */
2616 STATIC regnode *
2617 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2618 {
2619     register regnode *ret;
2620     register char op;
2621     register char *next;
2622     I32 flags;
2623     const char * const origparse = RExC_parse;
2624     char *maxpos;
2625     I32 min;
2626     I32 max = REG_INFTY;
2627     char *parse_start;
2628
2629     ret = regatom(pRExC_state, &flags);
2630     if (ret == NULL) {
2631         if (flags & TRYAGAIN)
2632             *flagp |= TRYAGAIN;
2633         return(NULL);
2634     }
2635
2636     op = *RExC_parse;
2637
2638     if (op == '{' && regcurly(RExC_parse)) {
2639         parse_start = RExC_parse; /* MJD */
2640         next = RExC_parse + 1;
2641         maxpos = Nullch;
2642         while (isDIGIT(*next) || *next == ',') {
2643             if (*next == ',') {
2644                 if (maxpos)
2645                     break;
2646                 else
2647                     maxpos = next;
2648             }
2649             next++;
2650         }
2651         if (*next == '}') {             /* got one */
2652             if (!maxpos)
2653                 maxpos = next;
2654             RExC_parse++;
2655             min = atoi(RExC_parse);
2656             if (*maxpos == ',')
2657                 maxpos++;
2658             else
2659                 maxpos = RExC_parse;
2660             max = atoi(maxpos);
2661             if (!max && *maxpos != '0')
2662                 max = REG_INFTY;                /* meaning "infinity" */
2663             else if (max >= REG_INFTY)
2664                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2665             RExC_parse = next;
2666             nextchar(pRExC_state);
2667
2668         do_curly:
2669             if ((flags&SIMPLE)) {
2670                 RExC_naughty += 2 + RExC_naughty / 2;
2671                 reginsert(pRExC_state, CURLY, ret);
2672                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2673                 Set_Node_Cur_Length(ret);
2674             }
2675             else {
2676                 regnode *w = reg_node(pRExC_state, WHILEM);
2677
2678                 w->flags = 0;
2679                 regtail(pRExC_state, ret, w);
2680                 if (!SIZE_ONLY && RExC_extralen) {
2681                     reginsert(pRExC_state, LONGJMP,ret);
2682                     reginsert(pRExC_state, NOTHING,ret);
2683                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2684                 }
2685                 reginsert(pRExC_state, CURLYX,ret);
2686                                 /* MJD hk */
2687                 Set_Node_Offset(ret, parse_start+1);
2688                 Set_Node_Length(ret,
2689                                 op == '{' ? (RExC_parse - parse_start) : 1);
2690
2691                 if (!SIZE_ONLY && RExC_extralen)
2692                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2693                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2694                 if (SIZE_ONLY)
2695                     RExC_whilem_seen++, RExC_extralen += 3;
2696                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2697             }
2698             ret->flags = 0;
2699
2700             if (min > 0)
2701                 *flagp = WORST;
2702             if (max > 0)
2703                 *flagp |= HASWIDTH;
2704             if (max && max < min)
2705                 vFAIL("Can't do {n,m} with n > m");
2706             if (!SIZE_ONLY) {
2707                 ARG1_SET(ret, (U16)min);
2708                 ARG2_SET(ret, (U16)max);
2709             }
2710
2711             goto nest_check;
2712         }
2713     }
2714
2715     if (!ISMULT1(op)) {
2716         *flagp = flags;
2717         return(ret);
2718     }
2719
2720 #if 0                           /* Now runtime fix should be reliable. */
2721
2722     /* if this is reinstated, don't forget to put this back into perldiag:
2723
2724             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2725
2726            (F) The part of the regexp subject to either the * or + quantifier
2727            could match an empty string. The {#} shows in the regular
2728            expression about where the problem was discovered.
2729
2730     */
2731
2732     if (!(flags&HASWIDTH) && op != '?')
2733       vFAIL("Regexp *+ operand could be empty");
2734 #endif
2735
2736     parse_start = RExC_parse;
2737     nextchar(pRExC_state);
2738
2739     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2740
2741     if (op == '*' && (flags&SIMPLE)) {
2742         reginsert(pRExC_state, STAR, ret);
2743         ret->flags = 0;
2744         RExC_naughty += 4;
2745     }
2746     else if (op == '*') {
2747         min = 0;
2748         goto do_curly;
2749     }
2750     else if (op == '+' && (flags&SIMPLE)) {
2751         reginsert(pRExC_state, PLUS, ret);
2752         ret->flags = 0;
2753         RExC_naughty += 3;
2754     }
2755     else if (op == '+') {
2756         min = 1;
2757         goto do_curly;
2758     }
2759     else if (op == '?') {
2760         min = 0; max = 1;
2761         goto do_curly;
2762     }
2763   nest_check:
2764     if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
2765         vWARN3(RExC_parse,
2766                "%.*s matches null string many times",
2767                RExC_parse - origparse,
2768                origparse);
2769     }
2770
2771     if (*RExC_parse == '?') {
2772         nextchar(pRExC_state);
2773         reginsert(pRExC_state, MINMOD, ret);
2774         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2775     }
2776     if (ISMULT2(RExC_parse)) {
2777         RExC_parse++;
2778         vFAIL("Nested quantifiers");
2779     }
2780
2781     return(ret);
2782 }
2783
2784 /*
2785  - regatom - the lowest level
2786  *
2787  * Optimization:  gobbles an entire sequence of ordinary characters so that
2788  * it can turn them into a single node, which is smaller to store and
2789  * faster to run.  Backslashed characters are exceptions, each becoming a
2790  * separate node; the code is simpler that way and it's not worth fixing.
2791  *
2792  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2793 STATIC regnode *
2794 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2795 {
2796     register regnode *ret = 0;
2797     I32 flags;
2798     char *parse_start = RExC_parse;
2799
2800     *flagp = WORST;             /* Tentatively. */
2801
2802 tryagain:
2803     switch (*RExC_parse) {
2804     case '^':
2805         RExC_seen_zerolen++;
2806         nextchar(pRExC_state);
2807         if (RExC_flags & PMf_MULTILINE)
2808             ret = reg_node(pRExC_state, MBOL);
2809         else if (RExC_flags & PMf_SINGLELINE)
2810             ret = reg_node(pRExC_state, SBOL);
2811         else
2812             ret = reg_node(pRExC_state, BOL);
2813         Set_Node_Length(ret, 1); /* MJD */
2814         break;
2815     case '$':
2816         nextchar(pRExC_state);
2817         if (*RExC_parse)
2818             RExC_seen_zerolen++;
2819         if (RExC_flags & PMf_MULTILINE)
2820             ret = reg_node(pRExC_state, MEOL);
2821         else if (RExC_flags & PMf_SINGLELINE)
2822             ret = reg_node(pRExC_state, SEOL);
2823         else
2824             ret = reg_node(pRExC_state, EOL);
2825         Set_Node_Length(ret, 1); /* MJD */
2826         break;
2827     case '.':
2828         nextchar(pRExC_state);
2829         if (RExC_flags & PMf_SINGLELINE)
2830             ret = reg_node(pRExC_state, SANY);
2831         else
2832             ret = reg_node(pRExC_state, REG_ANY);
2833         *flagp |= HASWIDTH|SIMPLE;
2834         RExC_naughty++;
2835         Set_Node_Length(ret, 1); /* MJD */
2836         break;
2837     case '[':
2838     {
2839         char *oregcomp_parse = ++RExC_parse;
2840         ret = regclass(pRExC_state);
2841         if (*RExC_parse != ']') {
2842             RExC_parse = oregcomp_parse;
2843             vFAIL("Unmatched [");
2844         }
2845         nextchar(pRExC_state);
2846         *flagp |= HASWIDTH|SIMPLE;
2847         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2848         break;
2849     }
2850     case '(':
2851         nextchar(pRExC_state);
2852         ret = reg(pRExC_state, 1, &flags);
2853         if (ret == NULL) {
2854                 if (flags & TRYAGAIN) {
2855                     if (RExC_parse == RExC_end) {
2856                          /* Make parent create an empty node if needed. */
2857                         *flagp |= TRYAGAIN;
2858                         return(NULL);
2859                     }
2860                     goto tryagain;
2861                 }
2862                 return(NULL);
2863         }
2864         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2865         break;
2866     case '|':
2867     case ')':
2868         if (flags & TRYAGAIN) {
2869             *flagp |= TRYAGAIN;
2870             return NULL;
2871         }
2872         vFAIL("Internal urp");
2873                                 /* Supposed to be caught earlier. */
2874         break;
2875     case '{':
2876         if (!regcurly(RExC_parse)) {
2877             RExC_parse++;
2878             goto defchar;
2879         }
2880         /* FALL THROUGH */
2881     case '?':
2882     case '+':
2883     case '*':
2884         RExC_parse++;
2885         vFAIL("Quantifier follows nothing");
2886         break;
2887     case '\\':
2888         switch (*++RExC_parse) {
2889         case 'A':
2890             RExC_seen_zerolen++;
2891             ret = reg_node(pRExC_state, SBOL);
2892             *flagp |= SIMPLE;
2893             nextchar(pRExC_state);
2894             Set_Node_Length(ret, 2); /* MJD */
2895             break;
2896         case 'G':
2897             ret = reg_node(pRExC_state, GPOS);
2898             RExC_seen |= REG_SEEN_GPOS;
2899             *flagp |= SIMPLE;
2900             nextchar(pRExC_state);
2901             Set_Node_Length(ret, 2); /* MJD */
2902             break;
2903         case 'Z':
2904             ret = reg_node(pRExC_state, SEOL);
2905             *flagp |= SIMPLE;
2906             RExC_seen_zerolen++;                /* Do not optimize RE away */
2907             nextchar(pRExC_state);
2908             break;
2909         case 'z':
2910             ret = reg_node(pRExC_state, EOS);
2911             *flagp |= SIMPLE;
2912             RExC_seen_zerolen++;                /* Do not optimize RE away */
2913             nextchar(pRExC_state);
2914             Set_Node_Length(ret, 2); /* MJD */
2915             break;
2916         case 'C':
2917             ret = reg_node(pRExC_state, CANY);
2918             RExC_seen |= REG_SEEN_CANY;
2919             *flagp |= HASWIDTH|SIMPLE;
2920             nextchar(pRExC_state);
2921             Set_Node_Length(ret, 2); /* MJD */
2922             break;
2923         case 'X':
2924             ret = reg_node(pRExC_state, CLUMP);
2925             *flagp |= HASWIDTH;
2926             nextchar(pRExC_state);
2927             Set_Node_Length(ret, 2); /* MJD */
2928             break;
2929         case 'w':
2930             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2931             *flagp |= HASWIDTH|SIMPLE;
2932             nextchar(pRExC_state);
2933             Set_Node_Length(ret, 2); /* MJD */
2934             break;
2935         case 'W':
2936             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2937             *flagp |= HASWIDTH|SIMPLE;
2938             nextchar(pRExC_state);
2939             Set_Node_Length(ret, 2); /* MJD */
2940             break;
2941         case 'b':
2942             RExC_seen_zerolen++;
2943             RExC_seen |= REG_SEEN_LOOKBEHIND;
2944             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2945             *flagp |= SIMPLE;
2946             nextchar(pRExC_state);
2947             Set_Node_Length(ret, 2); /* MJD */
2948             break;
2949         case 'B':
2950             RExC_seen_zerolen++;
2951             RExC_seen |= REG_SEEN_LOOKBEHIND;
2952             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2953             *flagp |= SIMPLE;
2954             nextchar(pRExC_state);
2955             Set_Node_Length(ret, 2); /* MJD */
2956             break;
2957         case 's':
2958             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2959             *flagp |= HASWIDTH|SIMPLE;
2960             nextchar(pRExC_state);
2961             Set_Node_Length(ret, 2); /* MJD */
2962             break;
2963         case 'S':
2964             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2965             *flagp |= HASWIDTH|SIMPLE;
2966             nextchar(pRExC_state);
2967             Set_Node_Length(ret, 2); /* MJD */
2968             break;
2969         case 'd':
2970             ret = reg_node(pRExC_state, DIGIT);
2971             *flagp |= HASWIDTH|SIMPLE;
2972             nextchar(pRExC_state);
2973             Set_Node_Length(ret, 2); /* MJD */
2974             break;
2975         case 'D':
2976             ret = reg_node(pRExC_state, NDIGIT);
2977             *flagp |= HASWIDTH|SIMPLE;
2978             nextchar(pRExC_state);
2979             Set_Node_Length(ret, 2); /* MJD */
2980             break;
2981         case 'p':
2982         case 'P':
2983             {   
2984                 char* oldregxend = RExC_end;
2985                 char* parse_start = RExC_parse - 2;
2986
2987                 if (RExC_parse[1] == '{') {
2988                   /* a lovely hack--pretend we saw [\pX] instead */
2989                     RExC_end = strchr(RExC_parse, '}');
2990                     if (!RExC_end) {
2991                         U8 c = (U8)*RExC_parse;
2992                         RExC_parse += 2;
2993                         RExC_end = oldregxend;
2994                         vFAIL2("Missing right brace on \\%c{}", c);
2995                     }
2996                     RExC_end++;
2997                 }
2998                 else {
2999                     RExC_end = RExC_parse + 2;
3000                     if (RExC_end > oldregxend)
3001                         RExC_end = oldregxend;
3002                 }
3003                 RExC_parse--;
3004
3005                 ret = regclass(pRExC_state);
3006
3007                 RExC_end = oldregxend;
3008                 RExC_parse--;
3009
3010                 Set_Node_Offset(ret, parse_start + 2);
3011                 Set_Node_Cur_Length(ret);
3012                 nextchar(pRExC_state);
3013                 *flagp |= HASWIDTH|SIMPLE;
3014             }
3015             break;
3016         case 'n':
3017         case 'r':
3018         case 't':
3019         case 'f':
3020         case 'e':
3021         case 'a':
3022         case 'x':
3023         case 'c':
3024         case '0':
3025             goto defchar;
3026         case '1': case '2': case '3': case '4':
3027         case '5': case '6': case '7': case '8': case '9':
3028             {
3029                 const I32 num = atoi(RExC_parse);
3030
3031                 if (num > 9 && num >= RExC_npar)
3032                     goto defchar;
3033                 else {
3034                     char * parse_start = RExC_parse - 1; /* MJD */
3035                     while (isDIGIT(*RExC_parse))
3036                         RExC_parse++;
3037
3038                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3039                         vFAIL("Reference to nonexistent group");
3040                     RExC_sawback = 1;
3041                     ret = reganode(pRExC_state,
3042                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3043                                    num);
3044                     *flagp |= HASWIDTH;
3045
3046                     /* override incorrect value set in reganode MJD */
3047                     Set_Node_Offset(ret, parse_start+1);
3048                     Set_Node_Cur_Length(ret); /* MJD */
3049                     RExC_parse--;
3050                     nextchar(pRExC_state);
3051                 }
3052             }
3053             break;
3054         case '\0':
3055             if (RExC_parse >= RExC_end)
3056                 FAIL("Trailing \\");
3057             /* FALL THROUGH */
3058         default:
3059             /* Do not generate "unrecognized" warnings here, we fall
3060                back into the quick-grab loop below */
3061             parse_start--;
3062             goto defchar;
3063         }
3064         break;
3065
3066     case '#':
3067         if (RExC_flags & PMf_EXTENDED) {
3068             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3069             if (RExC_parse < RExC_end)
3070                 goto tryagain;
3071         }
3072         /* FALL THROUGH */
3073
3074     default: {
3075             register STRLEN len;
3076             register UV ender;
3077             register char *p;
3078             char *oldp, *s;
3079             STRLEN foldlen;
3080             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3081
3082             parse_start = RExC_parse - 1;
3083
3084             RExC_parse++;
3085
3086         defchar:
3087             ender = 0;
3088             ret = reg_node(pRExC_state,
3089                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3090             s = STRING(ret);
3091             for (len = 0, p = RExC_parse - 1;
3092               len < 127 && p < RExC_end;
3093               len++)
3094             {
3095                 oldp = p;
3096
3097                 if (RExC_flags & PMf_EXTENDED)
3098                     p = regwhite(p, RExC_end);
3099                 switch (*p) {
3100                 case '^':
3101                 case '$':
3102                 case '.':
3103                 case '[':
3104                 case '(':
3105                 case ')':
3106                 case '|':
3107                     goto loopdone;
3108                 case '\\':
3109                     switch (*++p) {
3110                     case 'A':
3111                     case 'C':
3112                     case 'X':
3113                     case 'G':
3114                     case 'Z':
3115                     case 'z':
3116                     case 'w':
3117                     case 'W':
3118                     case 'b':
3119                     case 'B':
3120                     case 's':
3121                     case 'S':
3122                     case 'd':
3123                     case 'D':
3124                     case 'p':
3125                     case 'P':
3126                         --p;
3127                         goto loopdone;
3128                     case 'n':
3129                         ender = '\n';
3130                         p++;
3131                         break;
3132                     case 'r':
3133                         ender = '\r';
3134                         p++;
3135                         break;
3136                     case 't':
3137                         ender = '\t';
3138                         p++;
3139                         break;
3140                     case 'f':
3141                         ender = '\f';
3142                         p++;
3143                         break;
3144                     case 'e':
3145                           ender = ASCII_TO_NATIVE('\033');
3146                         p++;
3147                         break;
3148                     case 'a':
3149                           ender = ASCII_TO_NATIVE('\007');
3150                         p++;
3151                         break;
3152                     case 'x':
3153                         if (*++p == '{') {
3154                             char* const e = strchr(p, '}');
3155         
3156                             if (!e) {
3157                                 RExC_parse = p + 1;
3158                                 vFAIL("Missing right brace on \\x{}");
3159                             }
3160                             else {
3161                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3162                                     | PERL_SCAN_DISALLOW_PREFIX;
3163                                 STRLEN numlen = e - p - 1;
3164                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3165                                 if (ender > 0xff)
3166                                     RExC_utf8 = 1;
3167                                 p = e + 1;
3168                             }
3169                         }
3170                         else {
3171                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3172                             STRLEN numlen = 2;
3173                             ender = grok_hex(p, &numlen, &flags, NULL);
3174                             p += numlen;
3175                         }
3176                         break;
3177                     case 'c':
3178                         p++;
3179                         ender = UCHARAT(p++);
3180                         ender = toCTRL(ender);
3181                         break;
3182                     case '0': case '1': case '2': case '3':case '4':
3183                     case '5': case '6': case '7': case '8':case '9':
3184                         if (*p == '0' ||
3185                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3186                             I32 flags = 0;
3187                             STRLEN numlen = 3;
3188                             ender = grok_oct(p, &numlen, &flags, NULL);
3189                             p += numlen;
3190                         }
3191                         else {
3192                             --p;
3193                             goto loopdone;
3194                         }
3195                         break;
3196                     case '\0':
3197                         if (p >= RExC_end)
3198                             FAIL("Trailing \\");
3199                         /* FALL THROUGH */
3200                     default:
3201                         if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
3202                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3203                         goto normal_default;
3204                     }
3205                     break;
3206                 default:
3207                   normal_default:
3208                     if (UTF8_IS_START(*p) && UTF) {
3209                         STRLEN numlen;
3210                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3211                                                &numlen, 0);
3212                         p += numlen;
3213                     }
3214                     else
3215                         ender = *p++;
3216                     break;
3217                 }
3218                 if (RExC_flags & PMf_EXTENDED)
3219                     p = regwhite(p, RExC_end);
3220                 if (UTF && FOLD) {
3221                     /* Prime the casefolded buffer. */
3222                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3223                 }
3224                 if (ISMULT2(p)) { /* Back off on ?+*. */
3225                     if (len)
3226                         p = oldp;
3227                     else if (UTF) {
3228                          STRLEN unilen;
3229
3230                          if (FOLD) {
3231                               /* Emit all the Unicode characters. */
3232                               STRLEN numlen;
3233                               for (foldbuf = tmpbuf;
3234                                    foldlen;
3235                                    foldlen -= numlen) {
3236                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3237                                    if (numlen > 0) {
3238                                         reguni(pRExC_state, ender, s, &unilen);
3239                                         s       += unilen;
3240                                         len     += unilen;
3241                                         /* In EBCDIC the numlen
3242                                          * and unilen can differ. */
3243                                         foldbuf += numlen;
3244                                         if (numlen >= foldlen)
3245                                              break;
3246                                    }
3247                                    else
3248                                         break; /* "Can't happen." */
3249                               }
3250                          }
3251                          else {
3252                               reguni(pRExC_state, ender, s, &unilen);
3253                               if (unilen > 0) {
3254                                    s   += unilen;
3255                                    len += unilen;
3256                               }
3257                          }
3258                     }
3259                     else {
3260                         len++;
3261                         REGC((char)ender, s++);
3262                     }
3263                     break;
3264                 }
3265                 if (UTF) {
3266                      STRLEN unilen;
3267
3268                      if (FOLD) {
3269                           /* Emit all the Unicode characters. */
3270                           STRLEN numlen;
3271                           for (foldbuf = tmpbuf;
3272                                foldlen;
3273                                foldlen -= numlen) {
3274                                ender = utf8_to_uvchr(foldbuf, &numlen);
3275                                if (numlen > 0) {
3276                                     reguni(pRExC_state, ender, s, &unilen);
3277                                     len     += unilen;
3278                                     s       += unilen;
3279                                     /* In EBCDIC the numlen
3280                                      * and unilen can differ. */
3281                                     foldbuf += numlen;
3282                                     if (numlen >= foldlen)
3283                                          break;
3284                                }
3285                                else
3286                                     break;
3287                           }
3288                      }
3289                      else {
3290                           reguni(pRExC_state, ender, s, &unilen);
3291                           if (unilen > 0) {
3292                                s   += unilen;
3293                                len += unilen;
3294                           }
3295                      }
3296                      len--;
3297                 }
3298                 else
3299                     REGC((char)ender, s++);
3300             }
3301         loopdone:
3302             RExC_parse = p - 1;
3303             Set_Node_Cur_Length(ret); /* MJD */
3304             nextchar(pRExC_state);
3305             {
3306                 /* len is STRLEN which is unsigned, need to copy to signed */
3307                 IV iv = len;
3308                 if (iv < 0)
3309                     vFAIL("Internal disaster");
3310             }
3311             if (len > 0)
3312                 *flagp |= HASWIDTH;
3313             if (len == 1 && UNI_IS_INVARIANT(ender))
3314                 *flagp |= SIMPLE;
3315             if (!SIZE_ONLY)
3316                 STR_LEN(ret) = len;
3317             if (SIZE_ONLY)
3318                 RExC_size += STR_SZ(len);
3319             else
3320                 RExC_emit += STR_SZ(len);
3321         }
3322         break;
3323     }
3324
3325     /* If the encoding pragma is in effect recode the text of
3326      * any EXACT-kind nodes. */
3327     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3328         STRLEN oldlen = STR_LEN(ret);
3329         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3330
3331         if (RExC_utf8)
3332             SvUTF8_on(sv);
3333         if (sv_utf8_downgrade(sv, TRUE)) {
3334             const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3335             const STRLEN newlen = SvCUR(sv);
3336
3337             if (SvUTF8(sv))
3338                 RExC_utf8 = 1;
3339             if (!SIZE_ONLY) {
3340                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3341                                       (int)oldlen, STRING(ret),
3342                                       (int)newlen, s));
3343                 Copy(s, STRING(ret), newlen, char);
3344                 STR_LEN(ret) += newlen - oldlen;
3345                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3346             } else
3347                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3348         }
3349     }
3350
3351     return(ret);
3352 }
3353
3354 STATIC char *
3355 S_regwhite(pTHX_ char *p, const char *e)
3356 {
3357     while (p < e) {
3358         if (isSPACE(*p))
3359             ++p;
3360         else if (*p == '#') {
3361             do {
3362                 p++;
3363             } while (p < e && *p != '\n');
3364         }
3365         else
3366             break;
3367     }
3368     return p;
3369 }
3370
3371 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3372    Character classes ([:foo:]) can also be negated ([:^foo:]).
3373    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3374    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3375    but trigger failures because they are currently unimplemented. */
3376
3377 #define POSIXCC_DONE(c)   ((c) == ':')
3378 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3379 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3380
3381 STATIC I32
3382 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3383 {
3384     I32 namedclass = OOB_NAMEDCLASS;
3385
3386     if (value == '[' && RExC_parse + 1 < RExC_end &&
3387         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3388         POSIXCC(UCHARAT(RExC_parse))) {
3389         const char c = UCHARAT(RExC_parse);
3390         char* s = RExC_parse++;
3391         
3392         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3393             RExC_parse++;
3394         if (RExC_parse == RExC_end)
3395             /* Grandfather lone [:, [=, [. */
3396             RExC_parse = s;
3397         else {
3398             const char* t = RExC_parse++; /* skip over the c */
3399             const char *posixcc;
3400
3401             assert(*t == c);
3402
3403             if (UCHARAT(RExC_parse) == ']') {
3404                 RExC_parse++; /* skip over the ending ] */
3405                 posixcc = s + 1;
3406                 if (*s == ':') {
3407                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3408                     const I32 skip = t - posixcc;
3409
3410                     /* Initially switch on the length of the name.  */
3411                     switch (skip) {
3412                     case 4:
3413                         if (memEQ(posixcc, "word", 4)) {
3414                             /* this is not POSIX, this is the Perl \w */;
3415                             namedclass
3416                                 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3417                         }
3418                         break;
3419                     case 5:
3420                         /* Names all of length 5.  */
3421                         /* alnum alpha ascii blank cntrl digit graph lower
3422                            print punct space upper  */
3423                         /* Offset 4 gives the best switch position.  */
3424                         switch (posixcc[4]) {
3425                         case 'a':
3426                             if (memEQ(posixcc, "alph", 4)) {
3427                                 /*                  a     */
3428                                 namedclass
3429                                     = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3430                             }
3431                             break;
3432                         case 'e':
3433                             if (memEQ(posixcc, "spac", 4)) {
3434                                 /*                  e     */
3435                                 namedclass
3436                                     = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3437                             }
3438                             break;
3439                         case 'h':
3440                             if (memEQ(posixcc, "grap", 4)) {
3441                                 /*                  h     */
3442                                 namedclass
3443                                     = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3444                             }
3445                             break;
3446                         case 'i':
3447                             if (memEQ(posixcc, "asci", 4)) {
3448                                 /*                  i     */
3449                                 namedclass
3450                                     = complement ? ANYOF_NASCII : ANYOF_ASCII;
3451                             }
3452                             break;
3453                         case 'k':
3454                             if (memEQ(posixcc, "blan", 4)) {
3455                                 /*                  k     */
3456                                 namedclass
3457                                     = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3458                             }
3459                             break;
3460                         case 'l':
3461                             if (memEQ(posixcc, "cntr", 4)) {
3462                                 /*                  l     */
3463                                 namedclass
3464                                     = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3465                             }
3466                             break;
3467                         case 'm':
3468                             if (memEQ(posixcc, "alnu", 4)) {
3469                                 /*                  m     */
3470                                 namedclass
3471                                     = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3472                             }
3473                             break;
3474                         case 'r':
3475                             if (memEQ(posixcc, "lowe", 4)) {
3476                                 /*                  r     */
3477                                 namedclass
3478                                     = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3479                             }
3480                             if (memEQ(posixcc, "uppe", 4)) {
3481                                 /*                  r     */
3482                                 namedclass
3483                                     = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3484                             }
3485                             break;
3486                         case 't':
3487                             if (memEQ(posixcc, "digi", 4)) {
3488                                 /*                  t     */
3489                                 namedclass
3490                                     = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3491                             }
3492                             if (memEQ(posixcc, "prin", 4)) {
3493                                 /*                  t     */
3494                                 namedclass
3495                                     = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3496                             }
3497                             if (memEQ(posixcc, "punc", 4)) {
3498                                 /*                  t     */
3499                                 namedclass
3500                                     = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3501                             }
3502                             break;
3503                         }
3504                         break;
3505                     case 6:
3506                         if (memEQ(posixcc, "xdigit", 6)) {
3507                             namedclass
3508                                 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3509                         }
3510                         break;
3511                     }
3512
3513                     if (namedclass == OOB_NAMEDCLASS)
3514                     {
3515                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3516                                       t - s - 1, s + 1);
3517                     }
3518                     assert (posixcc[skip] == ':');
3519                     assert (posixcc[skip+1] == ']');
3520                 } else if (!SIZE_ONLY) {
3521                     /* [[=foo=]] and [[.foo.]] are still future. */
3522
3523                     /* adjust RExC_parse so the warning shows after
3524                        the class closes */
3525                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3526                         RExC_parse++;
3527                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3528                 }
3529             } else {
3530                 /* Maternal grandfather:
3531                  * "[:" ending in ":" but not in ":]" */
3532                 RExC_parse = s;
3533             }
3534         }
3535     }
3536
3537     return namedclass;
3538 }
3539
3540 STATIC void
3541 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3542 {
3543     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3544         const char *s = RExC_parse;
3545         const char  c = *s++;
3546
3547         while(*s && isALNUM(*s))
3548             s++;
3549         if (*s && c == *s && s[1] == ']') {
3550             if (ckWARN(WARN_REGEXP))
3551                 vWARN3(s+2,
3552                         "POSIX syntax [%c %c] belongs inside character classes",
3553                         c, c);
3554
3555             /* [[=foo=]] and [[.foo.]] are still future. */
3556             if (POSIXCC_NOTYET(c)) {
3557                 /* adjust RExC_parse so the error shows after
3558                    the class closes */
3559                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3560                     ;
3561                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3562             }
3563         }
3564     }
3565 }
3566
3567 STATIC regnode *
3568 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3569 {
3570     register UV value;
3571     register UV nextvalue;
3572     register IV prevvalue = OOB_UNICODE;
3573     register IV range = 0;
3574     register regnode *ret;
3575     STRLEN numlen;
3576     IV namedclass;
3577     char *rangebegin = 0;
3578     bool need_class = 0;
3579     SV *listsv = Nullsv;
3580     register char *e;
3581     UV n;
3582     bool optimize_invert   = TRUE;
3583     AV* unicode_alternate  = 0;
3584 #ifdef EBCDIC
3585     UV literal_endpoint = 0;
3586 #endif
3587
3588     ret = reganode(pRExC_state, ANYOF, 0);
3589
3590     if (!SIZE_ONLY)
3591         ANYOF_FLAGS(ret) = 0;
3592
3593     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3594         RExC_naughty++;
3595         RExC_parse++;
3596         if (!SIZE_ONLY)
3597             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3598     }
3599
3600     if (SIZE_ONLY)
3601         RExC_size += ANYOF_SKIP;
3602     else {
3603         RExC_emit += ANYOF_SKIP;
3604         if (FOLD)
3605             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3606         if (LOC)
3607             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3608         ANYOF_BITMAP_ZERO(ret);
3609         listsv = newSVpvn("# comment\n", 10);
3610     }
3611
3612     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3613
3614     if (!SIZE_ONLY && POSIXCC(nextvalue))
3615         checkposixcc(pRExC_state);
3616
3617     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3618     if (UCHARAT(RExC_parse) == ']')
3619         goto charclassloop;
3620
3621     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3622
3623     charclassloop:
3624
3625         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3626
3627         if (!range)
3628             rangebegin = RExC_parse;
3629         if (UTF) {
3630             value = utf8n_to_uvchr((U8*)RExC_parse,
3631                                    RExC_end - RExC_parse,
3632                                    &numlen, 0);
3633             RExC_parse += numlen;
3634         }
3635         else
3636             value = UCHARAT(RExC_parse++);
3637         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3638         if (value == '[' && POSIXCC(nextvalue))
3639             namedclass = regpposixcc(pRExC_state, value);
3640         else if (value == '\\') {
3641             if (UTF) {
3642                 value = utf8n_to_uvchr((U8*)RExC_parse,
3643                                    RExC_end - RExC_parse,
3644                                    &numlen, 0);
3645                 RExC_parse += numlen;
3646             }
3647             else
3648                 value = UCHARAT(RExC_parse++);
3649             /* Some compilers cannot handle switching on 64-bit integer
3650              * values, therefore value cannot be an UV.  Yes, this will
3651              * be a problem later if we want switch on Unicode.
3652              * A similar issue a little bit later when switching on
3653              * namedclass. --jhi */
3654             switch ((I32)value) {
3655             case 'w':   namedclass = ANYOF_ALNUM;       break;
3656             case 'W':   namedclass = ANYOF_NALNUM;      break;
3657             case 's':   namedclass = ANYOF_SPACE;       break;
3658             case 'S':   namedclass = ANYOF_NSPACE;      break;
3659             case 'd':   namedclass = ANYOF_DIGIT;       break;
3660             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3661             case 'p':
3662             case 'P':
3663                 if (RExC_parse >= RExC_end)
3664                     vFAIL2("Empty \\%c{}", (U8)value);
3665                 if (*RExC_parse == '{') {
3666                     const U8 c = (U8)value;
3667                     e = strchr(RExC_parse++, '}');
3668                     if (!e)
3669                         vFAIL2("Missing right brace on \\%c{}", c);
3670                     while (isSPACE(UCHARAT(RExC_parse)))
3671                         RExC_parse++;
3672                     if (e == RExC_parse)
3673                         vFAIL2("Empty \\%c{}", c);
3674                     n = e - RExC_parse;
3675                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3676                         n--;
3677                 }
3678                 else {
3679                     e = RExC_parse;
3680                     n = 1;
3681                 }
3682                 if (!SIZE_ONLY) {
3683                     if (UCHARAT(RExC_parse) == '^') {
3684                          RExC_parse++;
3685                          n--;
3686                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3687                          while (isSPACE(UCHARAT(RExC_parse))) {
3688                               RExC_parse++;
3689                               n--;
3690                          }
3691                     }
3692                     if (value == 'p')
3693                          Perl_sv_catpvf(aTHX_ listsv,
3694                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3695                     else
3696                          Perl_sv_catpvf(aTHX_ listsv,
3697                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3698                 }
3699                 RExC_parse = e + 1;
3700                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3701                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3702                 break;
3703             case 'n':   value = '\n';                   break;
3704             case 'r':   value = '\r';                   break;
3705             case 't':   value = '\t';                   break;
3706             case 'f':   value = '\f';                   break;
3707             case 'b':   value = '\b';                   break;
3708             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3709             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3710             case 'x':
3711                 if (*RExC_parse == '{') {
3712                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3713                         | PERL_SCAN_DISALLOW_PREFIX;
3714                     e = strchr(RExC_parse++, '}');
3715                     if (!e)
3716                         vFAIL("Missing right brace on \\x{}");
3717
3718                     numlen = e - RExC_parse;
3719                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3720                     RExC_parse = e + 1;
3721                 }
3722                 else {
3723                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3724                     numlen = 2;
3725                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3726                     RExC_parse += numlen;
3727                 }
3728                 break;
3729             case 'c':
3730                 value = UCHARAT(RExC_parse++);
3731                 value = toCTRL(value);
3732                 break;
3733             case '0': case '1': case '2': case '3': case '4':
3734             case '5': case '6': case '7': case '8': case '9':
3735             {
3736                 I32 flags = 0;
3737                 numlen = 3;
3738                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3739                 RExC_parse += numlen;
3740                 break;
3741             }
3742             default:
3743                 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
3744                     vWARN2(RExC_parse,
3745                            "Unrecognized escape \\%c in character class passed through",
3746                            (int)value);
3747                 break;
3748             }
3749         } /* end of \blah */
3750 #ifdef EBCDIC
3751         else
3752             literal_endpoint++;
3753 #endif
3754
3755         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3756
3757             if (!SIZE_ONLY && !need_class)
3758                 ANYOF_CLASS_ZERO(ret);
3759
3760             need_class = 1;
3761
3762             /* a bad range like a-\d, a-[:digit:] ? */
3763             if (range) {
3764                 if (!SIZE_ONLY) {
3765                     if (ckWARN(WARN_REGEXP))
3766                         vWARN4(RExC_parse,
3767                                "False [] range \"%*.*s\"",
3768                                RExC_parse - rangebegin,
3769                                RExC_parse - rangebegin,
3770                                rangebegin);
3771                     if (prevvalue < 256) {
3772                         ANYOF_BITMAP_SET(ret, prevvalue);
3773                         ANYOF_BITMAP_SET(ret, '-');
3774                     }
3775                     else {
3776                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3777                         Perl_sv_catpvf(aTHX_ listsv,
3778                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3779                     }
3780                 }
3781
3782                 range = 0; /* this was not a true range */
3783             }
3784
3785             if (!SIZE_ONLY) {
3786                 const char *what = NULL;
3787                 char yesno = 0;
3788
3789                 if (namedclass > OOB_NAMEDCLASS)
3790                     optimize_invert = FALSE;
3791                 /* Possible truncation here but in some 64-bit environments
3792                  * the compiler gets heartburn about switch on 64-bit values.
3793                  * A similar issue a little earlier when switching on value.
3794                  * --jhi */
3795                 switch ((I32)namedclass) {
3796                 case ANYOF_ALNUM:
3797                     if (LOC)
3798                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3799                     else {
3800                         for (value = 0; value < 256; value++)
3801                             if (isALNUM(value))
3802                                 ANYOF_BITMAP_SET(ret, value);
3803                     }
3804                     yesno = '+';
3805                     what = "Word";      
3806                     break;
3807                 case ANYOF_NALNUM:
3808                     if (LOC)
3809                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3810                     else {
3811                         for (value = 0; value < 256; value++)
3812                             if (!isALNUM(value))
3813                                 ANYOF_BITMAP_SET(ret, value);
3814                     }
3815                     yesno = '!';
3816                     what = "Word";
3817                     break;
3818                 case ANYOF_ALNUMC:
3819                     if (LOC)
3820                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3821                     else {
3822                         for (value = 0; value < 256; value++)
3823                             if (isALNUMC(value))
3824                                 ANYOF_BITMAP_SET(ret, value);
3825                     }
3826                     yesno = '+';
3827                     what = "Alnum";
3828                     break;
3829                 case ANYOF_NALNUMC:
3830                     if (LOC)
3831                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3832                     else {
3833                         for (value = 0; value < 256; value++)
3834                             if (!isALNUMC(value))
3835                                 ANYOF_BITMAP_SET(ret, value);
3836                     }
3837                     yesno = '!';
3838                     what = "Alnum";
3839                     break;
3840                 case ANYOF_ALPHA:
3841                     if (LOC)
3842                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3843                     else {
3844                         for (value = 0; value < 256; value++)
3845                             if (isALPHA(value))
3846                                 ANYOF_BITMAP_SET(ret, value);
3847                     }
3848                     yesno = '+';
3849                     what = "Alpha";
3850                     break;
3851                 case ANYOF_NALPHA:
3852                     if (LOC)
3853                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3854                     else {
3855                         for (value = 0; value < 256; value++)
3856                             if (!isALPHA(value))
3857                                 ANYOF_BITMAP_SET(ret, value);
3858                     }
3859                     yesno = '!';
3860                     what = "Alpha";
3861                     break;
3862                 case ANYOF_ASCII:
3863                     if (LOC)
3864                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3865                     else {
3866 #ifndef EBCDIC
3867                         for (value = 0; value < 128; value++)
3868                             ANYOF_BITMAP_SET(ret, value);
3869 #else  /* EBCDIC */
3870                         for (value = 0; value < 256; value++) {
3871                             if (isASCII(value))
3872                                 ANYOF_BITMAP_SET(ret, value);
3873                         }
3874 #endif /* EBCDIC */
3875                     }
3876                     yesno = '+';
3877                     what = "ASCII";
3878                     break;
3879                 case ANYOF_NASCII:
3880                     if (LOC)
3881                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3882                     else {
3883 #ifndef EBCDIC
3884                         for (value = 128; value < 256; value++)
3885                             ANYOF_BITMAP_SET(ret, value);
3886 #else  /* EBCDIC */
3887                         for (value = 0; value < 256; value++) {
3888                             if (!isASCII(value))
3889                                 ANYOF_BITMAP_SET(ret, value);
3890                         }
3891 #endif /* EBCDIC */
3892                     }
3893                     yesno = '!';
3894                     what = "ASCII";
3895                     break;
3896                 case ANYOF_BLANK:
3897                     if (LOC)
3898                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3899                     else {
3900                         for (value = 0; value < 256; value++)
3901                             if (isBLANK(value))
3902                                 ANYOF_BITMAP_SET(ret, value);
3903                     }
3904                     yesno = '+';
3905                     what = "Blank";
3906                     break;
3907                 case ANYOF_NBLANK:
3908                     if (LOC)
3909                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3910                     else {
3911                         for (value = 0; value < 256; value++)
3912                             if (!isBLANK(value))
3913                                 ANYOF_BITMAP_SET(ret, value);
3914                     }
3915                     yesno = '!';
3916                     what = "Blank";
3917                     break;
3918                 case ANYOF_CNTRL:
3919                     if (LOC)
3920                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3921                     else {
3922                         for (value = 0; value < 256; value++)
3923                             if (isCNTRL(value))
3924                                 ANYOF_BITMAP_SET(ret, value);
3925                     }
3926                     yesno = '+';
3927                     what = "Cntrl";
3928                     break;
3929                 case ANYOF_NCNTRL:
3930                     if (LOC)
3931                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3932                     else {
3933                         for (value = 0; value < 256; value++)
3934                             if (!isCNTRL(value))
3935                                 ANYOF_BITMAP_SET(ret, value);
3936                     }
3937                     yesno = '!';
3938                     what = "Cntrl";
3939                     break;
3940                 case ANYOF_DIGIT:
3941                     if (LOC)
3942                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3943                     else {
3944                         /* consecutive digits assumed */
3945                         for (value = '0'; value <= '9'; value++)
3946                             ANYOF_BITMAP_SET(ret, value);
3947                     }
3948                     yesno = '+';
3949                     what = "Digit";
3950                     break;
3951                 case ANYOF_NDIGIT:
3952                     if (LOC)
3953                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3954                     else {
3955                         /* consecutive digits assumed */
3956                         for (value = 0; value < '0'; value++)
3957                             ANYOF_BITMAP_SET(ret, value);
3958                         for (value = '9' + 1; value < 256; value++)
3959                             ANYOF_BITMAP_SET(ret, value);
3960                     }
3961                     yesno = '!';
3962                     what = "Digit";
3963                     break;
3964                 case ANYOF_GRAPH:
3965                     if (LOC)
3966                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3967                     else {
3968                         for (value = 0; value < 256; value++)
3969                             if (isGRAPH(value))
3970                                 ANYOF_BITMAP_SET(ret, value);
3971                     }
3972                     yesno = '+';
3973                     what = "Graph";
3974                     break;
3975                 case ANYOF_NGRAPH:
3976                     if (LOC)
3977                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3978                     else {
3979                         for (value = 0; value < 256; value++)
3980                             if (!isGRAPH(value))
3981                                 ANYOF_BITMAP_SET(ret, value);
3982                     }
3983                     yesno = '!';
3984                     what = "Graph";
3985                     break;
3986                 case ANYOF_LOWER:
3987                     if (LOC)
3988                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3989                     else {
3990                         for (value = 0; value < 256; value++)
3991                             if (isLOWER(value))
3992                                 ANYOF_BITMAP_SET(ret, value);
3993                     }
3994                     yesno = '+';
3995                     what = "Lower";
3996                     break;
3997                 case ANYOF_NLOWER:
3998                     if (LOC)
3999                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
4000                     else {
4001                         for (value = 0; value < 256; value++)
4002                             if (!isLOWER(value))
4003                                 ANYOF_BITMAP_SET(ret, value);
4004                     }
4005                     yesno = '!';
4006                     what = "Lower";
4007                     break;
4008                 case ANYOF_PRINT:
4009                     if (LOC)
4010                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4011                     else {
4012                         for (value = 0; value < 256; value++)
4013                             if (isPRINT(value))
4014                                 ANYOF_BITMAP_SET(ret, value);
4015                     }
4016                     yesno = '+';
4017                     what = "Print";
4018                     break;
4019                 case ANYOF_NPRINT:
4020                     if (LOC)
4021                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4022                     else {
4023                         for (value = 0; value < 256; value++)
4024                             if (!isPRINT(value))
4025                                 ANYOF_BITMAP_SET(ret, value);
4026                     }
4027                     yesno = '!';
4028                     what = "Print";
4029                     break;
4030                 case ANYOF_PSXSPC:
4031                     if (LOC)
4032                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4033                     else {
4034                         for (value = 0; value < 256; value++)
4035                             if (isPSXSPC(value))
4036                                 ANYOF_BITMAP_SET(ret, value);
4037                     }
4038                     yesno = '+';
4039                     what = "Space";
4040                     break;
4041                 case ANYOF_NPSXSPC:
4042                     if (LOC)
4043                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4044                     else {
4045                         for (value = 0; value < 256; value++)
4046                             if (!isPSXSPC(value))
4047                                 ANYOF_BITMAP_SET(ret, value);
4048                     }
4049                     yesno = '!';
4050                     what = "Space";
4051                     break;
4052                 case ANYOF_PUNCT:
4053                     if (LOC)
4054                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4055                     else {
4056                         for (value = 0; value < 256; value++)
4057                             if (isPUNCT(value))
4058                                 ANYOF_BITMAP_SET(ret, value);
4059                     }
4060                     yesno = '+';
4061                     what = "Punct";
4062                     break;
4063                 case ANYOF_NPUNCT:
4064                     if (LOC)
4065                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4066                     else {
4067                         for (value = 0; value < 256; value++)
4068                             if (!isPUNCT(value))
4069                                 ANYOF_BITMAP_SET(ret, value);
4070                     }
4071                     yesno = '!';
4072                     what = "Punct";
4073                     break;
4074                 case ANYOF_SPACE:
4075                     if (LOC)
4076                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4077                     else {
4078                         for (value = 0; value < 256; value++)
4079                             if (isSPACE(value))
4080                                 ANYOF_BITMAP_SET(ret, value);
4081                     }
4082                     yesno = '+';
4083                     what = "SpacePerl";
4084                     break;
4085                 case ANYOF_NSPACE:
4086                     if (LOC)
4087                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4088                     else {
4089                         for (value = 0; value < 256; value++)
4090                             if (!isSPACE(value))
4091                                 ANYOF_BITMAP_SET(ret, value);
4092                     }
4093                     yesno = '!';
4094                     what = "SpacePerl";
4095                     break;
4096                 case ANYOF_UPPER:
4097                     if (LOC)
4098                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4099                     else {
4100                         for (value = 0; value < 256; value++)
4101                             if (isUPPER(value))
4102                                 ANYOF_BITMAP_SET(ret, value);
4103                     }
4104                     yesno = '+';
4105                     what = "Upper";
4106                     break;
4107                 case ANYOF_NUPPER:
4108                     if (LOC)
4109                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4110                     else {
4111                         for (value = 0; value < 256; value++)
4112                             if (!isUPPER(value))
4113                                 ANYOF_BITMAP_SET(ret, value);
4114                     }
4115                     yesno = '!';
4116                     what = "Upper";
4117                     break;
4118                 case ANYOF_XDIGIT:
4119                     if (LOC)
4120                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4121                     else {
4122                         for (value = 0; value < 256; value++)
4123                             if (isXDIGIT(value))
4124                                 ANYOF_BITMAP_SET(ret, value);
4125                     }
4126                     yesno = '+';
4127                     what = "XDigit";
4128                     break;
4129                 case ANYOF_NXDIGIT:
4130                     if (LOC)
4131                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4132                     else {
4133                         for (value = 0; value < 256; value++)
4134                             if (!isXDIGIT(value))
4135                                 ANYOF_BITMAP_SET(ret, value);
4136                     }
4137                     yesno = '!';
4138                     what = "XDigit";
4139                     break;
4140                 case ANYOF_MAX:
4141                     /* this is to handle \p and \P */
4142                     break;
4143                 default:
4144                     vFAIL("Invalid [::] class");
4145                     break;
4146                 }
4147                 if (what) {
4148                     /* Strings such as "+utf8::isWord\n" */
4149                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4150                 }
4151                 if (LOC)
4152                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4153                 continue;
4154             }
4155         } /* end of namedclass \blah */
4156
4157         if (range) {
4158             if (prevvalue > (IV)value) /* b-a */ {
4159                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4160                               RExC_parse - rangebegin,
4161                               RExC_parse - rangebegin,
4162                               rangebegin);
4163                 range = 0; /* not a valid range */
4164             }
4165         }
4166         else {
4167             prevvalue = value; /* save the beginning of the range */
4168             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4169                 RExC_parse[1] != ']') {
4170                 RExC_parse++;
4171
4172                 /* a bad range like \w-, [:word:]- ? */
4173                 if (namedclass > OOB_NAMEDCLASS) {
4174                     if (ckWARN(WARN_REGEXP))
4175                         vWARN4(RExC_parse,
4176                                "False [] range \"%*.*s\"",
4177                                RExC_parse - rangebegin,
4178                                RExC_parse - rangebegin,
4179                                rangebegin);
4180                     if (!SIZE_ONLY)
4181                         ANYOF_BITMAP_SET(ret, '-');
4182                 } else
4183                     range = 1;  /* yeah, it's a range! */
4184                 continue;       /* but do it the next time */
4185             }
4186         }
4187
4188         /* now is the next time */
4189         if (!SIZE_ONLY) {
4190             IV i;
4191
4192             if (prevvalue < 256) {
4193                 const IV ceilvalue = value < 256 ? value : 255;
4194
4195 #ifdef EBCDIC
4196                 /* In EBCDIC [\x89-\x91] should include
4197                  * the \x8e but [i-j] should not. */
4198                 if (literal_endpoint == 2 &&
4199                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4200                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4201                 {
4202                     if (isLOWER(prevvalue)) {
4203                         for (i = prevvalue; i <= ceilvalue; i++)
4204                             if (isLOWER(i))
4205                                 ANYOF_BITMAP_SET(ret, i);
4206                     } else {
4207                         for (i = prevvalue; i <= ceilvalue; i++)
4208                             if (isUPPER(i))
4209                                 ANYOF_BITMAP_SET(ret, i);
4210                     }
4211                 }
4212                 else
4213 #endif
4214                       for (i = prevvalue; i <= ceilvalue; i++)
4215                           ANYOF_BITMAP_SET(ret, i);
4216           }
4217           if (value > 255 || UTF) {
4218                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4219                 const UV natvalue      = NATIVE_TO_UNI(value);
4220
4221                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4222                 if (prevnatvalue < natvalue) { /* what about > ? */
4223                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4224                                    prevnatvalue, natvalue);
4225                 }
4226                 else if (prevnatvalue == natvalue) {
4227                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4228                     if (FOLD) {
4229                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4230                          STRLEN foldlen;
4231                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4232
4233                          /* If folding and foldable and a single
4234                           * character, insert also the folded version
4235                           * to the charclass. */
4236                          if (f != value) {
4237                               if (foldlen == (STRLEN)UNISKIP(f))
4238                                   Perl_sv_catpvf(aTHX_ listsv,
4239                                                  "%04"UVxf"\n", f);
4240                               else {
4241                                   /* Any multicharacter foldings
4242                                    * require the following transform:
4243                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4244                                    * where E folds into "pq" and F folds
4245                                    * into "rst", all other characters
4246                                    * fold to single characters.  We save
4247                                    * away these multicharacter foldings,
4248                                    * to be later saved as part of the
4249                                    * additional "s" data. */
4250                                   SV *sv;
4251
4252                                   if (!unicode_alternate)
4253                                       unicode_alternate = newAV();
4254                                   sv = newSVpvn((char*)foldbuf, foldlen);
4255                                   SvUTF8_on(sv);
4256                                   av_push(unicode_alternate, sv);
4257                               }
4258                          }
4259
4260                          /* If folding and the value is one of the Greek
4261                           * sigmas insert a few more sigmas to make the
4262                           * folding rules of the sigmas to work right.
4263                           * Note that not all the possible combinations
4264                           * are handled here: some of them are handled
4265                           * by the standard folding rules, and some of
4266                           * them (literal or EXACTF cases) are handled
4267                           * during runtime in regexec.c:S_find_byclass(). */
4268                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4269                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4270                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4271                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4272                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4273                          }
4274                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4275                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4276                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4277                     }
4278                 }
4279             }
4280 #ifdef EBCDIC
4281             literal_endpoint = 0;
4282 #endif
4283         }
4284
4285         range = 0; /* this range (if it was one) is done now */
4286     }
4287
4288     if (need_class) {
4289         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4290         if (SIZE_ONLY)
4291             RExC_size += ANYOF_CLASS_ADD_SKIP;
4292         else
4293             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4294     }
4295
4296     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4297     if (!SIZE_ONLY &&
4298          /* If the only flag is folding (plus possibly inversion). */
4299         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4300        ) {
4301         for (value = 0; value < 256; ++value) {
4302             if (ANYOF_BITMAP_TEST(ret, value)) {
4303                 UV fold = PL_fold[value];
4304
4305                 if (fold != value)
4306                     ANYOF_BITMAP_SET(ret, fold);
4307             }
4308         }
4309         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4310     }
4311
4312     /* optimize inverted simple patterns (e.g. [^a-z]) */
4313     if (!SIZE_ONLY && optimize_invert &&
4314         /* If the only flag is inversion. */
4315         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4316         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4317             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4318         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4319     }
4320
4321     if (!SIZE_ONLY) {
4322         AV *av = newAV();
4323         SV *rv;
4324
4325         /* The 0th element stores the character class description
4326          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4327          * to initialize the appropriate swash (which gets stored in
4328          * the 1st element), and also useful for dumping the regnode.
4329          * The 2nd element stores the multicharacter foldings,
4330          * used later (regexec.c:S_reginclass()). */
4331         av_store(av, 0, listsv);
4332         av_store(av, 1, NULL);
4333         av_store(av, 2, (SV*)unicode_alternate);
4334         rv = newRV_noinc((SV*)av);
4335         n = add_data(pRExC_state, 1, "s");
4336         RExC_rx->data->data[n] = (void*)rv;
4337         ARG_SET(ret, n);
4338     }
4339
4340     return ret;
4341 }
4342
4343 STATIC char*
4344 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4345 {
4346     char* retval = RExC_parse++;
4347
4348     for (;;) {
4349         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4350                 RExC_parse[2] == '#') {
4351             while (*RExC_parse != ')') {
4352                 if (RExC_parse == RExC_end)
4353                     FAIL("Sequence (?#... not terminated");
4354                 RExC_parse++;
4355             }
4356             RExC_parse++;
4357             continue;
4358         }
4359         if (RExC_flags & PMf_EXTENDED) {
4360             if (isSPACE(*RExC_parse)) {
4361                 RExC_parse++;
4362                 continue;
4363             }
4364             else if (*RExC_parse == '#') {
4365                 while (RExC_parse < RExC_end)
4366                     if (*RExC_parse++ == '\n') break;
4367                 continue;
4368             }
4369         }
4370         return retval;
4371     }
4372 }
4373
4374 /*
4375 - reg_node - emit a node
4376 */
4377 STATIC regnode *                        /* Location. */
4378 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4379 {
4380     register regnode *ptr;
4381     regnode * const ret = RExC_emit;
4382
4383     if (SIZE_ONLY) {
4384         SIZE_ALIGN(RExC_size);
4385         RExC_size += 1;
4386         return(ret);
4387     }
4388
4389     NODE_ALIGN_FILL(ret);
4390     ptr = ret;
4391     FILL_ADVANCE_NODE(ptr, op);
4392     if (RExC_offsets) {         /* MJD */
4393         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4394               "reg_node", __LINE__, 
4395               reg_name[op],
4396               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4397               ? "Overwriting end of array!\n" : "OK",
4398               RExC_emit - RExC_emit_start,
4399               RExC_parse - RExC_start,
4400               RExC_offsets[0])); 
4401         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4402     }
4403             
4404     RExC_emit = ptr;
4405
4406     return(ret);
4407 }
4408
4409 /*
4410 - reganode - emit a node with an argument
4411 */
4412 STATIC regnode *                        /* Location. */
4413 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4414 {
4415     register regnode *ptr;
4416     regnode * const ret = RExC_emit;
4417
4418     if (SIZE_ONLY) {
4419         SIZE_ALIGN(RExC_size);
4420         RExC_size += 2;
4421         return(ret);
4422     }
4423
4424     NODE_ALIGN_FILL(ret);
4425     ptr = ret;
4426     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4427     if (RExC_offsets) {         /* MJD */
4428         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4429               "reganode",
4430               __LINE__,
4431               reg_name[op],
4432               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4433               "Overwriting end of array!\n" : "OK",
4434               RExC_emit - RExC_emit_start,
4435               RExC_parse - RExC_start,
4436               RExC_offsets[0])); 
4437         Set_Cur_Node_Offset;
4438     }
4439             
4440     RExC_emit = ptr;
4441
4442     return(ret);
4443 }
4444
4445 /*
4446 - reguni - emit (if appropriate) a Unicode character
4447 */
4448 STATIC void
4449 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4450 {
4451     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4452 }
4453
4454 /*
4455 - reginsert - insert an operator in front of already-emitted operand
4456 *
4457 * Means relocating the operand.
4458 */
4459 STATIC void
4460 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4461 {
4462     register regnode *src;
4463     register regnode *dst;
4464     register regnode *place;
4465     const int offset = regarglen[(U8)op];
4466
4467 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4468
4469     if (SIZE_ONLY) {
4470         RExC_size += NODE_STEP_REGNODE + offset;
4471         return;
4472     }
4473
4474     src = RExC_emit;
4475     RExC_emit += NODE_STEP_REGNODE + offset;
4476     dst = RExC_emit;
4477     while (src > opnd) {
4478         StructCopy(--src, --dst, regnode);
4479         if (RExC_offsets) {     /* MJD 20010112 */
4480             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4481                   "reg_insert",
4482                   __LINE__,
4483                   reg_name[op],
4484                   dst - RExC_emit_start > RExC_offsets[0] 
4485                   ? "Overwriting end of array!\n" : "OK",
4486                   src - RExC_emit_start,
4487                   dst - RExC_emit_start,
4488                   RExC_offsets[0])); 
4489             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4490             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4491         }
4492     }
4493     
4494
4495     place = opnd;               /* Op node, where operand used to be. */
4496     if (RExC_offsets) {         /* MJD */
4497         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4498               "reginsert",
4499               __LINE__,
4500               reg_name[op],
4501               place - RExC_emit_start > RExC_offsets[0] 
4502               ? "Overwriting end of array!\n" : "OK",
4503               place - RExC_emit_start,
4504               RExC_parse - RExC_start,
4505               RExC_offsets[0])); 
4506         Set_Node_Offset(place, RExC_parse);
4507         Set_Node_Length(place, 1);
4508     }
4509     src = NEXTOPER(place);
4510     FILL_ADVANCE_NODE(place, op);
4511     Zero(src, offset, regnode);
4512 }
4513
4514 /*
4515 - regtail - set the next-pointer at the end of a node chain of p to val.
4516 */
4517 STATIC void
4518 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4519 {
4520     register regnode *scan;
4521
4522     if (SIZE_ONLY)
4523         return;
4524
4525     /* Find last node. */
4526     scan = p;
4527     for (;;) {
4528         regnode * const temp = regnext(scan);
4529         if (temp == NULL)
4530             break;
4531         scan = temp;
4532     }
4533
4534     if (reg_off_by_arg[OP(scan)]) {
4535         ARG_SET(scan, val - scan);
4536     }
4537     else {
4538         NEXT_OFF(scan) = val - scan;
4539     }
4540 }
4541
4542 /*
4543 - regoptail - regtail on operand of first argument; nop if operandless
4544 */
4545 STATIC void
4546 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4547 {
4548     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4549     if (p == NULL || SIZE_ONLY)
4550         return;
4551     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4552         regtail(pRExC_state, NEXTOPER(p), val);
4553     }
4554     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4555         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4556     }
4557     else
4558         return;
4559 }
4560
4561 /*
4562  - regcurly - a little FSA that accepts {\d+,?\d*}
4563  */
4564 STATIC I32
4565 S_regcurly(pTHX_ register const char *s)
4566 {
4567     if (*s++ != '{')
4568         return FALSE;
4569     if (!isDIGIT(*s))
4570         return FALSE;
4571     while (isDIGIT(*s))
4572         s++;
4573     if (*s == ',')
4574         s++;
4575     while (isDIGIT(*s))
4576         s++;
4577     if (*s != '}')
4578         return FALSE;
4579     return TRUE;
4580 }
4581
4582 /*
4583  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4584  */
4585 void
4586 Perl_regdump(pTHX_ regexp *r)
4587 {
4588 #ifdef DEBUGGING
4589     SV *sv = sv_newmortal();
4590
4591     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4592
4593     /* Header fields of interest. */
4594     if (r->anchored_substr)
4595         PerlIO_printf(Perl_debug_log,
4596                       "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4597                       PL_colors[0],
4598                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4599                       SvPVX_const(r->anchored_substr),
4600                       PL_colors[1],
4601                       SvTAIL(r->anchored_substr) ? "$" : "",
4602                       (IV)r->anchored_offset);
4603     else if (r->anchored_utf8)
4604         PerlIO_printf(Perl_debug_log,
4605                       "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4606                       PL_colors[0],
4607                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4608                       SvPVX_const(r->anchored_utf8),
4609                       PL_colors[1],
4610                       SvTAIL(r->anchored_utf8) ? "$" : "",
4611                       (IV)r->anchored_offset);
4612     if (r->float_substr)
4613         PerlIO_printf(Perl_debug_log,
4614                       "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4615                       PL_colors[0],
4616                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4617                       SvPVX_const(r->float_substr),
4618                       PL_colors[1],
4619                       SvTAIL(r->float_substr) ? "$" : "",
4620                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4621     else if (r->float_utf8)
4622         PerlIO_printf(Perl_debug_log,
4623                       "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4624                       PL_colors[0],
4625                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4626                       SvPVX_const(r->float_utf8),
4627                       PL_colors[1],
4628                       SvTAIL(r->float_utf8) ? "$" : "",
4629                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4630     if (r->check_substr || r->check_utf8)
4631         PerlIO_printf(Perl_debug_log,
4632                       r->check_substr == r->float_substr
4633                       && r->check_utf8 == r->float_utf8
4634                       ? "(checking floating" : "(checking anchored");
4635     if (r->reganch & ROPT_NOSCAN)
4636         PerlIO_printf(Perl_debug_log, " noscan");
4637     if (r->reganch & ROPT_CHECK_ALL)
4638         PerlIO_printf(Perl_debug_log, " isall");
4639     if (r->check_substr || r->check_utf8)
4640         PerlIO_printf(Perl_debug_log, ") ");
4641
4642     if (r->regstclass) {
4643         regprop(sv, r->regstclass);
4644         PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4645     }
4646     if (r->reganch & ROPT_ANCH) {
4647         PerlIO_printf(Perl_debug_log, "anchored");
4648         if (r->reganch & ROPT_ANCH_BOL)
4649             PerlIO_printf(Perl_debug_log, "(BOL)");
4650         if (r->reganch & ROPT_ANCH_MBOL)
4651             PerlIO_printf(Perl_debug_log, "(MBOL)");
4652         if (r->reganch & ROPT_ANCH_SBOL)
4653             PerlIO_printf(Perl_debug_log, "(SBOL)");
4654         if (r->reganch & ROPT_ANCH_GPOS)
4655             PerlIO_printf(Perl_debug_log, "(GPOS)");
4656         PerlIO_putc(Perl_debug_log, ' ');
4657     }
4658     if (r->reganch & ROPT_GPOS_SEEN)
4659         PerlIO_printf(Perl_debug_log, "GPOS ");
4660     if (r->reganch & ROPT_SKIP)
4661         PerlIO_printf(Perl_debug_log, "plus ");
4662     if (r->reganch & ROPT_IMPLICIT)
4663         PerlIO_printf(Perl_debug_log, "implicit ");
4664     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4665     if (r->reganch & ROPT_EVAL_SEEN)
4666         PerlIO_printf(Perl_debug_log, "with eval ");
4667     PerlIO_printf(Perl_debug_log, "\n");
4668     if (r->offsets) {
4669       U32 i;
4670       const U32 len = r->offsets[0];
4671       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4672       for (i = 1; i <= len; i++)
4673         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4674                       (UV)r->offsets[i*2-1], 
4675                       (UV)r->offsets[i*2]);
4676       PerlIO_printf(Perl_debug_log, "\n");
4677     }
4678 #endif  /* DEBUGGING */
4679 }
4680
4681 /*
4682 - regprop - printable representation of opcode
4683 */
4684 void
4685 Perl_regprop(pTHX_ SV *sv, regnode *o)
4686 {
4687 #ifdef DEBUGGING
4688     register int k;
4689
4690     sv_setpvn(sv, "", 0);
4691     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4692         /* It would be nice to FAIL() here, but this may be called from
4693            regexec.c, and it would be hard to supply pRExC_state. */
4694         Perl_croak(aTHX_ "Corrupted regexp opcode");
4695     sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4696
4697     k = PL_regkind[(U8)OP(o)];
4698
4699     if (k == EXACT) {
4700         SV * const dsv = sv_2mortal(newSVpvn("", 0));
4701         /* Using is_utf8_string() is a crude hack but it may
4702          * be the best for now since we have no flag "this EXACTish
4703          * node was UTF-8" --jhi */
4704         const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4705         const char * const s = do_utf8 ?
4706           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4707                          UNI_DISPLAY_REGEX) :
4708           STRING(o);
4709         const int len = do_utf8 ?
4710           strlen(s) :
4711           STR_LEN(o);
4712         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4713                        PL_colors[0],
4714                        len, s,
4715                        PL_colors[1]);
4716     }
4717     else if (k == CURLY) {
4718         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4719             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4720         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4721     }
4722     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4723         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4724     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4725         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4726     else if (k == LOGICAL)
4727         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4728     else if (k == ANYOF) {
4729         int i, rangestart = -1;
4730         const U8 flags = ANYOF_FLAGS(o);
4731         const char * const anyofs[] = { /* Should be synchronized with
4732                                          * ANYOF_ #xdefines in regcomp.h */
4733             "\\w",
4734             "\\W",
4735             "\\s",
4736             "\\S",
4737             "\\d",
4738             "\\D",
4739             "[:alnum:]",
4740             "[:^alnum:]",
4741             "[:alpha:]",
4742             "[:^alpha:]",
4743             "[:ascii:]",
4744             "[:^ascii:]",
4745             "[:ctrl:]",
4746             "[:^ctrl:]",
4747             "[:graph:]",
4748             "[:^graph:]",
4749             "[:lower:]",
4750             "[:^lower:]",
4751             "[:print:]",
4752             "[:^print:]",
4753             "[:punct:]",
4754             "[:^punct:]",
4755             "[:upper:]",
4756             "[:^upper:]",
4757             "[:xdigit:]",
4758             "[:^xdigit:]",
4759             "[:space:]",
4760             "[:^space:]",
4761             "[:blank:]",
4762             "[:^blank:]"
4763         };
4764
4765         if (flags & ANYOF_LOCALE)
4766             sv_catpv(sv, "{loc}");
4767         if (flags & ANYOF_FOLD)
4768             sv_catpv(sv, "{i}");
4769         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4770         if (flags & ANYOF_INVERT)
4771             sv_catpv(sv, "^");
4772         for (i = 0; i <= 256; i++) {
4773             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4774                 if (rangestart == -1)
4775                     rangestart = i;
4776             } else if (rangestart != -1) {
4777                 if (i <= rangestart + 3)
4778                     for (; rangestart < i; rangestart++)
4779                         put_byte(sv, rangestart);
4780                 else {
4781                     put_byte(sv, rangestart);
4782                     sv_catpv(sv, "-");
4783                     put_byte(sv, i - 1);
4784                 }
4785                 rangestart = -1;
4786             }
4787         }
4788
4789         if (o->flags & ANYOF_CLASS)
4790             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4791                 if (ANYOF_CLASS_TEST(o,i))
4792                     sv_catpv(sv, anyofs[i]);
4793
4794         if (flags & ANYOF_UNICODE)
4795             sv_catpv(sv, "{unicode}");
4796         else if (flags & ANYOF_UNICODE_ALL)
4797             sv_catpv(sv, "{unicode_all}");
4798
4799         {
4800             SV *lv;
4801             SV * const sw = regclass_swash(o, FALSE, &lv, 0);
4802         
4803             if (lv) {
4804                 if (sw) {
4805                     U8 s[UTF8_MAXBYTES_CASE+1];
4806                 
4807                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4808                         uvchr_to_utf8(s, i);
4809                         
4810                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4811                             if (rangestart == -1)
4812                                 rangestart = i;
4813                         } else if (rangestart != -1) {
4814                             if (i <= rangestart + 3)
4815                                 for (; rangestart < i; rangestart++) {
4816                                     const U8 * const e = uvchr_to_utf8(s,rangestart);
4817                                     U8 *p;
4818                                     for(p = s; p < e; p++)
4819                                         put_byte(sv, *p);
4820                                 }
4821                             else {
4822                                 const U8 *e = uvchr_to_utf8(s,rangestart);
4823                                 U8 *p;
4824                                 for (p = s; p < e; p++)
4825                                     put_byte(sv, *p);
4826                                 sv_catpvn(sv, "-", 1);
4827                                 e = uvchr_to_utf8(s, i-1);
4828                                 for (p = s; p < e; p++)
4829                                     put_byte(sv, *p);
4830                                 }
4831                                 rangestart = -1;
4832                             }
4833                         }
4834                         
4835                     sv_catpv(sv, "..."); /* et cetera */
4836                 }
4837
4838                 {
4839                     char *s = savesvpv(lv);
4840                     char *origs = s;
4841                 
4842                     while(*s && *s != '\n') s++;
4843                 
4844                     if (*s == '\n') {
4845                         const char * const t = ++s;
4846                         
4847                         while (*s) {
4848                             if (*s == '\n')
4849                                 *s = ' ';
4850                             s++;
4851                         }
4852                         if (s[-1] == ' ')
4853                             s[-1] = 0;
4854                         
4855                         sv_catpv(sv, t);
4856                     }
4857                 
4858                     Safefree(origs);
4859                 }
4860             }
4861         }
4862
4863         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4864     }
4865     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4866         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4867 #endif  /* DEBUGGING */
4868 }
4869
4870 SV *
4871 Perl_re_intuit_string(pTHX_ regexp *prog)
4872 {                               /* Assume that RE_INTUIT is set */
4873     DEBUG_r(
4874         {
4875             const char * const s = SvPV_nolen_const(prog->check_substr
4876                       ? prog->check_substr : prog->check_utf8);
4877
4878             if (!PL_colorset) reginitcolors();
4879             PerlIO_printf(Perl_debug_log,
4880                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4881                       PL_colors[4],
4882                       prog->check_substr ? "" : "utf8 ",
4883                       PL_colors[5],PL_colors[0],
4884                       s,
4885                       PL_colors[1],
4886                       (strlen(s) > 60 ? "..." : ""));
4887         } );
4888
4889     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4890 }
4891
4892 void
4893 Perl_pregfree(pTHX_ struct regexp *r)
4894 {
4895 #ifdef DEBUGGING
4896     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4897 #endif
4898
4899     if (!r || (--r->refcnt > 0))
4900         return;
4901     DEBUG_r({
4902         const char *s = (r->reganch & ROPT_UTF8)
4903             ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4904             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4905         const int len = SvCUR(dsv);
4906          if (!PL_colorset)
4907               reginitcolors();
4908          PerlIO_printf(Perl_debug_log,
4909                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4910                        PL_colors[4],PL_colors[5],PL_colors[0],
4911                        len, len, s,
4912                        PL_colors[1],
4913                        len > 60 ? "..." : "");
4914     });
4915
4916     /* gcov results gave these as non-null 100% of the time, so there's no
4917        optimisation in checking them before calling Safefree  */
4918     Safefree(r->precomp);
4919     Safefree(r->offsets);             /* 20010421 MJD */
4920     if (RX_MATCH_COPIED(r))
4921         Safefree(r->subbeg);
4922     if (r->substrs) {
4923         if (r->anchored_substr)
4924             SvREFCNT_dec(r->anchored_substr);
4925         if (r->anchored_utf8)
4926             SvREFCNT_dec(r->anchored_utf8);
4927         if (r->float_substr)
4928             SvREFCNT_dec(r->float_substr);
4929         if (r->float_utf8)
4930             SvREFCNT_dec(r->float_utf8);
4931         Safefree(r->substrs);
4932     }
4933     if (r->data) {
4934         int n = r->data->count;
4935         PAD* new_comppad = NULL;
4936         PAD* old_comppad;
4937         PADOFFSET refcnt;
4938
4939         while (--n >= 0) {
4940           /* If you add a ->what type here, update the comment in regcomp.h */
4941             switch (r->data->what[n]) {
4942             case 's':
4943                 SvREFCNT_dec((SV*)r->data->data[n]);
4944                 break;
4945             case 'f':
4946                 Safefree(r->data->data[n]);
4947                 break;
4948             case 'p':
4949                 new_comppad = (AV*)r->data->data[n];
4950                 break;
4951             case 'o':
4952                 if (new_comppad == NULL)
4953                     Perl_croak(aTHX_ "panic: pregfree comppad");
4954                 PAD_SAVE_LOCAL(old_comppad,
4955                     /* Watch out for global destruction's random ordering. */
4956                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4957                                 new_comppad : Null(PAD *)
4958                 );
4959                 OP_REFCNT_LOCK;
4960                 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4961                 OP_REFCNT_UNLOCK;
4962                 if (!refcnt)
4963                     op_free((OP_4tree*)r->data->data[n]);
4964
4965                 PAD_RESTORE_LOCAL(old_comppad);
4966                 SvREFCNT_dec((SV*)new_comppad);
4967                 new_comppad = NULL;
4968                 break;
4969             case 'n':
4970                 break;
4971             default:
4972                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4973             }
4974         }
4975         Safefree(r->data->what);
4976         Safefree(r->data);
4977     }
4978     Safefree(r->startp);
4979     Safefree(r->endp);
4980     Safefree(r);
4981 }
4982
4983 /*
4984  - regnext - dig the "next" pointer out of a node
4985  *
4986  * [Note, when REGALIGN is defined there are two places in regmatch()
4987  * that bypass this code for speed.]
4988  */
4989 regnode *
4990 Perl_regnext(pTHX_ register regnode *p)
4991 {
4992     register I32 offset;
4993
4994     if (p == &PL_regdummy)
4995         return(NULL);
4996
4997     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4998     if (offset == 0)
4999         return(NULL);
5000
5001     return(p+offset);
5002 }
5003
5004 STATIC void     
5005 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5006 {
5007     va_list args;
5008     STRLEN l1 = strlen(pat1);
5009     STRLEN l2 = strlen(pat2);
5010     char buf[512];
5011     SV *msv;
5012     const char *message;
5013
5014     if (l1 > 510)
5015         l1 = 510;
5016     if (l1 + l2 > 510)
5017         l2 = 510 - l1;
5018     Copy(pat1, buf, l1 , char);
5019     Copy(pat2, buf + l1, l2 , char);
5020     buf[l1 + l2] = '\n';
5021     buf[l1 + l2 + 1] = '\0';
5022 #ifdef I_STDARG
5023     /* ANSI variant takes additional second argument */
5024     va_start(args, pat2);
5025 #else
5026     va_start(args);
5027 #endif
5028     msv = vmess(buf, &args);
5029     va_end(args);
5030     message = SvPV_const(msv,l1);
5031     if (l1 > 512)
5032         l1 = 512;
5033     Copy(message, buf, l1 , char);
5034     buf[l1-1] = '\0';                   /* Overwrite \n */
5035     Perl_croak(aTHX_ "%s", buf);
5036 }
5037
5038 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5039
5040 void
5041 Perl_save_re_context(pTHX)
5042 {
5043
5044     struct re_save_state *state;
5045
5046     SAVEVPTR(PL_curcop);
5047     SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
5048
5049     state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
5050     PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
5051     SSPUSHINT(SAVEt_RE_STATE);
5052
5053     state->re_state_reg_flags = PL_reg_flags;
5054     state->re_state_bostr = PL_bostr;
5055     state->re_state_reginput = PL_reginput;
5056     state->re_state_regbol = PL_regbol;
5057     state->re_state_regeol = PL_regeol;
5058     state->re_state_regstartp = PL_regstartp;
5059     state->re_state_regendp = PL_regendp;
5060     state->re_state_reglastparen = PL_reglastparen;
5061     state->re_state_reglastcloseparen = PL_reglastcloseparen;
5062     state->re_state_regtill = PL_regtill;
5063     state->re_state_reg_start_tmp = PL_reg_start_tmp;
5064     state->re_state_reg_start_tmpl = PL_reg_start_tmpl;
5065     state->re_state_reg_eval_set = PL_reg_eval_set;
5066     state->re_state_regnarrate = PL_regnarrate;
5067     state->re_state_regindent = PL_regindent;
5068     state->re_state_reg_call_cc = PL_reg_call_cc;
5069     state->re_state_reg_re = PL_reg_re;
5070     state->re_state_reg_ganch = PL_reg_ganch;
5071     state->re_state_reg_sv = PL_reg_sv;
5072     state->re_state_reg_match_utf8 = PL_reg_match_utf8;
5073     state->re_state_reg_magic = PL_reg_magic;
5074     state->re_state_reg_oldpos = PL_reg_oldpos;
5075     state->re_state_reg_oldcurpm = PL_reg_oldcurpm;
5076     state->re_state_reg_curpm = PL_reg_curpm;
5077     state->re_state_reg_oldsaved = PL_reg_oldsaved;
5078     state->re_state_reg_oldsavedlen = PL_reg_oldsavedlen;
5079     state->re_state_reg_maxiter = PL_reg_maxiter;
5080     state->re_state_reg_leftiter = PL_reg_leftiter;
5081     state->re_state_reg_poscache = PL_reg_poscache;
5082     state->re_state_reg_poscache_size = PL_reg_poscache_size;
5083     state->re_state_regsize = PL_regsize;
5084     state->re_state_reg_starttry = PL_reg_starttry;
5085
5086     /* These variables have been eliminated from 5.10:  */
5087     state->re_state_regdata = PL_regdata;
5088     state->re_state_regprogram = PL_regprogram;
5089     state->re_state_regcc = PL_regcc;
5090     state->re_state_regprecomp = PL_regprecomp;
5091     state->re_state_regnpar = PL_regnpar;
5092
5093     PL_reg_start_tmp = 0;
5094     PL_reg_start_tmpl = 0;
5095     PL_reg_oldsaved = Nullch;
5096     PL_reg_oldsavedlen = 0;
5097     PL_reg_maxiter = 0;
5098     PL_reg_leftiter = 0;
5099     PL_reg_poscache = Nullch;
5100     PL_reg_poscache_size = 0;
5101
5102     {
5103         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5104         REGEXP *rx;
5105
5106         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5107             U32 i;
5108             for (i = 1; i <= rx->nparens; i++) {
5109                 GV *mgv;
5110                 char digits[TYPE_CHARS(long)];
5111 #ifdef USE_SNPRINTF
5112                 const STRLEN len = snprintf(digits, sizeof(digits), "%lu", (long)i);
5113 #else
5114                 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
5115 #endif /* #ifdef USE_SNPRINTF */
5116                 if ((mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV)))
5117                     save_scalar(mgv);
5118             }
5119         }
5120     }
5121 }
5122
5123 static void
5124 clear_re(pTHX_ void *r)
5125 {
5126     ReREFCNT_dec((regexp *)r);
5127 }
5128
5129 #ifdef DEBUGGING
5130
5131 STATIC void
5132 S_put_byte(pTHX_ SV *sv, int c)
5133 {
5134     if (isCNTRL(c) || c == 255 || !isPRINT(c))
5135         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5136     else if (c == '-' || c == ']' || c == '\\' || c == '^')
5137         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5138     else
5139         Perl_sv_catpvf(aTHX_ sv, "%c", c);
5140 }
5141
5142
5143 STATIC regnode *
5144 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5145 {
5146     register U8 op = EXACT;     /* Arbitrary non-END op. */
5147     register regnode *next;
5148
5149     while (op != END && (!last || node < last)) {
5150         /* While that wasn't END last time... */
5151
5152         NODE_ALIGN(node);
5153         op = OP(node);
5154         if (op == CLOSE)
5155             l--;        
5156         next = regnext(node);
5157         /* Where, what. */
5158         if (OP(node) == OPTIMIZED)
5159             goto after_print;
5160         regprop(sv, node);
5161         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5162                       (int)(2*l + 1), "", SvPVX_const(sv));
5163         if (next == NULL)               /* Next ptr. */
5164             PerlIO_printf(Perl_debug_log, "(0)");
5165         else
5166             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5167         (void)PerlIO_putc(Perl_debug_log, '\n');
5168       after_print:
5169         if (PL_regkind[(U8)op] == BRANCHJ) {
5170             register regnode *nnode = (OP(next) == LONGJMP
5171                                        ? regnext(next)
5172                                        : next);
5173             if (last && nnode > last)
5174                 nnode = last;
5175             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5176         }
5177         else if (PL_regkind[(U8)op] == BRANCH) {
5178             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5179         }
5180         else if ( op == CURLY) {   /* "next" might be very big: optimizer */
5181             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5182                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5183         }
5184         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5185             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5186                              next, sv, l + 1);
5187         }
5188         else if ( op == PLUS || op == STAR) {
5189             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5190         }
5191         else if (op == ANYOF) {
5192             /* arglen 1 + class block */
5193             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5194                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5195             node = NEXTOPER(node);
5196         }
5197         else if (PL_regkind[(U8)op] == EXACT) {
5198             /* Literal string, where present. */
5199             node += NODE_SZ_STR(node) - 1;
5200             node = NEXTOPER(node);
5201         }
5202         else {
5203             node = NEXTOPER(node);
5204             node += regarglen[(U8)op];
5205         }
5206         if (op == CURLYX || op == OPEN)
5207             l++;
5208         else if (op == WHILEM)
5209             l--;
5210     }
5211     return node;
5212 }
5213
5214 #endif  /* DEBUGGING */
5215
5216 /*
5217  * Local variables:
5218  * c-indentation-style: bsd
5219  * c-basic-offset: 4
5220  * indent-tabs-mode: t
5221  * End:
5222  *
5223  * ex: set ts=8 sts=4 sw=4 noet:
5224  */