This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* This file contains functions for compiling a regular expression.  See
9  * also regexec.c which funnily enough, contains functions for executing
10  * a regular expression.
11  *
12  * This file is also copied at build time to ext/re/re_comp.c, where
13  * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14  * This causes the main functions to be compiled under new names and with
15  * debugging support added, which makes "use re 'debug'" work.
16  */
17
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19  * confused with the original package (see point 3 below).  Thanks, Henry!
20  */
21
22 /* Additional note: this code is very heavily munged from Henry's version
23  * in places.  In some spots I've traded clarity for efficiency, so don't
24  * blame Henry for some of the lack of readability.
25  */
26
27 /* The names of the functions have been changed from regcomp and
28  * regexec to  pregcomp and pregexec in order to avoid conflicts
29  * with the POSIX routines of the same names.
30 */
31
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 #  ifndef PERL_IN_XSUB_RE
35 #    define PERL_IN_XSUB_RE
36 #  endif
37 /* need access to debugger hooks */
38 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
39 #    define DEBUGGING
40 #  endif
41 #endif
42
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 #  define Perl_pregcomp my_regcomp
46 #  define Perl_regdump my_regdump
47 #  define Perl_regprop my_regprop
48 #  define Perl_pregfree my_regfree
49 #  define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 #  define Perl_regnext my_regnext
52 #  define Perl_save_re_context my_save_re_context
53 #  define Perl_reginitcolors my_reginitcolors
54
55 #  define PERL_NO_GET_CONTEXT
56 #endif
57
58 /*SUPPRESS 112*/
59 /*
60  * pregcomp and pregexec -- regsub and regerror are not used in perl
61  *
62  *      Copyright (c) 1986 by University of Toronto.
63  *      Written by Henry Spencer.  Not derived from licensed software.
64  *
65  *      Permission is granted to anyone to use this software for any
66  *      purpose on any computer system, and to redistribute it freely,
67  *      subject to the following restrictions:
68  *
69  *      1. The author is not responsible for the consequences of use of
70  *              this software, no matter how awful, even if they arise
71  *              from defects in it.
72  *
73  *      2. The origin of this software must not be misrepresented, either
74  *              by explicit claim or by omission.
75  *
76  *      3. Altered versions must be plainly marked as such, and must not
77  *              be misrepresented as being the original software.
78  *
79  *
80  ****    Alterations to Henry's code are...
81  ****
82  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83  ****    2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84  ****
85  ****    You may distribute under the terms of either the GNU General Public
86  ****    License or the Artistic License, as specified in the README file.
87
88  *
89  * Beware that some of this code is subtly aware of the way operator
90  * precedence is structured in regular expressions.  Serious changes in
91  * regular-expression syntax might require a total rethink.
92  */
93 #include "EXTERN.h"
94 #define PERL_IN_REGCOMP_C
95 #include "perl.h"
96
97 #ifndef PERL_IN_XSUB_RE
98 #  include "INTERN.h"
99 #endif
100
101 #define REG_COMP_C
102 #include "regcomp.h"
103
104 #ifdef op
105 #undef op
106 #endif /* op */
107
108 #ifdef MSDOS
109 #  if defined(BUGGY_MSC6)
110  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 #    pragma optimize("a",off)
112  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 #    pragma optimize("w",on )
114 #  endif /* BUGGY_MSC6 */
115 #endif /* MSDOS */
116
117 #ifndef STATIC
118 #define STATIC  static
119 #endif
120
121 typedef struct RExC_state_t {
122     U32         flags;                  /* are we folding, multilining? */
123     char        *precomp;               /* uncompiled string. */
124     regexp      *rx;
125     char        *start;                 /* Start of input for compile */
126     char        *end;                   /* End of input for compile */
127     char        *parse;                 /* Input-scan pointer. */
128     I32         whilem_seen;            /* number of WHILEM in this expr */
129     regnode     *emit_start;            /* Start of emitted-code area */
130     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
131     I32         naughty;                /* How bad is this pattern? */
132     I32         sawback;                /* Did we see \1, ...? */
133     U32         seen;
134     I32         size;                   /* Code size. */
135     I32         npar;                   /* () count. */
136     I32         extralen;
137     I32         seen_zerolen;
138     I32         seen_evals;
139     I32         utf8;
140 #if ADD_TO_REGEXEC
141     char        *starttry;              /* -Dr: where regtry was called. */
142 #define RExC_starttry   (pRExC_state->starttry)
143 #endif
144 } RExC_state_t;
145
146 #define RExC_flags      (pRExC_state->flags)
147 #define RExC_precomp    (pRExC_state->precomp)
148 #define RExC_rx         (pRExC_state->rx)
149 #define RExC_start      (pRExC_state->start)
150 #define RExC_end        (pRExC_state->end)
151 #define RExC_parse      (pRExC_state->parse)
152 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
153 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit       (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty    (pRExC_state->naughty)
157 #define RExC_sawback    (pRExC_state->sawback)
158 #define RExC_seen       (pRExC_state->seen)
159 #define RExC_size       (pRExC_state->size)
160 #define RExC_npar       (pRExC_state->npar)
161 #define RExC_extralen   (pRExC_state->extralen)
162 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8       (pRExC_state->utf8)
165
166 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168         ((*s) == '{' && regcurly(s)))
169
170 #ifdef SPSTART
171 #undef SPSTART          /* dratted cpp namespace... */
172 #endif
173 /*
174  * Flags to be passed up and down.
175  */
176 #define WORST           0       /* Worst case. */
177 #define HASWIDTH        0x1     /* Known to match non-null strings. */
178 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART         0x4     /* Starts with * or +. */
180 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
181
182 /* Length of a variant. */
183
184 typedef struct scan_data_t {
185     I32 len_min;
186     I32 len_delta;
187     I32 pos_min;
188     I32 pos_delta;
189     SV *last_found;
190     I32 last_end;                       /* min value, <0 unless valid. */
191     I32 last_start_min;
192     I32 last_start_max;
193     SV **longest;                       /* Either &l_fixed, or &l_float. */
194     SV *longest_fixed;
195     I32 offset_fixed;
196     SV *longest_float;
197     I32 offset_float_min;
198     I32 offset_float_max;
199     I32 flags;
200     I32 whilem_c;
201     I32 *last_closep;
202     struct regnode_charclass_class *start_class;
203 } scan_data_t;
204
205 /*
206  * Forward declarations for pregcomp()'s friends.
207  */
208
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
210                                       0, 0, 0, 0, 0, 0};
211
212 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL          0x1
214 #define SF_BEFORE_MEOL          0x2
215 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
218 #ifdef NO_UNARY_PLUS
219 #  define SF_FIX_SHIFT_EOL      (0+2)
220 #  define SF_FL_SHIFT_EOL               (0+4)
221 #else
222 #  define SF_FIX_SHIFT_EOL      (+2)
223 #  define SF_FL_SHIFT_EOL               (+4)
224 #endif
225
226 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF               0x40
232 #define SF_HAS_PAR              0x80
233 #define SF_IN_PAR               0x100
234 #define SF_HAS_EVAL             0x200
235 #define SCF_DO_SUBSTR           0x400
236 #define SCF_DO_STCLASS_AND      0x0800
237 #define SCF_DO_STCLASS_OR       0x1000
238 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS  0x2000
240
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244
245 #define OOB_UNICODE             12345678
246 #define OOB_NAMEDCLASS          -1
247
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
251
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
254
255 /*
256  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258  * op/pragma/warn/regcomp.
259  */
260 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
262
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
264
265 /*
266  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267  * arg. Show regex, up to a maximum length. If it's too long, chop and add
268  * "...".
269  */
270 #define FAIL(msg) STMT_START {                                          \
271     const char *ellipses = "";                                          \
272     IV len = RExC_end - RExC_precomp;                                   \
273                                                                         \
274     if (!SIZE_ONLY)                                                     \
275         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
276     if (len > RegexLengthToShowInErrorMessages) {                       \
277         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
278         len = RegexLengthToShowInErrorMessages - 10;                    \
279         ellipses = "...";                                               \
280     }                                                                   \
281     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
282             msg, (int)len, RExC_precomp, ellipses);                     \
283 } STMT_END
284
285 /*
286  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287  * args. Show regex, up to a maximum length. If it's too long, chop and add
288  * "...".
289  */
290 #define FAIL2(pat,msg) STMT_START {                                     \
291     const char *ellipses = "";                                          \
292     IV len = RExC_end - RExC_precomp;                                   \
293                                                                         \
294     if (!SIZE_ONLY)                                                     \
295         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
296     if (len > RegexLengthToShowInErrorMessages) {                       \
297         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
298         len = RegexLengthToShowInErrorMessages - 10;                    \
299         ellipses = "...";                                               \
300     }                                                                   \
301     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
302             msg, (int)len, RExC_precomp, ellipses);                     \
303 } STMT_END
304
305
306 /*
307  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308  */
309 #define Simple_vFAIL(m) STMT_START {                                    \
310     IV offset = RExC_parse - RExC_precomp;                              \
311     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
312             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
313 } STMT_END
314
315 /*
316  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317  */
318 #define vFAIL(m) STMT_START {                           \
319     if (!SIZE_ONLY)                                     \
320         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
321     Simple_vFAIL(m);                                    \
322 } STMT_END
323
324 /*
325  * Like Simple_vFAIL(), but accepts two arguments.
326  */
327 #define Simple_vFAIL2(m,a1) STMT_START {                        \
328     IV offset = RExC_parse - RExC_precomp;                      \
329     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
330             (int)offset, RExC_precomp, RExC_precomp + offset);  \
331 } STMT_END
332
333 /*
334  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335  */
336 #define vFAIL2(m,a1) STMT_START {                       \
337     if (!SIZE_ONLY)                                     \
338         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
339     Simple_vFAIL2(m, a1);                               \
340 } STMT_END
341
342
343 /*
344  * Like Simple_vFAIL(), but accepts three arguments.
345  */
346 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
347     IV offset = RExC_parse - RExC_precomp;                      \
348     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
349             (int)offset, RExC_precomp, RExC_precomp + offset);  \
350 } STMT_END
351
352 /*
353  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354  */
355 #define vFAIL3(m,a1,a2) STMT_START {                    \
356     if (!SIZE_ONLY)                                     \
357         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
358     Simple_vFAIL3(m, a1, a2);                           \
359 } STMT_END
360
361 /*
362  * Like Simple_vFAIL(), but accepts four arguments.
363  */
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
365     IV offset = RExC_parse - RExC_precomp;                      \
366     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
367             (int)offset, RExC_precomp, RExC_precomp + offset);  \
368 } STMT_END
369
370 /*
371  * Like Simple_vFAIL(), but accepts five arguments.
372  */
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
374     IV offset = RExC_parse - RExC_precomp;                      \
375     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
376             (int)offset, RExC_precomp, RExC_precomp + offset);  \
377 } STMT_END
378
379
380 #define vWARN(loc,m) STMT_START {                                       \
381     IV offset = loc - RExC_precomp;                                     \
382     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
383             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
384 } STMT_END
385
386 #define vWARNdep(loc,m) STMT_START {                                    \
387     IV offset = loc - RExC_precomp;                                     \
388     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
389             "%s" REPORT_LOCATION,                                       \
390             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
391 } STMT_END
392
393
394 #define vWARN2(loc, m, a1) STMT_START {                                 \
395     IV offset = loc - RExC_precomp;                                     \
396     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
397             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
398 } STMT_END
399
400 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
401     IV offset = loc - RExC_precomp;                                     \
402     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
403             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
404 } STMT_END
405
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
407     IV offset = loc - RExC_precomp;                                     \
408     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
409             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410 } STMT_END
411
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
413     IV offset = loc - RExC_precomp;                                     \
414     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
415             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416 } STMT_END
417
418
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START {                  \
421     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422 } STMT_END
423
424 /* Macros for recording node offsets.   20001227 mjd@plover.com 
425  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
426  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
427  * Element 0 holds the number n.
428  */
429
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
432
433
434 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
435     if (! SIZE_ONLY) {                                                  \
436         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
437                 __LINE__, (node), (byte)));                             \
438         if((node) < 0) {                                                \
439             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440         } else {                                                        \
441             RExC_offsets[2*(node)-1] = (byte);                          \
442         }                                                               \
443     }                                                                   \
444 } STMT_END
445
446 #define Set_Node_Offset(node,byte) \
447     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
451     if (! SIZE_ONLY) {                                                  \
452         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
453                 __LINE__, (node), (len)));                              \
454         if((node) < 0) {                                                \
455             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456         } else {                                                        \
457             RExC_offsets[2*(node)] = (len);                             \
458         }                                                               \
459     }                                                                   \
460 } STMT_END
461
462 #define Set_Node_Length(node,len) \
463     Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466     Set_Node_Length(node, RExC_parse - parse_start)
467
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
472 static void clear_re(pTHX_ void *r);
473
474 /* Mark that we cannot extend a found fixed substring at this point.
475    Updata the longest found anchored substring and the longest found
476    floating substrings if needed. */
477
478 STATIC void
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
480 {
481     const STRLEN l = CHR_SVLEN(data->last_found);
482     const STRLEN old_l = CHR_SVLEN(*data->longest);
483
484     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485         SvSetMagicSV(*data->longest, data->last_found);
486         if (*data->longest == data->longest_fixed) {
487             data->offset_fixed = l ? data->last_start_min : data->pos_min;
488             if (data->flags & SF_BEFORE_EOL)
489                 data->flags
490                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491             else
492                 data->flags &= ~SF_FIX_BEFORE_EOL;
493         }
494         else {
495             data->offset_float_min = l ? data->last_start_min : data->pos_min;
496             data->offset_float_max = (l
497                                       ? data->last_start_max
498                                       : data->pos_min + data->pos_delta);
499             if ((U32)data->offset_float_max > (U32)I32_MAX)
500                 data->offset_float_max = I32_MAX;
501             if (data->flags & SF_BEFORE_EOL)
502                 data->flags
503                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504             else
505                 data->flags &= ~SF_FL_BEFORE_EOL;
506         }
507     }
508     SvCUR_set(data->last_found, 0);
509     {
510         SV * sv = data->last_found;
511         MAGIC *mg =
512             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513         if (mg && mg->mg_len > 0)
514             mg->mg_len = 0;
515     }
516     data->last_end = -1;
517     data->flags &= ~SF_BEFORE_EOL;
518 }
519
520 /* Can match anything (initialization) */
521 STATIC void
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
523 {
524     ANYOF_CLASS_ZERO(cl);
525     ANYOF_BITMAP_SETALL(cl);
526     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
527     if (LOC)
528         cl->flags |= ANYOF_LOCALE;
529 }
530
531 /* Can match anything (initialization) */
532 STATIC int
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534 {
535     int value;
536
537     for (value = 0; value <= ANYOF_MAX; value += 2)
538         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539             return 1;
540     if (!(cl->flags & ANYOF_UNICODE_ALL))
541         return 0;
542     if (!ANYOF_BITMAP_TESTALLSET(cl))
543         return 0;
544     return 1;
545 }
546
547 /* Can match anything (initialization) */
548 STATIC void
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
550 {
551     Zero(cl, 1, struct regnode_charclass_class);
552     cl->type = ANYOF;
553     cl_anything(pRExC_state, cl);
554 }
555
556 STATIC void
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
558 {
559     Zero(cl, 1, struct regnode_charclass_class);
560     cl->type = ANYOF;
561     cl_anything(pRExC_state, cl);
562     if (LOC)
563         cl->flags |= ANYOF_LOCALE;
564 }
565
566 /* 'And' a given class with another one.  Can create false positives */
567 /* We assume that cl is not inverted */
568 STATIC void
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570          struct regnode_charclass_class *and_with)
571 {
572     if (!(and_with->flags & ANYOF_CLASS)
573         && !(cl->flags & ANYOF_CLASS)
574         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575         && !(and_with->flags & ANYOF_FOLD)
576         && !(cl->flags & ANYOF_FOLD)) {
577         int i;
578
579         if (and_with->flags & ANYOF_INVERT)
580             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581                 cl->bitmap[i] &= ~and_with->bitmap[i];
582         else
583             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584                 cl->bitmap[i] &= and_with->bitmap[i];
585     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586     if (!(and_with->flags & ANYOF_EOS))
587         cl->flags &= ~ANYOF_EOS;
588
589     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590         !(and_with->flags & ANYOF_INVERT)) {
591         cl->flags &= ~ANYOF_UNICODE_ALL;
592         cl->flags |= ANYOF_UNICODE;
593         ARG_SET(cl, ARG(and_with));
594     }
595     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596         !(and_with->flags & ANYOF_INVERT))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599         !(and_with->flags & ANYOF_INVERT))
600         cl->flags &= ~ANYOF_UNICODE;
601 }
602
603 /* 'OR' a given class with another one.  Can create false positives */
604 /* We assume that cl is not inverted */
605 STATIC void
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
607 {
608     if (or_with->flags & ANYOF_INVERT) {
609         /* We do not use
610          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611          *   <= (B1 | !B2) | (CL1 | !CL2)
612          * which is wasteful if CL2 is small, but we ignore CL2:
613          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614          * XXXX Can we handle case-fold?  Unclear:
615          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617          */
618         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619              && !(or_with->flags & ANYOF_FOLD)
620              && !(cl->flags & ANYOF_FOLD) ) {
621             int i;
622
623             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624                 cl->bitmap[i] |= ~or_with->bitmap[i];
625         } /* XXXX: logic is complicated otherwise */
626         else {
627             cl_anything(pRExC_state, cl);
628         }
629     } else {
630         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632              && (!(or_with->flags & ANYOF_FOLD)
633                  || (cl->flags & ANYOF_FOLD)) ) {
634             int i;
635
636             /* OR char bitmap and class bitmap separately */
637             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638                 cl->bitmap[i] |= or_with->bitmap[i];
639             if (or_with->flags & ANYOF_CLASS) {
640                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641                     cl->classflags[i] |= or_with->classflags[i];
642                 cl->flags |= ANYOF_CLASS;
643             }
644         }
645         else { /* XXXX: logic is complicated, leave it along for a moment. */
646             cl_anything(pRExC_state, cl);
647         }
648     }
649     if (or_with->flags & ANYOF_EOS)
650         cl->flags |= ANYOF_EOS;
651
652     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653         ARG(cl) != ARG(or_with)) {
654         cl->flags |= ANYOF_UNICODE_ALL;
655         cl->flags &= ~ANYOF_UNICODE;
656     }
657     if (or_with->flags & ANYOF_UNICODE_ALL) {
658         cl->flags |= ANYOF_UNICODE_ALL;
659         cl->flags &= ~ANYOF_UNICODE;
660     }
661 }
662
663 /*
664  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665  * These need to be revisited when a newer toolchain becomes available.
666  */
667 #if defined(__sparc64__) && defined(__GNUC__)
668 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669 #       undef  SPARC64_GCC_WORKAROUND
670 #       define SPARC64_GCC_WORKAROUND 1
671 #   endif
672 #endif
673
674 /* REx optimizer.  Converts nodes into quickier variants "in place".
675    Finds fixed substrings.  */
676
677 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
678    to the position after last scanned or to NULL. */
679
680 STATIC I32
681 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
682                         /* scanp: Start here (read-write). */
683                         /* deltap: Write maxlen-minlen here. */
684                         /* last: Stop before this one. */
685 {
686     I32 min = 0, pars = 0, code;
687     regnode *scan = *scanp, *next;
688     I32 delta = 0;
689     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
690     int is_inf_internal = 0;            /* The studied chunk is infinite */
691     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692     scan_data_t data_fake;
693     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
694
695     while (scan && OP(scan) != END && scan < last) {
696         /* Peephole optimizer: */
697
698         if (PL_regkind[(U8)OP(scan)] == EXACT) {
699             /* Merge several consecutive EXACTish nodes into one. */
700             regnode *n = regnext(scan);
701             U32 stringok = 1;
702 #ifdef DEBUGGING
703             regnode *stop = scan;
704 #endif
705
706             next = scan + NODE_SZ_STR(scan);
707             /* Skip NOTHING, merge EXACT*. */
708             while (n &&
709                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
710                      (stringok && (OP(n) == OP(scan))))
711                    && NEXT_OFF(n)
712                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713                 if (OP(n) == TAIL || n > next)
714                     stringok = 0;
715                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
716                     NEXT_OFF(scan) += NEXT_OFF(n);
717                     next = n + NODE_STEP_REGNODE;
718 #ifdef DEBUGGING
719                     if (stringok)
720                         stop = n;
721 #endif
722                     n = regnext(n);
723                 }
724                 else if (stringok) {
725                     const int oldl = STR_LEN(scan);
726                     regnode *nnext = regnext(n);
727
728                     if (oldl + STR_LEN(n) > U8_MAX)
729                         break;
730                     NEXT_OFF(scan) += NEXT_OFF(n);
731                     STR_LEN(scan) += STR_LEN(n);
732                     next = n + NODE_SZ_STR(n);
733                     /* Now we can overwrite *n : */
734                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
735 #ifdef DEBUGGING
736                     stop = next - 1;
737 #endif
738                     n = nnext;
739                 }
740             }
741
742             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
743 /*
744   Two problematic code points in Unicode casefolding of EXACT nodes:
745
746    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
748
749    which casefold to
750
751    Unicode                      UTF-8
752
753    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
754    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
755
756    This means that in case-insensitive matching (or "loose matching",
757    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758    length of the above casefolded versions) can match a target string
759    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760    This would rather mess up the minimum length computation.
761
762    What we'll do is to look for the tail four bytes, and then peek
763    at the preceding two bytes to see whether we need to decrease
764    the minimum length by four (six minus two).
765
766    Thanks to the design of UTF-8, there cannot be false matches:
767    A sequence of valid UTF-8 bytes cannot be a subsequence of
768    another valid sequence of UTF-8 bytes.
769
770 */
771                  char *s0 = STRING(scan), *s, *t;
772                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773                  const char *t0 = "\xcc\x88\xcc\x81";
774                  const char *t1 = t0 + 3;
775
776                  for (s = s0 + 2;
777                       s < s2 && (t = ninstr(s, s1, t0, t1));
778                       s = t + 4) {
779                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
781                            min -= 4;
782                  }
783             }
784
785 #ifdef DEBUGGING
786             /* Allow dumping */
787             n = scan + NODE_SZ_STR(scan);
788             while (n <= stop) {
789                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
790                     OP(n) = OPTIMIZED;
791                     NEXT_OFF(n) = 0;
792                 }
793                 n++;
794             }
795 #endif
796         }
797         /* Follow the next-chain of the current node and optimize
798            away all the NOTHINGs from it.  */
799         if (OP(scan) != CURLYX) {
800             const int max = (reg_off_by_arg[OP(scan)]
801                        ? I32_MAX
802                        /* I32 may be smaller than U16 on CRAYs! */
803                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
804             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
805             int noff;
806             regnode *n = scan;
807         
808             /* Skip NOTHING and LONGJMP. */
809             while ((n = regnext(n))
810                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
811                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
812                    && off + noff < max)
813                 off += noff;
814             if (reg_off_by_arg[OP(scan)])
815                 ARG(scan) = off;
816             else
817                 NEXT_OFF(scan) = off;
818         }
819         /* The principal pseudo-switch.  Cannot be a switch, since we
820            look into several different things.  */
821         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
822                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823             next = regnext(scan);
824             code = OP(scan);
825         
826             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
827                 I32 max1 = 0, min1 = I32_MAX, num = 0;
828                 struct regnode_charclass_class accum;
829                 
830                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
831                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
832                 if (flags & SCF_DO_STCLASS)
833                     cl_init_zero(pRExC_state, &accum);
834                 while (OP(scan) == code) {
835                     I32 deltanext, minnext, f = 0, fake;
836                     struct regnode_charclass_class this_class;
837
838                     num++;
839                     data_fake.flags = 0;
840                     if (data) {         
841                         data_fake.whilem_c = data->whilem_c;
842                         data_fake.last_closep = data->last_closep;
843                     }
844                     else
845                         data_fake.last_closep = &fake;
846                     next = regnext(scan);
847                     scan = NEXTOPER(scan);
848                     if (code != BRANCH)
849                         scan = NEXTOPER(scan);
850                     if (flags & SCF_DO_STCLASS) {
851                         cl_init(pRExC_state, &this_class);
852                         data_fake.start_class = &this_class;
853                         f = SCF_DO_STCLASS_AND;
854                     }           
855                     if (flags & SCF_WHILEM_VISITED_POS)
856                         f |= SCF_WHILEM_VISITED_POS;
857                     /* we suppose the run is continuous, last=next...*/
858                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
859                                           next, &data_fake, f);
860                     if (min1 > minnext)
861                         min1 = minnext;
862                     if (max1 < minnext + deltanext)
863                         max1 = minnext + deltanext;
864                     if (deltanext == I32_MAX)
865                         is_inf = is_inf_internal = 1;
866                     scan = next;
867                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
868                         pars++;
869                     if (data && (data_fake.flags & SF_HAS_EVAL))
870                         data->flags |= SF_HAS_EVAL;
871                     if (data)
872                         data->whilem_c = data_fake.whilem_c;
873                     if (flags & SCF_DO_STCLASS)
874                         cl_or(pRExC_state, &accum, &this_class);
875                     if (code == SUSPEND)
876                         break;
877                 }
878                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
879                     min1 = 0;
880                 if (flags & SCF_DO_SUBSTR) {
881                     data->pos_min += min1;
882                     data->pos_delta += max1 - min1;
883                     if (max1 != min1 || is_inf)
884                         data->longest = &(data->longest_float);
885                 }
886                 min += min1;
887                 delta += max1 - min1;
888                 if (flags & SCF_DO_STCLASS_OR) {
889                     cl_or(pRExC_state, data->start_class, &accum);
890                     if (min1) {
891                         cl_and(data->start_class, &and_with);
892                         flags &= ~SCF_DO_STCLASS;
893                     }
894                 }
895                 else if (flags & SCF_DO_STCLASS_AND) {
896                     if (min1) {
897                         cl_and(data->start_class, &accum);
898                         flags &= ~SCF_DO_STCLASS;
899                     }
900                     else {
901                         /* Switch to OR mode: cache the old value of
902                          * data->start_class */
903                         StructCopy(data->start_class, &and_with,
904                                    struct regnode_charclass_class);
905                         flags &= ~SCF_DO_STCLASS_AND;
906                         StructCopy(&accum, data->start_class,
907                                    struct regnode_charclass_class);
908                         flags |= SCF_DO_STCLASS_OR;
909                         data->start_class->flags |= ANYOF_EOS;
910                     }
911                 }
912             }
913             else if (code == BRANCHJ)   /* single branch is optimized. */
914                 scan = NEXTOPER(NEXTOPER(scan));
915             else                        /* single branch is optimized. */
916                 scan = NEXTOPER(scan);
917             continue;
918         }
919         else if (OP(scan) == EXACT) {
920             I32 l = STR_LEN(scan);
921             UV uc = *((U8*)STRING(scan));
922             if (UTF) {
923                 const U8 * const s = (U8*)STRING(scan);
924                 l = utf8_length(s, s + l);
925                 uc = utf8_to_uvchr(s, NULL);
926             }
927             min += l;
928             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
929                 /* The code below prefers earlier match for fixed
930                    offset, later match for variable offset.  */
931                 if (data->last_end == -1) { /* Update the start info. */
932                     data->last_start_min = data->pos_min;
933                     data->last_start_max = is_inf
934                         ? I32_MAX : data->pos_min + data->pos_delta;
935                 }
936                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
937                 {
938                     SV * sv = data->last_found;
939                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
941                     if (mg && mg->mg_len >= 0)
942                         mg->mg_len += utf8_length((U8*)STRING(scan),
943                                                   (U8*)STRING(scan)+STR_LEN(scan));
944                 }
945                 if (UTF)
946                     SvUTF8_on(data->last_found);
947                 data->last_end = data->pos_min + l;
948                 data->pos_min += l; /* As in the first entry. */
949                 data->flags &= ~SF_BEFORE_EOL;
950             }
951             if (flags & SCF_DO_STCLASS_AND) {
952                 /* Check whether it is compatible with what we know already! */
953                 int compat = 1;
954
955                 if (uc >= 0x100 ||
956                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
957                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
958                     && (!(data->start_class->flags & ANYOF_FOLD)
959                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
960                     )
961                     compat = 0;
962                 ANYOF_CLASS_ZERO(data->start_class);
963                 ANYOF_BITMAP_ZERO(data->start_class);
964                 if (compat)
965                     ANYOF_BITMAP_SET(data->start_class, uc);
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 if (uc < 0x100)
968                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
969             }
970             else if (flags & SCF_DO_STCLASS_OR) {
971                 /* false positive possible if the class is case-folded */
972                 if (uc < 0x100)
973                     ANYOF_BITMAP_SET(data->start_class, uc);
974                 else
975                     data->start_class->flags |= ANYOF_UNICODE_ALL;
976                 data->start_class->flags &= ~ANYOF_EOS;
977                 cl_and(data->start_class, &and_with);
978             }
979             flags &= ~SCF_DO_STCLASS;
980         }
981         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
982             I32 l = STR_LEN(scan);
983             UV uc = *((U8*)STRING(scan));
984
985             /* Search for fixed substrings supports EXACT only. */
986             if (flags & SCF_DO_SUBSTR)
987                 scan_commit(pRExC_state, data);
988             if (UTF) {
989                 U8 *s = (U8 *)STRING(scan);
990                 l = utf8_length(s, s + l);
991                 uc = utf8_to_uvchr(s, NULL);
992             }
993             min += l;
994             if (data && (flags & SCF_DO_SUBSTR))
995                 data->pos_min += l;
996             if (flags & SCF_DO_STCLASS_AND) {
997                 /* Check whether it is compatible with what we know already! */
998                 int compat = 1;
999
1000                 if (uc >= 0x100 ||
1001                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1002                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
1003                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1004                     compat = 0;
1005                 ANYOF_CLASS_ZERO(data->start_class);
1006                 ANYOF_BITMAP_ZERO(data->start_class);
1007                 if (compat) {
1008                     ANYOF_BITMAP_SET(data->start_class, uc);
1009                     data->start_class->flags &= ~ANYOF_EOS;
1010                     data->start_class->flags |= ANYOF_FOLD;
1011                     if (OP(scan) == EXACTFL)
1012                         data->start_class->flags |= ANYOF_LOCALE;
1013                 }
1014             }
1015             else if (flags & SCF_DO_STCLASS_OR) {
1016                 if (data->start_class->flags & ANYOF_FOLD) {
1017                     /* false positive possible if the class is case-folded.
1018                        Assume that the locale settings are the same... */
1019                     if (uc < 0x100)
1020                         ANYOF_BITMAP_SET(data->start_class, uc);
1021                     data->start_class->flags &= ~ANYOF_EOS;
1022                 }
1023                 cl_and(data->start_class, &and_with);
1024             }
1025             flags &= ~SCF_DO_STCLASS;
1026         }
1027         else if (strchr((const char*)PL_varies,OP(scan))) {
1028             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1029             I32 f = flags, pos_before = 0;
1030             regnode *oscan = scan;
1031             struct regnode_charclass_class this_class;
1032             struct regnode_charclass_class *oclass = NULL;
1033             I32 next_is_eval = 0;
1034
1035             switch (PL_regkind[(U8)OP(scan)]) {
1036             case WHILEM:                /* End of (?:...)* . */
1037                 scan = NEXTOPER(scan);
1038                 goto finish;
1039             case PLUS:
1040                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1041                     next = NEXTOPER(scan);
1042                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1043                         mincount = 1;
1044                         maxcount = REG_INFTY;
1045                         next = regnext(scan);
1046                         scan = NEXTOPER(scan);
1047                         goto do_curly;
1048                     }
1049                 }
1050                 if (flags & SCF_DO_SUBSTR)
1051                     data->pos_min++;
1052                 min++;
1053                 /* Fall through. */
1054             case STAR:
1055                 if (flags & SCF_DO_STCLASS) {
1056                     mincount = 0;
1057                     maxcount = REG_INFTY;
1058                     next = regnext(scan);
1059                     scan = NEXTOPER(scan);
1060                     goto do_curly;
1061                 }
1062                 is_inf = is_inf_internal = 1;
1063                 scan = regnext(scan);
1064                 if (flags & SCF_DO_SUBSTR) {
1065                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1066                     data->longest = &(data->longest_float);
1067                 }
1068                 goto optimize_curly_tail;
1069             case CURLY:
1070                 mincount = ARG1(scan);
1071                 maxcount = ARG2(scan);
1072                 next = regnext(scan);
1073                 if (OP(scan) == CURLYX) {
1074                     I32 lp = (data ? *(data->last_closep) : 0);
1075
1076                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1077                 }
1078                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1079                 next_is_eval = (OP(scan) == EVAL);
1080               do_curly:
1081                 if (flags & SCF_DO_SUBSTR) {
1082                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1083                     pos_before = data->pos_min;
1084                 }
1085                 if (data) {
1086                     fl = data->flags;
1087                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1088                     if (is_inf)
1089                         data->flags |= SF_IS_INF;
1090                 }
1091                 if (flags & SCF_DO_STCLASS) {
1092                     cl_init(pRExC_state, &this_class);
1093                     oclass = data->start_class;
1094                     data->start_class = &this_class;
1095                     f |= SCF_DO_STCLASS_AND;
1096                     f &= ~SCF_DO_STCLASS_OR;
1097                 }
1098                 /* These are the cases when once a subexpression
1099                    fails at a particular position, it cannot succeed
1100                    even after backtracking at the enclosing scope.
1101                 
1102                    XXXX what if minimal match and we are at the
1103                         initial run of {n,m}? */
1104                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105                     f &= ~SCF_WHILEM_VISITED_POS;
1106
1107                 /* This will finish on WHILEM, setting scan, or on NULL: */
1108                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1109                                       mincount == 0
1110                                         ? (f & ~SCF_DO_SUBSTR) : f);
1111
1112                 if (flags & SCF_DO_STCLASS)
1113                     data->start_class = oclass;
1114                 if (mincount == 0 || minnext == 0) {
1115                     if (flags & SCF_DO_STCLASS_OR) {
1116                         cl_or(pRExC_state, data->start_class, &this_class);
1117                     }
1118                     else if (flags & SCF_DO_STCLASS_AND) {
1119                         /* Switch to OR mode: cache the old value of
1120                          * data->start_class */
1121                         StructCopy(data->start_class, &and_with,
1122                                    struct regnode_charclass_class);
1123                         flags &= ~SCF_DO_STCLASS_AND;
1124                         StructCopy(&this_class, data->start_class,
1125                                    struct regnode_charclass_class);
1126                         flags |= SCF_DO_STCLASS_OR;
1127                         data->start_class->flags |= ANYOF_EOS;
1128                     }
1129                 } else {                /* Non-zero len */
1130                     if (flags & SCF_DO_STCLASS_OR) {
1131                         cl_or(pRExC_state, data->start_class, &this_class);
1132                         cl_and(data->start_class, &and_with);
1133                     }
1134                     else if (flags & SCF_DO_STCLASS_AND)
1135                         cl_and(data->start_class, &this_class);
1136                     flags &= ~SCF_DO_STCLASS;
1137                 }
1138                 if (!scan)              /* It was not CURLYX, but CURLY. */
1139                     scan = next;
1140                 if (ckWARN(WARN_REGEXP)
1141                        /* ? quantifier ok, except for (?{ ... }) */
1142                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1143                     && (minnext == 0) && (deltanext == 0)
1144                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1145                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1146                 {
1147                     vWARN(RExC_parse,
1148                           "Quantifier unexpected on zero-length expression");
1149                 }
1150
1151                 min += minnext * mincount;
1152                 is_inf_internal |= ((maxcount == REG_INFTY
1153                                      && (minnext + deltanext) > 0)
1154                                     || deltanext == I32_MAX);
1155                 is_inf |= is_inf_internal;
1156                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1157
1158                 /* Try powerful optimization CURLYX => CURLYN. */
1159                 if (  OP(oscan) == CURLYX && data
1160                       && data->flags & SF_IN_PAR
1161                       && !(data->flags & SF_HAS_EVAL)
1162                       && !deltanext && minnext == 1 ) {
1163                     /* Try to optimize to CURLYN.  */
1164                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1165                     regnode *nxt1 = nxt;
1166 #ifdef DEBUGGING
1167                     regnode *nxt2;
1168 #endif
1169
1170                     /* Skip open. */
1171                     nxt = regnext(nxt);
1172                     if (!strchr((const char*)PL_simple,OP(nxt))
1173                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1174                              && STR_LEN(nxt) == 1))
1175                         goto nogo;
1176 #ifdef DEBUGGING
1177                     nxt2 = nxt;
1178 #endif
1179                     nxt = regnext(nxt);
1180                     if (OP(nxt) != CLOSE)
1181                         goto nogo;
1182                     /* Now we know that nxt2 is the only contents: */
1183                     oscan->flags = (U8)ARG(nxt);
1184                     OP(oscan) = CURLYN;
1185                     OP(nxt1) = NOTHING; /* was OPEN. */
1186 #ifdef DEBUGGING
1187                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1191                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1192                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1193 #endif
1194                 }
1195               nogo:
1196
1197                 /* Try optimization CURLYX => CURLYM. */
1198                 if (  OP(oscan) == CURLYX && data
1199                       && !(data->flags & SF_HAS_PAR)
1200                       && !(data->flags & SF_HAS_EVAL)
1201                       && !deltanext     /* atom is fixed width */
1202                       && minnext != 0   /* CURLYM can't handle zero width */
1203                 ) {
1204                     /* XXXX How to optimize if data == 0? */
1205                     /* Optimize to a simpler form.  */
1206                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1207                     regnode *nxt2;
1208
1209                     OP(oscan) = CURLYM;
1210                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1211                             && (OP(nxt2) != WHILEM))
1212                         nxt = nxt2;
1213                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1214                     /* Need to optimize away parenths. */
1215                     if (data->flags & SF_IN_PAR) {
1216                         /* Set the parenth number.  */
1217                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1218
1219                         if (OP(nxt) != CLOSE)
1220                             FAIL("Panic opt close");
1221                         oscan->flags = (U8)ARG(nxt);
1222                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1223                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1224 #ifdef DEBUGGING
1225                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1227                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1229 #endif
1230 #if 0
1231                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232                             regnode *nnxt = regnext(nxt1);
1233                         
1234                             if (nnxt == nxt) {
1235                                 if (reg_off_by_arg[OP(nxt1)])
1236                                     ARG_SET(nxt1, nxt2 - nxt1);
1237                                 else if (nxt2 - nxt1 < U16_MAX)
1238                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1239                                 else
1240                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1241                             }
1242                             nxt1 = nnxt;
1243                         }
1244 #endif
1245                         /* Optimize again: */
1246                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1247                                     NULL, 0);
1248                     }
1249                     else
1250                         oscan->flags = 0;
1251                 }
1252                 else if ((OP(oscan) == CURLYX)
1253                          && (flags & SCF_WHILEM_VISITED_POS)
1254                          /* See the comment on a similar expression above.
1255                             However, this time it not a subexpression
1256                             we care about, but the expression itself. */
1257                          && (maxcount == REG_INFTY)
1258                          && data && ++data->whilem_c < 16) {
1259                     /* This stays as CURLYX, we can put the count/of pair. */
1260                     /* Find WHILEM (as in regexec.c) */
1261                     regnode *nxt = oscan + NEXT_OFF(oscan);
1262
1263                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1264                         nxt += ARG(nxt);
1265                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1267                 }
1268                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1269                     pars++;
1270                 if (flags & SCF_DO_SUBSTR) {
1271                     SV *last_str = Nullsv;
1272                     int counted = mincount != 0;
1273
1274                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1275 #if defined(SPARC64_GCC_WORKAROUND)
1276                         I32 b = 0;
1277                         STRLEN l = 0;
1278                         char *s = NULL;
1279                         I32 old = 0;
1280
1281                         if (pos_before >= data->last_start_min)
1282                             b = pos_before;
1283                         else
1284                             b = data->last_start_min;
1285
1286                         l = 0;
1287                         s = SvPV(data->last_found, l);
1288                         old = b - data->last_start_min;
1289
1290 #else
1291                         I32 b = pos_before >= data->last_start_min
1292                             ? pos_before : data->last_start_min;
1293                         STRLEN l;
1294                         char *s = SvPV(data->last_found, l);
1295                         I32 old = b - data->last_start_min;
1296 #endif
1297
1298                         if (UTF)
1299                             old = utf8_hop((U8*)s, old) - (U8*)s;
1300                         
1301                         l -= old;
1302                         /* Get the added string: */
1303                         last_str = newSVpvn(s  + old, l);
1304                         if (UTF)
1305                             SvUTF8_on(last_str);
1306                         if (deltanext == 0 && pos_before == b) {
1307                             /* What was added is a constant string */
1308                             if (mincount > 1) {
1309                                 SvGROW(last_str, (mincount * l) + 1);
1310                                 repeatcpy(SvPVX(last_str) + l,
1311                                           SvPVX_const(last_str), l, mincount - 1);
1312                                 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1313                                 /* Add additional parts. */
1314                                 SvCUR_set(data->last_found,
1315                                           SvCUR(data->last_found) - l);
1316                                 sv_catsv(data->last_found, last_str);
1317                                 {
1318                                     SV * sv = data->last_found;
1319                                     MAGIC *mg =
1320                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1321                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322                                     if (mg && mg->mg_len >= 0)
1323                                         mg->mg_len += CHR_SVLEN(last_str);
1324                                 }
1325                                 data->last_end += l * (mincount - 1);
1326                             }
1327                         } else {
1328                             /* start offset must point into the last copy */
1329                             data->last_start_min += minnext * (mincount - 1);
1330                             data->last_start_max += is_inf ? I32_MAX
1331                                 : (maxcount - 1) * (minnext + data->pos_delta);
1332                         }
1333                     }
1334                     /* It is counted once already... */
1335                     data->pos_min += minnext * (mincount - counted);
1336                     data->pos_delta += - counted * deltanext +
1337                         (minnext + deltanext) * maxcount - minnext * mincount;
1338                     if (mincount != maxcount) {
1339                          /* Cannot extend fixed substrings found inside
1340                             the group.  */
1341                         scan_commit(pRExC_state,data);
1342                         if (mincount && last_str) {
1343                             sv_setsv(data->last_found, last_str);
1344                             data->last_end = data->pos_min;
1345                             data->last_start_min =
1346                                 data->pos_min - CHR_SVLEN(last_str);
1347                             data->last_start_max = is_inf
1348                                 ? I32_MAX
1349                                 : data->pos_min + data->pos_delta
1350                                 - CHR_SVLEN(last_str);
1351                         }
1352                         data->longest = &(data->longest_float);
1353                     }
1354                     SvREFCNT_dec(last_str);
1355                 }
1356                 if (data && (fl & SF_HAS_EVAL))
1357                     data->flags |= SF_HAS_EVAL;
1358               optimize_curly_tail:
1359                 if (OP(oscan) != CURLYX) {
1360                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1361                            && NEXT_OFF(next))
1362                         NEXT_OFF(oscan) += NEXT_OFF(next);
1363                 }
1364                 continue;
1365             default:                    /* REF and CLUMP only? */
1366                 if (flags & SCF_DO_SUBSTR) {
1367                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1368                     data->longest = &(data->longest_float);
1369                 }
1370                 is_inf = is_inf_internal = 1;
1371                 if (flags & SCF_DO_STCLASS_OR)
1372                     cl_anything(pRExC_state, data->start_class);
1373                 flags &= ~SCF_DO_STCLASS;
1374                 break;
1375             }
1376         }
1377         else if (strchr((const char*)PL_simple,OP(scan))) {
1378             int value = 0;
1379
1380             if (flags & SCF_DO_SUBSTR) {
1381                 scan_commit(pRExC_state,data);
1382                 data->pos_min++;
1383             }
1384             min++;
1385             if (flags & SCF_DO_STCLASS) {
1386                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1387
1388                 /* Some of the logic below assumes that switching
1389                    locale on will only add false positives. */
1390                 switch (PL_regkind[(U8)OP(scan)]) {
1391                 case SANY:
1392                 default:
1393                   do_default:
1394                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1396                         cl_anything(pRExC_state, data->start_class);
1397                     break;
1398                 case REG_ANY:
1399                     if (OP(scan) == SANY)
1400                         goto do_default;
1401                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403                                  || (data->start_class->flags & ANYOF_CLASS));
1404                         cl_anything(pRExC_state, data->start_class);
1405                     }
1406                     if (flags & SCF_DO_STCLASS_AND || !value)
1407                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1408                     break;
1409                 case ANYOF:
1410                     if (flags & SCF_DO_STCLASS_AND)
1411                         cl_and(data->start_class,
1412                                (struct regnode_charclass_class*)scan);
1413                     else
1414                         cl_or(pRExC_state, data->start_class,
1415                               (struct regnode_charclass_class*)scan);
1416                     break;
1417                 case ALNUM:
1418                     if (flags & SCF_DO_STCLASS_AND) {
1419                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421                             for (value = 0; value < 256; value++)
1422                                 if (!isALNUM(value))
1423                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1424                         }
1425                     }
1426                     else {
1427                         if (data->start_class->flags & ANYOF_LOCALE)
1428                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1429                         else {
1430                             for (value = 0; value < 256; value++)
1431                                 if (isALNUM(value))
1432                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1433                         }
1434                     }
1435                     break;
1436                 case ALNUML:
1437                     if (flags & SCF_DO_STCLASS_AND) {
1438                         if (data->start_class->flags & ANYOF_LOCALE)
1439                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1440                     }
1441                     else {
1442                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443                         data->start_class->flags |= ANYOF_LOCALE;
1444                     }
1445                     break;
1446                 case NALNUM:
1447                     if (flags & SCF_DO_STCLASS_AND) {
1448                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450                             for (value = 0; value < 256; value++)
1451                                 if (isALNUM(value))
1452                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1453                         }
1454                     }
1455                     else {
1456                         if (data->start_class->flags & ANYOF_LOCALE)
1457                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1458                         else {
1459                             for (value = 0; value < 256; value++)
1460                                 if (!isALNUM(value))
1461                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1462                         }
1463                     }
1464                     break;
1465                 case NALNUML:
1466                     if (flags & SCF_DO_STCLASS_AND) {
1467                         if (data->start_class->flags & ANYOF_LOCALE)
1468                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1469                     }
1470                     else {
1471                         data->start_class->flags |= ANYOF_LOCALE;
1472                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1473                     }
1474                     break;
1475                 case SPACE:
1476                     if (flags & SCF_DO_STCLASS_AND) {
1477                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479                             for (value = 0; value < 256; value++)
1480                                 if (!isSPACE(value))
1481                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1482                         }
1483                     }
1484                     else {
1485                         if (data->start_class->flags & ANYOF_LOCALE)
1486                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1487                         else {
1488                             for (value = 0; value < 256; value++)
1489                                 if (isSPACE(value))
1490                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1491                         }
1492                     }
1493                     break;
1494                 case SPACEL:
1495                     if (flags & SCF_DO_STCLASS_AND) {
1496                         if (data->start_class->flags & ANYOF_LOCALE)
1497                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1498                     }
1499                     else {
1500                         data->start_class->flags |= ANYOF_LOCALE;
1501                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1502                     }
1503                     break;
1504                 case NSPACE:
1505                     if (flags & SCF_DO_STCLASS_AND) {
1506                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508                             for (value = 0; value < 256; value++)
1509                                 if (isSPACE(value))
1510                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1511                         }
1512                     }
1513                     else {
1514                         if (data->start_class->flags & ANYOF_LOCALE)
1515                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1516                         else {
1517                             for (value = 0; value < 256; value++)
1518                                 if (!isSPACE(value))
1519                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1520                         }
1521                     }
1522                     break;
1523                 case NSPACEL:
1524                     if (flags & SCF_DO_STCLASS_AND) {
1525                         if (data->start_class->flags & ANYOF_LOCALE) {
1526                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527                             for (value = 0; value < 256; value++)
1528                                 if (!isSPACE(value))
1529                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1530                         }
1531                     }
1532                     else {
1533                         data->start_class->flags |= ANYOF_LOCALE;
1534                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1535                     }
1536                     break;
1537                 case DIGIT:
1538                     if (flags & SCF_DO_STCLASS_AND) {
1539                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540                         for (value = 0; value < 256; value++)
1541                             if (!isDIGIT(value))
1542                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1543                     }
1544                     else {
1545                         if (data->start_class->flags & ANYOF_LOCALE)
1546                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1547                         else {
1548                             for (value = 0; value < 256; value++)
1549                                 if (isDIGIT(value))
1550                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1551                         }
1552                     }
1553                     break;
1554                 case NDIGIT:
1555                     if (flags & SCF_DO_STCLASS_AND) {
1556                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557                         for (value = 0; value < 256; value++)
1558                             if (isDIGIT(value))
1559                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1560                     }
1561                     else {
1562                         if (data->start_class->flags & ANYOF_LOCALE)
1563                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1564                         else {
1565                             for (value = 0; value < 256; value++)
1566                                 if (!isDIGIT(value))
1567                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1568                         }
1569                     }
1570                     break;
1571                 }
1572                 if (flags & SCF_DO_STCLASS_OR)
1573                     cl_and(data->start_class, &and_with);
1574                 flags &= ~SCF_DO_STCLASS;
1575             }
1576         }
1577         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1578             data->flags |= (OP(scan) == MEOL
1579                             ? SF_BEFORE_MEOL
1580                             : SF_BEFORE_SEOL);
1581         }
1582         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1583                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1584                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1585                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1586             /* Lookahead/lookbehind */
1587             I32 deltanext, minnext, fake = 0;
1588             regnode *nscan;
1589             struct regnode_charclass_class intrnl;
1590             int f = 0;
1591
1592             data_fake.flags = 0;
1593             if (data) {         
1594                 data_fake.whilem_c = data->whilem_c;
1595                 data_fake.last_closep = data->last_closep;
1596             }
1597             else
1598                 data_fake.last_closep = &fake;
1599             if ( flags & SCF_DO_STCLASS && !scan->flags
1600                  && OP(scan) == IFMATCH ) { /* Lookahead */
1601                 cl_init(pRExC_state, &intrnl);
1602                 data_fake.start_class = &intrnl;
1603                 f |= SCF_DO_STCLASS_AND;
1604             }
1605             if (flags & SCF_WHILEM_VISITED_POS)
1606                 f |= SCF_WHILEM_VISITED_POS;
1607             next = regnext(scan);
1608             nscan = NEXTOPER(NEXTOPER(scan));
1609             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1610             if (scan->flags) {
1611                 if (deltanext) {
1612                     vFAIL("Variable length lookbehind not implemented");
1613                 }
1614                 else if (minnext > U8_MAX) {
1615                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1616                 }
1617                 scan->flags = (U8)minnext;
1618             }
1619             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1620                 pars++;
1621             if (data && (data_fake.flags & SF_HAS_EVAL))
1622                 data->flags |= SF_HAS_EVAL;
1623             if (data)
1624                 data->whilem_c = data_fake.whilem_c;
1625             if (f & SCF_DO_STCLASS_AND) {
1626                 int was = (data->start_class->flags & ANYOF_EOS);
1627
1628                 cl_and(data->start_class, &intrnl);
1629                 if (was)
1630                     data->start_class->flags |= ANYOF_EOS;
1631             }
1632         }
1633         else if (OP(scan) == OPEN) {
1634             pars++;
1635         }
1636         else if (OP(scan) == CLOSE) {
1637             if ((I32)ARG(scan) == is_par) {
1638                 next = regnext(scan);
1639
1640                 if ( next && (OP(next) != WHILEM) && next < last)
1641                     is_par = 0;         /* Disable optimization */
1642             }
1643             if (data)
1644                 *(data->last_closep) = ARG(scan);
1645         }
1646         else if (OP(scan) == EVAL) {
1647                 if (data)
1648                     data->flags |= SF_HAS_EVAL;
1649         }
1650         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1651                 if (flags & SCF_DO_SUBSTR) {
1652                     scan_commit(pRExC_state,data);
1653                     data->longest = &(data->longest_float);
1654                 }
1655                 is_inf = is_inf_internal = 1;
1656                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1657                     cl_anything(pRExC_state, data->start_class);
1658                 flags &= ~SCF_DO_STCLASS;
1659         }
1660         /* Else: zero-length, ignore. */
1661         scan = regnext(scan);
1662     }
1663
1664   finish:
1665     *scanp = scan;
1666     *deltap = is_inf_internal ? I32_MAX : delta;
1667     if (flags & SCF_DO_SUBSTR && is_inf)
1668         data->pos_delta = I32_MAX - data->pos_min;
1669     if (is_par > U8_MAX)
1670         is_par = 0;
1671     if (is_par && pars==1 && data) {
1672         data->flags |= SF_IN_PAR;
1673         data->flags &= ~SF_HAS_PAR;
1674     }
1675     else if (pars && data) {
1676         data->flags |= SF_HAS_PAR;
1677         data->flags &= ~SF_IN_PAR;
1678     }
1679     if (flags & SCF_DO_STCLASS_OR)
1680         cl_and(data->start_class, &and_with);
1681     return min;
1682 }
1683
1684 STATIC I32
1685 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
1686 {
1687     if (RExC_rx->data) {
1688         Renewc(RExC_rx->data,
1689                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1690                char, struct reg_data);
1691         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692         RExC_rx->data->count += n;
1693     }
1694     else {
1695         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1696              char, struct reg_data);
1697         New(1208, RExC_rx->data->what, n, U8);
1698         RExC_rx->data->count = n;
1699     }
1700     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701     return RExC_rx->data->count - n;
1702 }
1703
1704 void
1705 Perl_reginitcolors(pTHX)
1706 {
1707     const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1708     if (s) {
1709         char *t = savepv(s);
1710         int i = 0;
1711         PL_colors[0] = t;
1712         while (++i < 6) {
1713             t = strchr(t, '\t');
1714             if (t) {
1715                 *t = '\0';
1716                 PL_colors[i] = ++t;
1717             }
1718             else
1719                 PL_colors[i] = t = (char *)"";
1720         }
1721     } else {
1722         int i = 0;
1723         while (i < 6)
1724             PL_colors[i++] = (char *)"";
1725     }
1726     PL_colorset = 1;
1727 }
1728
1729
1730 /*
1731  - pregcomp - compile a regular expression into internal code
1732  *
1733  * We can't allocate space until we know how big the compiled form will be,
1734  * but we can't compile it (and thus know how big it is) until we've got a
1735  * place to put the code.  So we cheat:  we compile it twice, once with code
1736  * generation turned off and size counting turned on, and once "for real".
1737  * This also means that we don't allocate space until we are sure that the
1738  * thing really will compile successfully, and we never have to move the
1739  * code and thus invalidate pointers into it.  (Note that it has to be in
1740  * one piece because free() must be able to free it all.) [NB: not true in perl]
1741  *
1742  * Beware that the optimization-preparation code in here knows about some
1743  * of the structure of the compiled regexp.  [I'll say.]
1744  */
1745 regexp *
1746 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1747 {
1748     register regexp *r;
1749     regnode *scan;
1750     regnode *first;
1751     I32 flags;
1752     I32 minlen = 0;
1753     I32 sawplus = 0;
1754     I32 sawopen = 0;
1755     scan_data_t data;
1756     RExC_state_t RExC_state;
1757     RExC_state_t *pRExC_state = &RExC_state;
1758
1759     if (exp == NULL)
1760         FAIL("NULL regexp argument");
1761
1762     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1763
1764     RExC_precomp = exp;
1765     DEBUG_r({
1766          if (!PL_colorset) reginitcolors();
1767          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1768                        PL_colors[4],PL_colors[5],PL_colors[0],
1769                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1770     });
1771     RExC_flags = pm->op_pmflags;
1772     RExC_sawback = 0;
1773
1774     RExC_seen = 0;
1775     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1776     RExC_seen_evals = 0;
1777     RExC_extralen = 0;
1778
1779     /* First pass: determine size, legality. */
1780     RExC_parse = exp;
1781     RExC_start = exp;
1782     RExC_end = xend;
1783     RExC_naughty = 0;
1784     RExC_npar = 1;
1785     RExC_size = 0L;
1786     RExC_emit = &PL_regdummy;
1787     RExC_whilem_seen = 0;
1788 #if 0 /* REGC() is (currently) a NOP at the first pass.
1789        * Clever compilers notice this and complain. --jhi */
1790     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1791 #endif
1792     if (reg(pRExC_state, 0, &flags) == NULL) {
1793         RExC_precomp = Nullch;
1794         return(NULL);
1795     }
1796     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1797
1798     /* Small enough for pointer-storage convention?
1799        If extralen==0, this means that we will not need long jumps. */
1800     if (RExC_size >= 0x10000L && RExC_extralen)
1801         RExC_size += RExC_extralen;
1802     else
1803         RExC_extralen = 0;
1804     if (RExC_whilem_seen > 15)
1805         RExC_whilem_seen = 15;
1806
1807     /* Allocate space and initialize. */
1808     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1809          char, regexp);
1810     if (r == NULL)
1811         FAIL("Regexp out of space");
1812
1813 #ifdef DEBUGGING
1814     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1815     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1816 #endif
1817     r->refcnt = 1;
1818     r->prelen = xend - exp;
1819     r->precomp = savepvn(RExC_precomp, r->prelen);
1820     r->subbeg = NULL;
1821     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1822     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1823
1824     r->substrs = 0;                     /* Useful during FAIL. */
1825     r->startp = 0;                      /* Useful during FAIL. */
1826     r->endp = 0;                        /* Useful during FAIL. */
1827
1828     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1829     if (r->offsets) {
1830         r->offsets[0] = RExC_size;
1831     }
1832     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1833                           "%s %"UVuf" bytes for offset annotations.\n",
1834                           r->offsets ? "Got" : "Couldn't get",
1835                           (UV)((2*RExC_size+1) * sizeof(U32))));
1836
1837     RExC_rx = r;
1838
1839     /* Second pass: emit code. */
1840     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1841     RExC_parse = exp;
1842     RExC_end = xend;
1843     RExC_naughty = 0;
1844     RExC_npar = 1;
1845     RExC_emit_start = r->program;
1846     RExC_emit = r->program;
1847     /* Store the count of eval-groups for security checks: */
1848     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1849     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1850     r->data = 0;
1851     if (reg(pRExC_state, 0, &flags) == NULL)
1852         return(NULL);
1853
1854     /* Dig out information for optimizations. */
1855     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1856     pm->op_pmflags = RExC_flags;
1857     if (UTF)
1858         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1859     r->regstclass = NULL;
1860     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1861         r->reganch |= ROPT_NAUGHTY;
1862     scan = r->program + 1;              /* First BRANCH. */
1863
1864     /* XXXX To minimize changes to RE engine we always allocate
1865        3-units-long substrs field. */
1866     Newz(1004, r->substrs, 1, struct reg_substr_data);
1867
1868     StructCopy(&zero_scan_data, &data, scan_data_t);
1869     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1870     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1871         I32 fake;
1872         STRLEN longest_float_length, longest_fixed_length;
1873         struct regnode_charclass_class ch_class;
1874         int stclass_flag;
1875         I32 last_close = 0;
1876
1877         first = scan;
1878         /* Skip introductions and multiplicators >= 1. */
1879         while ((OP(first) == OPEN && (sawopen = 1)) ||
1880                /* An OR of *one* alternative - should not happen now. */
1881             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1882             (OP(first) == PLUS) ||
1883             (OP(first) == MINMOD) ||
1884                /* An {n,m} with n>0 */
1885             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1886                 if (OP(first) == PLUS)
1887                     sawplus = 1;
1888                 else
1889                     first += regarglen[(U8)OP(first)];
1890                 first = NEXTOPER(first);
1891         }
1892
1893         /* Starting-point info. */
1894       again:
1895         if (PL_regkind[(U8)OP(first)] == EXACT) {
1896             if (OP(first) == EXACT)
1897                 ;       /* Empty, get anchored substr later. */
1898             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1899                 r->regstclass = first;
1900         }
1901         else if (strchr((const char*)PL_simple,OP(first)))
1902             r->regstclass = first;
1903         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1904                  PL_regkind[(U8)OP(first)] == NBOUND)
1905             r->regstclass = first;
1906         else if (PL_regkind[(U8)OP(first)] == BOL) {
1907             r->reganch |= (OP(first) == MBOL
1908                            ? ROPT_ANCH_MBOL
1909                            : (OP(first) == SBOL
1910                               ? ROPT_ANCH_SBOL
1911                               : ROPT_ANCH_BOL));
1912             first = NEXTOPER(first);
1913             goto again;
1914         }
1915         else if (OP(first) == GPOS) {
1916             r->reganch |= ROPT_ANCH_GPOS;
1917             first = NEXTOPER(first);
1918             goto again;
1919         }
1920         else if (!sawopen && (OP(first) == STAR &&
1921             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1922             !(r->reganch & ROPT_ANCH) )
1923         {
1924             /* turn .* into ^.* with an implied $*=1 */
1925             const int type =
1926                 (OP(NEXTOPER(first)) == REG_ANY)
1927                     ? ROPT_ANCH_MBOL
1928                     : ROPT_ANCH_SBOL;
1929             r->reganch |= type | ROPT_IMPLICIT;
1930             first = NEXTOPER(first);
1931             goto again;
1932         }
1933         if (sawplus && (!sawopen || !RExC_sawback)
1934             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1935             /* x+ must match at the 1st pos of run of x's */
1936             r->reganch |= ROPT_SKIP;
1937
1938         /* Scan is after the zeroth branch, first is atomic matcher. */
1939         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1940                               (IV)(first - scan + 1)));
1941         /*
1942         * If there's something expensive in the r.e., find the
1943         * longest literal string that must appear and make it the
1944         * regmust.  Resolve ties in favor of later strings, since
1945         * the regstart check works with the beginning of the r.e.
1946         * and avoiding duplication strengthens checking.  Not a
1947         * strong reason, but sufficient in the absence of others.
1948         * [Now we resolve ties in favor of the earlier string if
1949         * it happens that c_offset_min has been invalidated, since the
1950         * earlier string may buy us something the later one won't.]
1951         */
1952         minlen = 0;
1953
1954         data.longest_fixed = newSVpvn("",0);
1955         data.longest_float = newSVpvn("",0);
1956         data.last_found = newSVpvn("",0);
1957         data.longest = &(data.longest_fixed);
1958         first = scan;
1959         if (!r->regstclass) {
1960             cl_init(pRExC_state, &ch_class);
1961             data.start_class = &ch_class;
1962             stclass_flag = SCF_DO_STCLASS_AND;
1963         } else                          /* XXXX Check for BOUND? */
1964             stclass_flag = 0;
1965         data.last_closep = &last_close;
1966
1967         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1968                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1969         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1970              && data.last_start_min == 0 && data.last_end > 0
1971              && !RExC_seen_zerolen
1972              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1973             r->reganch |= ROPT_CHECK_ALL;
1974         scan_commit(pRExC_state, &data);
1975         SvREFCNT_dec(data.last_found);
1976
1977         longest_float_length = CHR_SVLEN(data.longest_float);
1978         if (longest_float_length
1979             || (data.flags & SF_FL_BEFORE_EOL
1980                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1981                     || (RExC_flags & PMf_MULTILINE)))) {
1982             int t;
1983
1984             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1985                 && data.offset_fixed == data.offset_float_min
1986                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1987                     goto remove_float;          /* As in (a)+. */
1988
1989             if (SvUTF8(data.longest_float)) {
1990                 r->float_utf8 = data.longest_float;
1991                 r->float_substr = Nullsv;
1992             } else {
1993                 r->float_substr = data.longest_float;
1994                 r->float_utf8 = Nullsv;
1995             }
1996             r->float_min_offset = data.offset_float_min;
1997             r->float_max_offset = data.offset_float_max;
1998             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1999                        && (!(data.flags & SF_FL_BEFORE_MEOL)
2000                            || (RExC_flags & PMf_MULTILINE)));
2001             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2002         }
2003         else {
2004           remove_float:
2005             r->float_substr = r->float_utf8 = Nullsv;
2006             SvREFCNT_dec(data.longest_float);
2007             longest_float_length = 0;
2008         }
2009
2010         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2011         if (longest_fixed_length
2012             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2013                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2014                     || (RExC_flags & PMf_MULTILINE)))) {
2015             int t;
2016
2017             if (SvUTF8(data.longest_fixed)) {
2018                 r->anchored_utf8 = data.longest_fixed;
2019                 r->anchored_substr = Nullsv;
2020             } else {
2021                 r->anchored_substr = data.longest_fixed;
2022                 r->anchored_utf8 = Nullsv;
2023             }
2024             r->anchored_offset = data.offset_fixed;
2025             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2026                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2027                      || (RExC_flags & PMf_MULTILINE)));
2028             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2029         }
2030         else {
2031             r->anchored_substr = r->anchored_utf8 = Nullsv;
2032             SvREFCNT_dec(data.longest_fixed);
2033             longest_fixed_length = 0;
2034         }
2035         if (r->regstclass
2036             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2037             r->regstclass = NULL;
2038         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2039             && stclass_flag
2040             && !(data.start_class->flags & ANYOF_EOS)
2041             && !cl_is_anything(data.start_class))
2042         {
2043             const I32 n = add_data(pRExC_state, 1, "f");
2044
2045             New(1006, RExC_rx->data->data[n], 1,
2046                 struct regnode_charclass_class);
2047             StructCopy(data.start_class,
2048                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2049                        struct regnode_charclass_class);
2050             r->regstclass = (regnode*)RExC_rx->data->data[n];
2051             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2052             PL_regdata = r->data; /* for regprop() */
2053             DEBUG_r({ SV *sv = sv_newmortal();
2054                       regprop(sv, (regnode*)data.start_class);
2055                       PerlIO_printf(Perl_debug_log,
2056                                     "synthetic stclass \"%s\".\n",
2057                                     SvPVX_const(sv));});
2058         }
2059
2060         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2061         if (longest_fixed_length > longest_float_length) {
2062             r->check_substr = r->anchored_substr;
2063             r->check_utf8 = r->anchored_utf8;
2064             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2065             if (r->reganch & ROPT_ANCH_SINGLE)
2066                 r->reganch |= ROPT_NOSCAN;
2067         }
2068         else {
2069             r->check_substr = r->float_substr;
2070             r->check_utf8 = r->float_utf8;
2071             r->check_offset_min = data.offset_float_min;
2072             r->check_offset_max = data.offset_float_max;
2073         }
2074         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2075            This should be changed ASAP!  */
2076         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2077             r->reganch |= RE_USE_INTUIT;
2078             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2079                 r->reganch |= RE_INTUIT_TAIL;
2080         }
2081     }
2082     else {
2083         /* Several toplevels. Best we can is to set minlen. */
2084         I32 fake;
2085         struct regnode_charclass_class ch_class;
2086         I32 last_close = 0;
2087         
2088         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2089         scan = r->program + 1;
2090         cl_init(pRExC_state, &ch_class);
2091         data.start_class = &ch_class;
2092         data.last_closep = &last_close;
2093         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2094         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2095                 = r->float_substr = r->float_utf8 = Nullsv;
2096         if (!(data.start_class->flags & ANYOF_EOS)
2097             && !cl_is_anything(data.start_class))
2098         {
2099             const I32 n = add_data(pRExC_state, 1, "f");
2100
2101             New(1006, RExC_rx->data->data[n], 1,
2102                 struct regnode_charclass_class);
2103             StructCopy(data.start_class,
2104                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2105                        struct regnode_charclass_class);
2106             r->regstclass = (regnode*)RExC_rx->data->data[n];
2107             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2108             DEBUG_r({ SV* sv = sv_newmortal();
2109                       regprop(sv, (regnode*)data.start_class);
2110                       PerlIO_printf(Perl_debug_log,
2111                                     "synthetic stclass \"%s\".\n",
2112                                     SvPVX_const(sv));});
2113         }
2114     }
2115
2116     r->minlen = minlen;
2117     if (RExC_seen & REG_SEEN_GPOS)
2118         r->reganch |= ROPT_GPOS_SEEN;
2119     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2120         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2121     if (RExC_seen & REG_SEEN_EVAL)
2122         r->reganch |= ROPT_EVAL_SEEN;
2123     if (RExC_seen & REG_SEEN_CANY)
2124         r->reganch |= ROPT_CANY_SEEN;
2125     Newz(1002, r->startp, RExC_npar, I32);
2126     Newz(1002, r->endp, RExC_npar, I32);
2127     PL_regdata = r->data; /* for regprop() */
2128     DEBUG_r(regdump(r));
2129     return(r);
2130 }
2131
2132 /*
2133  - reg - regular expression, i.e. main body or parenthesized thing
2134  *
2135  * Caller must absorb opening parenthesis.
2136  *
2137  * Combining parenthesis handling with the base level of regular expression
2138  * is a trifle forced, but the need to tie the tails of the branches to what
2139  * follows makes it hard to avoid.
2140  */
2141 STATIC regnode *
2142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2143     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2144 {
2145     register regnode *ret;              /* Will be the head of the group. */
2146     register regnode *br;
2147     register regnode *lastbr;
2148     register regnode *ender = 0;
2149     register I32 parno = 0;
2150     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2151
2152     /* for (?g), (?gc), and (?o) warnings; warning
2153        about (?c) will warn about (?g) -- japhy    */
2154
2155     I32 wastedflags = 0x00,
2156         wasted_o    = 0x01,
2157         wasted_g    = 0x02,
2158         wasted_gc   = 0x02 | 0x04,
2159         wasted_c    = 0x04;
2160
2161     char * parse_start = RExC_parse; /* MJD */
2162     char *oregcomp_parse = RExC_parse;
2163     char c;
2164
2165     *flagp = 0;                         /* Tentatively. */
2166
2167
2168     /* Make an OPEN node, if parenthesized. */
2169     if (paren) {
2170         if (*RExC_parse == '?') { /* (?...) */
2171             U32 posflags = 0, negflags = 0;
2172             U32 *flagsp = &posflags;
2173             int logical = 0;
2174             char *seqstart = RExC_parse;
2175
2176             RExC_parse++;
2177             paren = *RExC_parse++;
2178             ret = NULL;                 /* For look-ahead/behind. */
2179             switch (paren) {
2180             case '<':           /* (?<...) */
2181                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2182                 if (*RExC_parse == '!')
2183                     paren = ',';
2184                 if (*RExC_parse != '=' && *RExC_parse != '!')
2185                     goto unknown;
2186                 RExC_parse++;
2187             case '=':           /* (?=...) */
2188             case '!':           /* (?!...) */
2189                 RExC_seen_zerolen++;
2190             case ':':           /* (?:...) */
2191             case '>':           /* (?>...) */
2192                 break;
2193             case '$':           /* (?$...) */
2194             case '@':           /* (?@...) */
2195                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2196                 break;
2197             case '#':           /* (?#...) */
2198                 while (*RExC_parse && *RExC_parse != ')')
2199                     RExC_parse++;
2200                 if (*RExC_parse != ')')
2201                     FAIL("Sequence (?#... not terminated");
2202                 nextchar(pRExC_state);
2203                 *flagp = TRYAGAIN;
2204                 return NULL;
2205             case 'p':           /* (?p...) */
2206                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2207                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2208                 /* FALL THROUGH*/
2209             case '?':           /* (??...) */
2210                 logical = 1;
2211                 if (*RExC_parse != '{')
2212                     goto unknown;
2213                 paren = *RExC_parse++;
2214                 /* FALL THROUGH */
2215             case '{':           /* (?{...}) */
2216             {
2217                 I32 count = 1, n = 0;
2218                 char c;
2219                 char *s = RExC_parse;
2220                 SV *sv;
2221                 OP_4tree *sop, *rop;
2222
2223                 RExC_seen_zerolen++;
2224                 RExC_seen |= REG_SEEN_EVAL;
2225                 while (count && (c = *RExC_parse)) {
2226                     if (c == '\\' && RExC_parse[1])
2227                         RExC_parse++;
2228                     else if (c == '{')
2229                         count++;
2230                     else if (c == '}')
2231                         count--;
2232                     RExC_parse++;
2233                 }
2234                 if (*RExC_parse != ')')
2235                 {
2236                     RExC_parse = s;             
2237                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2238                 }
2239                 if (!SIZE_ONLY) {
2240                     PAD *pad;
2241                 
2242                     if (RExC_parse - 1 - s)
2243                         sv = newSVpvn(s, RExC_parse - 1 - s);
2244                     else
2245                         sv = newSVpvn("", 0);
2246
2247                     ENTER;
2248                     Perl_save_re_context(aTHX);
2249                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2250                     sop->op_private |= OPpREFCOUNTED;
2251                     /* re_dup will OpREFCNT_inc */
2252                     OpREFCNT_set(sop, 1);
2253                     LEAVE;
2254
2255                     n = add_data(pRExC_state, 3, "nop");
2256                     RExC_rx->data->data[n] = (void*)rop;
2257                     RExC_rx->data->data[n+1] = (void*)sop;
2258                     RExC_rx->data->data[n+2] = (void*)pad;
2259                     SvREFCNT_dec(sv);
2260                 }
2261                 else {                                          /* First pass */
2262                     if (PL_reginterp_cnt < ++RExC_seen_evals
2263                         && IN_PERL_RUNTIME)
2264                         /* No compiled RE interpolated, has runtime
2265                            components ===> unsafe.  */
2266                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2267                     if (PL_tainting && PL_tainted)
2268                         FAIL("Eval-group in insecure regular expression");
2269                 }
2270                 
2271                 nextchar(pRExC_state);
2272                 if (logical) {
2273                     ret = reg_node(pRExC_state, LOGICAL);
2274                     if (!SIZE_ONLY)
2275                         ret->flags = 2;
2276                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2277                     /* deal with the length of this later - MJD */
2278                     return ret;
2279                 }
2280                 ret = reganode(pRExC_state, EVAL, n);
2281                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2282                 Set_Node_Offset(ret, parse_start);
2283                 return ret;
2284             }
2285             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2286             {
2287                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2288                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2289                         || RExC_parse[1] == '<'
2290                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2291                         I32 flag;
2292                         
2293                         ret = reg_node(pRExC_state, LOGICAL);
2294                         if (!SIZE_ONLY)
2295                             ret->flags = 1;
2296                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2297                         goto insert_if;
2298                     }
2299                 }
2300                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2301                     /* (?(1)...) */
2302                     parno = atoi(RExC_parse++);
2303
2304                     while (isDIGIT(*RExC_parse))
2305                         RExC_parse++;
2306                     ret = reganode(pRExC_state, GROUPP, parno);
2307
2308                     if ((c = *nextchar(pRExC_state)) != ')')
2309                         vFAIL("Switch condition not recognized");
2310                   insert_if:
2311                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2312                     br = regbranch(pRExC_state, &flags, 1);
2313                     if (br == NULL)
2314                         br = reganode(pRExC_state, LONGJMP, 0);
2315                     else
2316                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2317                     c = *nextchar(pRExC_state);
2318                     if (flags&HASWIDTH)
2319                         *flagp |= HASWIDTH;
2320                     if (c == '|') {
2321                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2322                         regbranch(pRExC_state, &flags, 1);
2323                         regtail(pRExC_state, ret, lastbr);
2324                         if (flags&HASWIDTH)
2325                             *flagp |= HASWIDTH;
2326                         c = *nextchar(pRExC_state);
2327                     }
2328                     else
2329                         lastbr = NULL;
2330                     if (c != ')')
2331                         vFAIL("Switch (?(condition)... contains too many branches");
2332                     ender = reg_node(pRExC_state, TAIL);
2333                     regtail(pRExC_state, br, ender);
2334                     if (lastbr) {
2335                         regtail(pRExC_state, lastbr, ender);
2336                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2337                     }
2338                     else
2339                         regtail(pRExC_state, ret, ender);
2340                     return ret;
2341                 }
2342                 else {
2343                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2344                 }
2345             }
2346             case 0:
2347                 RExC_parse--; /* for vFAIL to print correctly */
2348                 vFAIL("Sequence (? incomplete");
2349                 break;
2350             default:
2351                 --RExC_parse;
2352               parse_flags:      /* (?i) */
2353                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2354                     /* (?g), (?gc) and (?o) are useless here
2355                        and must be globally applied -- japhy */
2356
2357                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2358                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2359                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2360                             if (! (wastedflags & wflagbit) ) {
2361                                 wastedflags |= wflagbit;
2362                                 vWARN5(
2363                                     RExC_parse + 1,
2364                                     "Useless (%s%c) - %suse /%c modifier",
2365                                     flagsp == &negflags ? "?-" : "?",
2366                                     *RExC_parse,
2367                                     flagsp == &negflags ? "don't " : "",
2368                                     *RExC_parse
2369                                 );
2370                             }
2371                         }
2372                     }
2373                     else if (*RExC_parse == 'c') {
2374                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2375                             if (! (wastedflags & wasted_c) ) {
2376                                 wastedflags |= wasted_gc;
2377                                 vWARN3(
2378                                     RExC_parse + 1,
2379                                     "Useless (%sc) - %suse /gc modifier",
2380                                     flagsp == &negflags ? "?-" : "?",
2381                                     flagsp == &negflags ? "don't " : ""
2382                                 );
2383                             }
2384                         }
2385                     }
2386                     else { pmflag(flagsp, *RExC_parse); }
2387
2388                     ++RExC_parse;
2389                 }
2390                 if (*RExC_parse == '-') {
2391                     flagsp = &negflags;
2392                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2393                     ++RExC_parse;
2394                     goto parse_flags;
2395                 }
2396                 RExC_flags |= posflags;
2397                 RExC_flags &= ~negflags;
2398                 if (*RExC_parse == ':') {
2399                     RExC_parse++;
2400                     paren = ':';
2401                     break;
2402                 }               
2403               unknown:
2404                 if (*RExC_parse != ')') {
2405                     RExC_parse++;
2406                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2407                 }
2408                 nextchar(pRExC_state);
2409                 *flagp = TRYAGAIN;
2410                 return NULL;
2411             }
2412         }
2413         else {                  /* (...) */
2414             parno = RExC_npar;
2415             RExC_npar++;
2416             ret = reganode(pRExC_state, OPEN, parno);
2417             Set_Node_Length(ret, 1); /* MJD */
2418             Set_Node_Offset(ret, RExC_parse); /* MJD */
2419             open = 1;
2420         }
2421     }
2422     else                        /* ! paren */
2423         ret = NULL;
2424
2425     /* Pick up the branches, linking them together. */
2426     parse_start = RExC_parse;   /* MJD */
2427     br = regbranch(pRExC_state, &flags, 1);
2428     /*     branch_len = (paren != 0); */
2429
2430     if (br == NULL)
2431         return(NULL);
2432     if (*RExC_parse == '|') {
2433         if (!SIZE_ONLY && RExC_extralen) {
2434             reginsert(pRExC_state, BRANCHJ, br);
2435         }
2436         else {                  /* MJD */
2437             reginsert(pRExC_state, BRANCH, br);
2438             Set_Node_Length(br, paren != 0);
2439             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2440         }
2441         have_branch = 1;
2442         if (SIZE_ONLY)
2443             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2444     }
2445     else if (paren == ':') {
2446         *flagp |= flags&SIMPLE;
2447     }
2448     if (open) {                         /* Starts with OPEN. */
2449         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2450     }
2451     else if (paren != '?')              /* Not Conditional */
2452         ret = br;
2453     *flagp |= flags & (SPSTART | HASWIDTH);
2454     lastbr = br;
2455     while (*RExC_parse == '|') {
2456         if (!SIZE_ONLY && RExC_extralen) {
2457             ender = reganode(pRExC_state, LONGJMP,0);
2458             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2459         }
2460         if (SIZE_ONLY)
2461             RExC_extralen += 2;         /* Account for LONGJMP. */
2462         nextchar(pRExC_state);
2463         br = regbranch(pRExC_state, &flags, 0);
2464
2465         if (br == NULL)
2466             return(NULL);
2467         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2468         lastbr = br;
2469         if (flags&HASWIDTH)
2470             *flagp |= HASWIDTH;
2471         *flagp |= flags&SPSTART;
2472     }
2473
2474     if (have_branch || paren != ':') {
2475         /* Make a closing node, and hook it on the end. */
2476         switch (paren) {
2477         case ':':
2478             ender = reg_node(pRExC_state, TAIL);
2479             break;
2480         case 1:
2481             ender = reganode(pRExC_state, CLOSE, parno);
2482             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2483             Set_Node_Length(ender,1); /* MJD */
2484             break;
2485         case '<':
2486         case ',':
2487         case '=':
2488         case '!':
2489             *flagp &= ~HASWIDTH;
2490             /* FALL THROUGH */
2491         case '>':
2492             ender = reg_node(pRExC_state, SUCCEED);
2493             break;
2494         case 0:
2495             ender = reg_node(pRExC_state, END);
2496             break;
2497         }
2498         regtail(pRExC_state, lastbr, ender);
2499
2500         if (have_branch) {
2501             /* Hook the tails of the branches to the closing node. */
2502             for (br = ret; br != NULL; br = regnext(br)) {
2503                 regoptail(pRExC_state, br, ender);
2504             }
2505         }
2506     }
2507
2508     {
2509         const char *p;
2510         static const char parens[] = "=!<,>";
2511
2512         if (paren && (p = strchr(parens, paren))) {
2513             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2514             int flag = (p - parens) > 1;
2515
2516             if (paren == '>')
2517                 node = SUSPEND, flag = 0;
2518             reginsert(pRExC_state, node,ret);
2519             Set_Node_Cur_Length(ret);
2520             Set_Node_Offset(ret, parse_start + 1);
2521             ret->flags = flag;
2522             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2523         }
2524     }
2525
2526     /* Check for proper termination. */
2527     if (paren) {
2528         RExC_flags = oregflags;
2529         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2530             RExC_parse = oregcomp_parse;
2531             vFAIL("Unmatched (");
2532         }
2533     }
2534     else if (!paren && RExC_parse < RExC_end) {
2535         if (*RExC_parse == ')') {
2536             RExC_parse++;
2537             vFAIL("Unmatched )");
2538         }
2539         else
2540             FAIL("Junk on end of regexp");      /* "Can't happen". */
2541         /* NOTREACHED */
2542     }
2543
2544     return(ret);
2545 }
2546
2547 /*
2548  - regbranch - one alternative of an | operator
2549  *
2550  * Implements the concatenation operator.
2551  */
2552 STATIC regnode *
2553 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2554 {
2555     register regnode *ret;
2556     register regnode *chain = NULL;
2557     register regnode *latest;
2558     I32 flags = 0, c = 0;
2559
2560     if (first)
2561         ret = NULL;
2562     else {
2563         if (!SIZE_ONLY && RExC_extralen)
2564             ret = reganode(pRExC_state, BRANCHJ,0);
2565         else {
2566             ret = reg_node(pRExC_state, BRANCH);
2567             Set_Node_Length(ret, 1);
2568         }
2569     }
2570         
2571     if (!first && SIZE_ONLY)
2572         RExC_extralen += 1;                     /* BRANCHJ */
2573
2574     *flagp = WORST;                     /* Tentatively. */
2575
2576     RExC_parse--;
2577     nextchar(pRExC_state);
2578     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2579         flags &= ~TRYAGAIN;
2580         latest = regpiece(pRExC_state, &flags);
2581         if (latest == NULL) {
2582             if (flags & TRYAGAIN)
2583                 continue;
2584             return(NULL);
2585         }
2586         else if (ret == NULL)
2587             ret = latest;
2588         *flagp |= flags&HASWIDTH;
2589         if (chain == NULL)      /* First piece. */
2590             *flagp |= flags&SPSTART;
2591         else {
2592             RExC_naughty++;
2593             regtail(pRExC_state, chain, latest);
2594         }
2595         chain = latest;
2596         c++;
2597     }
2598     if (chain == NULL) {        /* Loop ran zero times. */
2599         chain = reg_node(pRExC_state, NOTHING);
2600         if (ret == NULL)
2601             ret = chain;
2602     }
2603     if (c == 1) {
2604         *flagp |= flags&SIMPLE;
2605     }
2606
2607     return(ret);
2608 }
2609
2610 /*
2611  - regpiece - something followed by possible [*+?]
2612  *
2613  * Note that the branching code sequences used for ? and the general cases
2614  * of * and + are somewhat optimized:  they use the same NOTHING node as
2615  * both the endmarker for their branch list and the body of the last branch.
2616  * It might seem that this node could be dispensed with entirely, but the
2617  * endmarker role is not redundant.
2618  */
2619 STATIC regnode *
2620 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2621 {
2622     register regnode *ret;
2623     register char op;
2624     register char *next;
2625     I32 flags;
2626     const char * const origparse = RExC_parse;
2627     char *maxpos;
2628     I32 min;
2629     I32 max = REG_INFTY;
2630     char *parse_start;
2631
2632     ret = regatom(pRExC_state, &flags);
2633     if (ret == NULL) {
2634         if (flags & TRYAGAIN)
2635             *flagp |= TRYAGAIN;
2636         return(NULL);
2637     }
2638
2639     op = *RExC_parse;
2640
2641     if (op == '{' && regcurly(RExC_parse)) {
2642         parse_start = RExC_parse; /* MJD */
2643         next = RExC_parse + 1;
2644         maxpos = Nullch;
2645         while (isDIGIT(*next) || *next == ',') {
2646             if (*next == ',') {
2647                 if (maxpos)
2648                     break;
2649                 else
2650                     maxpos = next;
2651             }
2652             next++;
2653         }
2654         if (*next == '}') {             /* got one */
2655             if (!maxpos)
2656                 maxpos = next;
2657             RExC_parse++;
2658             min = atoi(RExC_parse);
2659             if (*maxpos == ',')
2660                 maxpos++;
2661             else
2662                 maxpos = RExC_parse;
2663             max = atoi(maxpos);
2664             if (!max && *maxpos != '0')
2665                 max = REG_INFTY;                /* meaning "infinity" */
2666             else if (max >= REG_INFTY)
2667                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2668             RExC_parse = next;
2669             nextchar(pRExC_state);
2670
2671         do_curly:
2672             if ((flags&SIMPLE)) {
2673                 RExC_naughty += 2 + RExC_naughty / 2;
2674                 reginsert(pRExC_state, CURLY, ret);
2675                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2676                 Set_Node_Cur_Length(ret);
2677             }
2678             else {
2679                 regnode *w = reg_node(pRExC_state, WHILEM);
2680
2681                 w->flags = 0;
2682                 regtail(pRExC_state, ret, w);
2683                 if (!SIZE_ONLY && RExC_extralen) {
2684                     reginsert(pRExC_state, LONGJMP,ret);
2685                     reginsert(pRExC_state, NOTHING,ret);
2686                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2687                 }
2688                 reginsert(pRExC_state, CURLYX,ret);
2689                                 /* MJD hk */
2690                 Set_Node_Offset(ret, parse_start+1);
2691                 Set_Node_Length(ret,
2692                                 op == '{' ? (RExC_parse - parse_start) : 1);
2693
2694                 if (!SIZE_ONLY && RExC_extralen)
2695                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2696                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2697                 if (SIZE_ONLY)
2698                     RExC_whilem_seen++, RExC_extralen += 3;
2699                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2700             }
2701             ret->flags = 0;
2702
2703             if (min > 0)
2704                 *flagp = WORST;
2705             if (max > 0)
2706                 *flagp |= HASWIDTH;
2707             if (max && max < min)
2708                 vFAIL("Can't do {n,m} with n > m");
2709             if (!SIZE_ONLY) {
2710                 ARG1_SET(ret, (U16)min);
2711                 ARG2_SET(ret, (U16)max);
2712             }
2713
2714             goto nest_check;
2715         }
2716     }
2717
2718     if (!ISMULT1(op)) {
2719         *flagp = flags;
2720         return(ret);
2721     }
2722
2723 #if 0                           /* Now runtime fix should be reliable. */
2724
2725     /* if this is reinstated, don't forget to put this back into perldiag:
2726
2727             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2728
2729            (F) The part of the regexp subject to either the * or + quantifier
2730            could match an empty string. The {#} shows in the regular
2731            expression about where the problem was discovered.
2732
2733     */
2734
2735     if (!(flags&HASWIDTH) && op != '?')
2736       vFAIL("Regexp *+ operand could be empty");
2737 #endif
2738
2739     parse_start = RExC_parse;
2740     nextchar(pRExC_state);
2741
2742     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2743
2744     if (op == '*' && (flags&SIMPLE)) {
2745         reginsert(pRExC_state, STAR, ret);
2746         ret->flags = 0;
2747         RExC_naughty += 4;
2748     }
2749     else if (op == '*') {
2750         min = 0;
2751         goto do_curly;
2752     }
2753     else if (op == '+' && (flags&SIMPLE)) {
2754         reginsert(pRExC_state, PLUS, ret);
2755         ret->flags = 0;
2756         RExC_naughty += 3;
2757     }
2758     else if (op == '+') {
2759         min = 1;
2760         goto do_curly;
2761     }
2762     else if (op == '?') {
2763         min = 0; max = 1;
2764         goto do_curly;
2765     }
2766   nest_check:
2767     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2768         vWARN3(RExC_parse,
2769                "%.*s matches null string many times",
2770                RExC_parse - origparse,
2771                origparse);
2772     }
2773
2774     if (*RExC_parse == '?') {
2775         nextchar(pRExC_state);
2776         reginsert(pRExC_state, MINMOD, ret);
2777         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2778     }
2779     if (ISMULT2(RExC_parse)) {
2780         RExC_parse++;
2781         vFAIL("Nested quantifiers");
2782     }
2783
2784     return(ret);
2785 }
2786
2787 /*
2788  - regatom - the lowest level
2789  *
2790  * Optimization:  gobbles an entire sequence of ordinary characters so that
2791  * it can turn them into a single node, which is smaller to store and
2792  * faster to run.  Backslashed characters are exceptions, each becoming a
2793  * separate node; the code is simpler that way and it's not worth fixing.
2794  *
2795  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2796 STATIC regnode *
2797 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2798 {
2799     register regnode *ret = 0;
2800     I32 flags;
2801     char *parse_start = RExC_parse;
2802
2803     *flagp = WORST;             /* Tentatively. */
2804
2805 tryagain:
2806     switch (*RExC_parse) {
2807     case '^':
2808         RExC_seen_zerolen++;
2809         nextchar(pRExC_state);
2810         if (RExC_flags & PMf_MULTILINE)
2811             ret = reg_node(pRExC_state, MBOL);
2812         else if (RExC_flags & PMf_SINGLELINE)
2813             ret = reg_node(pRExC_state, SBOL);
2814         else
2815             ret = reg_node(pRExC_state, BOL);
2816         Set_Node_Length(ret, 1); /* MJD */
2817         break;
2818     case '$':
2819         nextchar(pRExC_state);
2820         if (*RExC_parse)
2821             RExC_seen_zerolen++;
2822         if (RExC_flags & PMf_MULTILINE)
2823             ret = reg_node(pRExC_state, MEOL);
2824         else if (RExC_flags & PMf_SINGLELINE)
2825             ret = reg_node(pRExC_state, SEOL);
2826         else
2827             ret = reg_node(pRExC_state, EOL);
2828         Set_Node_Length(ret, 1); /* MJD */
2829         break;
2830     case '.':
2831         nextchar(pRExC_state);
2832         if (RExC_flags & PMf_SINGLELINE)
2833             ret = reg_node(pRExC_state, SANY);
2834         else
2835             ret = reg_node(pRExC_state, REG_ANY);
2836         *flagp |= HASWIDTH|SIMPLE;
2837         RExC_naughty++;
2838         Set_Node_Length(ret, 1); /* MJD */
2839         break;
2840     case '[':
2841     {
2842         char *oregcomp_parse = ++RExC_parse;
2843         ret = regclass(pRExC_state);
2844         if (*RExC_parse != ']') {
2845             RExC_parse = oregcomp_parse;
2846             vFAIL("Unmatched [");
2847         }
2848         nextchar(pRExC_state);
2849         *flagp |= HASWIDTH|SIMPLE;
2850         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2851         break;
2852     }
2853     case '(':
2854         nextchar(pRExC_state);
2855         ret = reg(pRExC_state, 1, &flags);
2856         if (ret == NULL) {
2857                 if (flags & TRYAGAIN) {
2858                     if (RExC_parse == RExC_end) {
2859                          /* Make parent create an empty node if needed. */
2860                         *flagp |= TRYAGAIN;
2861                         return(NULL);
2862                     }
2863                     goto tryagain;
2864                 }
2865                 return(NULL);
2866         }
2867         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2868         break;
2869     case '|':
2870     case ')':
2871         if (flags & TRYAGAIN) {
2872             *flagp |= TRYAGAIN;
2873             return NULL;
2874         }
2875         vFAIL("Internal urp");
2876                                 /* Supposed to be caught earlier. */
2877         break;
2878     case '{':
2879         if (!regcurly(RExC_parse)) {
2880             RExC_parse++;
2881             goto defchar;
2882         }
2883         /* FALL THROUGH */
2884     case '?':
2885     case '+':
2886     case '*':
2887         RExC_parse++;
2888         vFAIL("Quantifier follows nothing");
2889         break;
2890     case '\\':
2891         switch (*++RExC_parse) {
2892         case 'A':
2893             RExC_seen_zerolen++;
2894             ret = reg_node(pRExC_state, SBOL);
2895             *flagp |= SIMPLE;
2896             nextchar(pRExC_state);
2897             Set_Node_Length(ret, 2); /* MJD */
2898             break;
2899         case 'G':
2900             ret = reg_node(pRExC_state, GPOS);
2901             RExC_seen |= REG_SEEN_GPOS;
2902             *flagp |= SIMPLE;
2903             nextchar(pRExC_state);
2904             Set_Node_Length(ret, 2); /* MJD */
2905             break;
2906         case 'Z':
2907             ret = reg_node(pRExC_state, SEOL);
2908             *flagp |= SIMPLE;
2909             RExC_seen_zerolen++;                /* Do not optimize RE away */
2910             nextchar(pRExC_state);
2911             break;
2912         case 'z':
2913             ret = reg_node(pRExC_state, EOS);
2914             *flagp |= SIMPLE;
2915             RExC_seen_zerolen++;                /* Do not optimize RE away */
2916             nextchar(pRExC_state);
2917             Set_Node_Length(ret, 2); /* MJD */
2918             break;
2919         case 'C':
2920             ret = reg_node(pRExC_state, CANY);
2921             RExC_seen |= REG_SEEN_CANY;
2922             *flagp |= HASWIDTH|SIMPLE;
2923             nextchar(pRExC_state);
2924             Set_Node_Length(ret, 2); /* MJD */
2925             break;
2926         case 'X':
2927             ret = reg_node(pRExC_state, CLUMP);
2928             *flagp |= HASWIDTH;
2929             nextchar(pRExC_state);
2930             Set_Node_Length(ret, 2); /* MJD */
2931             break;
2932         case 'w':
2933             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2934             *flagp |= HASWIDTH|SIMPLE;
2935             nextchar(pRExC_state);
2936             Set_Node_Length(ret, 2); /* MJD */
2937             break;
2938         case 'W':
2939             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2940             *flagp |= HASWIDTH|SIMPLE;
2941             nextchar(pRExC_state);
2942             Set_Node_Length(ret, 2); /* MJD */
2943             break;
2944         case 'b':
2945             RExC_seen_zerolen++;
2946             RExC_seen |= REG_SEEN_LOOKBEHIND;
2947             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2948             *flagp |= SIMPLE;
2949             nextchar(pRExC_state);
2950             Set_Node_Length(ret, 2); /* MJD */
2951             break;
2952         case 'B':
2953             RExC_seen_zerolen++;
2954             RExC_seen |= REG_SEEN_LOOKBEHIND;
2955             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2956             *flagp |= SIMPLE;
2957             nextchar(pRExC_state);
2958             Set_Node_Length(ret, 2); /* MJD */
2959             break;
2960         case 's':
2961             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2962             *flagp |= HASWIDTH|SIMPLE;
2963             nextchar(pRExC_state);
2964             Set_Node_Length(ret, 2); /* MJD */
2965             break;
2966         case 'S':
2967             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2968             *flagp |= HASWIDTH|SIMPLE;
2969             nextchar(pRExC_state);
2970             Set_Node_Length(ret, 2); /* MJD */
2971             break;
2972         case 'd':
2973             ret = reg_node(pRExC_state, DIGIT);
2974             *flagp |= HASWIDTH|SIMPLE;
2975             nextchar(pRExC_state);
2976             Set_Node_Length(ret, 2); /* MJD */
2977             break;
2978         case 'D':
2979             ret = reg_node(pRExC_state, NDIGIT);
2980             *flagp |= HASWIDTH|SIMPLE;
2981             nextchar(pRExC_state);
2982             Set_Node_Length(ret, 2); /* MJD */
2983             break;
2984         case 'p':
2985         case 'P':
2986             {   
2987                 char* oldregxend = RExC_end;
2988                 char* parse_start = RExC_parse - 2;
2989
2990                 if (RExC_parse[1] == '{') {
2991                   /* a lovely hack--pretend we saw [\pX] instead */
2992                     RExC_end = strchr(RExC_parse, '}');
2993                     if (!RExC_end) {
2994                         U8 c = (U8)*RExC_parse;
2995                         RExC_parse += 2;
2996                         RExC_end = oldregxend;
2997                         vFAIL2("Missing right brace on \\%c{}", c);
2998                     }
2999                     RExC_end++;
3000                 }
3001                 else {
3002                     RExC_end = RExC_parse + 2;
3003                     if (RExC_end > oldregxend)
3004                         RExC_end = oldregxend;
3005                 }
3006                 RExC_parse--;
3007
3008                 ret = regclass(pRExC_state);
3009
3010                 RExC_end = oldregxend;
3011                 RExC_parse--;
3012
3013                 Set_Node_Offset(ret, parse_start + 2);
3014                 Set_Node_Cur_Length(ret);
3015                 nextchar(pRExC_state);
3016                 *flagp |= HASWIDTH|SIMPLE;
3017             }
3018             break;
3019         case 'n':
3020         case 'r':
3021         case 't':
3022         case 'f':
3023         case 'e':
3024         case 'a':
3025         case 'x':
3026         case 'c':
3027         case '0':
3028             goto defchar;
3029         case '1': case '2': case '3': case '4':
3030         case '5': case '6': case '7': case '8': case '9':
3031             {
3032                 const I32 num = atoi(RExC_parse);
3033
3034                 if (num > 9 && num >= RExC_npar)
3035                     goto defchar;
3036                 else {
3037                     char * parse_start = RExC_parse - 1; /* MJD */
3038                     while (isDIGIT(*RExC_parse))
3039                         RExC_parse++;
3040
3041                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3042                         vFAIL("Reference to nonexistent group");
3043                     RExC_sawback = 1;
3044                     ret = reganode(pRExC_state,
3045                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3046                                    num);
3047                     *flagp |= HASWIDTH;
3048
3049                     /* override incorrect value set in reganode MJD */
3050                     Set_Node_Offset(ret, parse_start+1);
3051                     Set_Node_Cur_Length(ret); /* MJD */
3052                     RExC_parse--;
3053                     nextchar(pRExC_state);
3054                 }
3055             }
3056             break;
3057         case '\0':
3058             if (RExC_parse >= RExC_end)
3059                 FAIL("Trailing \\");
3060             /* FALL THROUGH */
3061         default:
3062             /* Do not generate "unrecognized" warnings here, we fall
3063                back into the quick-grab loop below */
3064             parse_start--;
3065             goto defchar;
3066         }
3067         break;
3068
3069     case '#':
3070         if (RExC_flags & PMf_EXTENDED) {
3071             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3072             if (RExC_parse < RExC_end)
3073                 goto tryagain;
3074         }
3075         /* FALL THROUGH */
3076
3077     default: {
3078             register STRLEN len;
3079             register UV ender;
3080             register char *p;
3081             char *oldp, *s;
3082             STRLEN foldlen;
3083             U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3084
3085             parse_start = RExC_parse - 1;
3086
3087             RExC_parse++;
3088
3089         defchar:
3090             ender = 0;
3091             ret = reg_node(pRExC_state,
3092                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3093             s = STRING(ret);
3094             for (len = 0, p = RExC_parse - 1;
3095               len < 127 && p < RExC_end;
3096               len++)
3097             {
3098                 oldp = p;
3099
3100                 if (RExC_flags & PMf_EXTENDED)
3101                     p = regwhite(p, RExC_end);
3102                 switch (*p) {
3103                 case '^':
3104                 case '$':
3105                 case '.':
3106                 case '[':
3107                 case '(':
3108                 case ')':
3109                 case '|':
3110                     goto loopdone;
3111                 case '\\':
3112                     switch (*++p) {
3113                     case 'A':
3114                     case 'C':
3115                     case 'X':
3116                     case 'G':
3117                     case 'Z':
3118                     case 'z':
3119                     case 'w':
3120                     case 'W':
3121                     case 'b':
3122                     case 'B':
3123                     case 's':
3124                     case 'S':
3125                     case 'd':
3126                     case 'D':
3127                     case 'p':
3128                     case 'P':
3129                         --p;
3130                         goto loopdone;
3131                     case 'n':
3132                         ender = '\n';
3133                         p++;
3134                         break;
3135                     case 'r':
3136                         ender = '\r';
3137                         p++;
3138                         break;
3139                     case 't':
3140                         ender = '\t';
3141                         p++;
3142                         break;
3143                     case 'f':
3144                         ender = '\f';
3145                         p++;
3146                         break;
3147                     case 'e':
3148                           ender = ASCII_TO_NATIVE('\033');
3149                         p++;
3150                         break;
3151                     case 'a':
3152                           ender = ASCII_TO_NATIVE('\007');
3153                         p++;
3154                         break;
3155                     case 'x':
3156                         if (*++p == '{') {
3157                             char* const e = strchr(p, '}');
3158         
3159                             if (!e) {
3160                                 RExC_parse = p + 1;
3161                                 vFAIL("Missing right brace on \\x{}");
3162                             }
3163                             else {
3164                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3165                                     | PERL_SCAN_DISALLOW_PREFIX;
3166                                 STRLEN numlen = e - p - 1;
3167                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3168                                 if (ender > 0xff)
3169                                     RExC_utf8 = 1;
3170                                 p = e + 1;
3171                             }
3172                         }
3173                         else {
3174                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3175                             STRLEN numlen = 2;
3176                             ender = grok_hex(p, &numlen, &flags, NULL);
3177                             p += numlen;
3178                         }
3179                         break;
3180                     case 'c':
3181                         p++;
3182                         ender = UCHARAT(p++);
3183                         ender = toCTRL(ender);
3184                         break;
3185                     case '0': case '1': case '2': case '3':case '4':
3186                     case '5': case '6': case '7': case '8':case '9':
3187                         if (*p == '0' ||
3188                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3189                             I32 flags = 0;
3190                             STRLEN numlen = 3;
3191                             ender = grok_oct(p, &numlen, &flags, NULL);
3192                             p += numlen;
3193                         }
3194                         else {
3195                             --p;
3196                             goto loopdone;
3197                         }
3198                         break;
3199                     case '\0':
3200                         if (p >= RExC_end)
3201                             FAIL("Trailing \\");
3202                         /* FALL THROUGH */
3203                     default:
3204                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3205                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3206                         goto normal_default;
3207                     }
3208                     break;
3209                 default:
3210                   normal_default:
3211                     if (UTF8_IS_START(*p) && UTF) {
3212                         STRLEN numlen;
3213                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3214                                                &numlen, 0);
3215                         p += numlen;
3216                     }
3217                     else
3218                         ender = *p++;
3219                     break;
3220                 }
3221                 if (RExC_flags & PMf_EXTENDED)
3222                     p = regwhite(p, RExC_end);
3223                 if (UTF && FOLD) {
3224                     /* Prime the casefolded buffer. */
3225                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3226                 }
3227                 if (ISMULT2(p)) { /* Back off on ?+*. */
3228                     if (len)
3229                         p = oldp;
3230                     else if (UTF) {
3231                          STRLEN unilen;
3232
3233                          if (FOLD) {
3234                               /* Emit all the Unicode characters. */
3235                               STRLEN numlen;
3236                               for (foldbuf = tmpbuf;
3237                                    foldlen;
3238                                    foldlen -= numlen) {
3239                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3240                                    if (numlen > 0) {
3241                                         reguni(pRExC_state, ender, s, &unilen);
3242                                         s       += unilen;
3243                                         len     += unilen;
3244                                         /* In EBCDIC the numlen
3245                                          * and unilen can differ. */
3246                                         foldbuf += numlen;
3247                                         if (numlen >= foldlen)
3248                                              break;
3249                                    }
3250                                    else
3251                                         break; /* "Can't happen." */
3252                               }
3253                          }
3254                          else {
3255                               reguni(pRExC_state, ender, s, &unilen);
3256                               if (unilen > 0) {
3257                                    s   += unilen;
3258                                    len += unilen;
3259                               }
3260                          }
3261                     }
3262                     else {
3263                         len++;
3264                         REGC((char)ender, s++);
3265                     }
3266                     break;
3267                 }
3268                 if (UTF) {
3269                      STRLEN unilen;
3270
3271                      if (FOLD) {
3272                           /* Emit all the Unicode characters. */
3273                           STRLEN numlen;
3274                           for (foldbuf = tmpbuf;
3275                                foldlen;
3276                                foldlen -= numlen) {
3277                                ender = utf8_to_uvchr(foldbuf, &numlen);
3278                                if (numlen > 0) {
3279                                     reguni(pRExC_state, ender, s, &unilen);
3280                                     len     += unilen;
3281                                     s       += unilen;
3282                                     /* In EBCDIC the numlen
3283                                      * and unilen can differ. */
3284                                     foldbuf += numlen;
3285                                     if (numlen >= foldlen)
3286                                          break;
3287                                }
3288                                else
3289                                     break;
3290                           }
3291                      }
3292                      else {
3293                           reguni(pRExC_state, ender, s, &unilen);
3294                           if (unilen > 0) {
3295                                s   += unilen;
3296                                len += unilen;
3297                           }
3298                      }
3299                      len--;
3300                 }
3301                 else
3302                     REGC((char)ender, s++);
3303             }
3304         loopdone:
3305             RExC_parse = p - 1;
3306             Set_Node_Cur_Length(ret); /* MJD */
3307             nextchar(pRExC_state);
3308             {
3309                 /* len is STRLEN which is unsigned, need to copy to signed */
3310                 IV iv = len;
3311                 if (iv < 0)
3312                     vFAIL("Internal disaster");
3313             }
3314             if (len > 0)
3315                 *flagp |= HASWIDTH;
3316             if (len == 1 && UNI_IS_INVARIANT(ender))
3317                 *flagp |= SIMPLE;
3318             if (!SIZE_ONLY)
3319                 STR_LEN(ret) = len;
3320             if (SIZE_ONLY)
3321                 RExC_size += STR_SZ(len);
3322             else
3323                 RExC_emit += STR_SZ(len);
3324         }
3325         break;
3326     }
3327
3328     /* If the encoding pragma is in effect recode the text of
3329      * any EXACT-kind nodes. */
3330     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331         STRLEN oldlen = STR_LEN(ret);
3332         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3333
3334         if (RExC_utf8)
3335             SvUTF8_on(sv);
3336         if (sv_utf8_downgrade(sv, TRUE)) {
3337             const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3338             const STRLEN newlen = SvCUR(sv);
3339
3340             if (SvUTF8(sv))
3341                 RExC_utf8 = 1;
3342             if (!SIZE_ONLY) {
3343                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344                                       (int)oldlen, STRING(ret),
3345                                       (int)newlen, s));
3346                 Copy(s, STRING(ret), newlen, char);
3347                 STR_LEN(ret) += newlen - oldlen;
3348                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3349             } else
3350                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3351         }
3352     }
3353
3354     return(ret);
3355 }
3356
3357 STATIC char *
3358 S_regwhite(pTHX_ char *p, const char *e)
3359 {
3360     while (p < e) {
3361         if (isSPACE(*p))
3362             ++p;
3363         else if (*p == '#') {
3364             do {
3365                 p++;
3366             } while (p < e && *p != '\n');
3367         }
3368         else
3369             break;
3370     }
3371     return p;
3372 }
3373
3374 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375    Character classes ([:foo:]) can also be negated ([:^foo:]).
3376    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378    but trigger failures because they are currently unimplemented. */
3379
3380 #define POSIXCC_DONE(c)   ((c) == ':')
3381 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3383
3384 STATIC I32
3385 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3386 {
3387     char *posixcc = 0;
3388     I32 namedclass = OOB_NAMEDCLASS;
3389
3390     if (value == '[' && RExC_parse + 1 < RExC_end &&
3391         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392         POSIXCC(UCHARAT(RExC_parse))) {
3393         const char c = UCHARAT(RExC_parse);
3394         char* s = RExC_parse++;
3395         
3396         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3397             RExC_parse++;
3398         if (RExC_parse == RExC_end)
3399             /* Grandfather lone [:, [=, [. */
3400             RExC_parse = s;
3401         else {
3402             const char* t = RExC_parse++; /* skip over the c */
3403
3404             assert(*t == c);
3405
3406             if (UCHARAT(RExC_parse) == ']') {
3407                 RExC_parse++; /* skip over the ending ] */
3408                 posixcc = s + 1;
3409                 if (*s == ':') {
3410                     const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3411                     const I32 skip = t - posixcc;
3412
3413                     /* Initially switch on the length of the name.  */
3414                     switch (skip) {
3415                     case 4:
3416                         if (memEQ(posixcc, "word", 4)) {
3417                             /* this is not POSIX, this is the Perl \w */;
3418                             namedclass
3419                                 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3420                         }
3421                         break;
3422                     case 5:
3423                         /* Names all of length 5.  */
3424                         /* alnum alpha ascii blank cntrl digit graph lower
3425                            print punct space upper  */
3426                         /* Offset 4 gives the best switch position.  */
3427                         switch (posixcc[4]) {
3428                         case 'a':
3429                             if (memEQ(posixcc, "alph", 4)) {
3430                                 /*                  a     */
3431                                 namedclass
3432                                     = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3433                             }
3434                             break;
3435                         case 'e':
3436                             if (memEQ(posixcc, "spac", 4)) {
3437                                 /*                  e     */
3438                                 namedclass
3439                                     = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3440                             }
3441                             break;
3442                         case 'h':
3443                             if (memEQ(posixcc, "grap", 4)) {
3444                                 /*                  h     */
3445                                 namedclass
3446                                     = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3447                             }
3448                             break;
3449                         case 'i':
3450                             if (memEQ(posixcc, "asci", 4)) {
3451                                 /*                  i     */
3452                                 namedclass
3453                                     = complement ? ANYOF_NASCII : ANYOF_ASCII;
3454                             }
3455                             break;
3456                         case 'k':
3457                             if (memEQ(posixcc, "blan", 4)) {
3458                                 /*                  k     */
3459                                 namedclass
3460                                     = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3461                             }
3462                             break;
3463                         case 'l':
3464                             if (memEQ(posixcc, "cntr", 4)) {
3465                                 /*                  l     */
3466                                 namedclass
3467                                     = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3468                             }
3469                             break;
3470                         case 'm':
3471                             if (memEQ(posixcc, "alnu", 4)) {
3472                                 /*                  m     */
3473                                 namedclass
3474                                     = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3475                             }
3476                             break;
3477                         case 'r':
3478                             if (memEQ(posixcc, "lowe", 4)) {
3479                                 /*                  r     */
3480                                 namedclass
3481                                     = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3482                             }
3483                             if (memEQ(posixcc, "uppe", 4)) {
3484                                 /*                  r     */
3485                                 namedclass
3486                                     = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3487                             }
3488                             break;
3489                         case 't':
3490                             if (memEQ(posixcc, "digi", 4)) {
3491                                 /*                  t     */
3492                                 namedclass
3493                                     = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3494                             }
3495                             if (memEQ(posixcc, "prin", 4)) {
3496                                 /*                  t     */
3497                                 namedclass
3498                                     = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3499                             }
3500                             if (memEQ(posixcc, "punc", 4)) {
3501                                 /*                  t     */
3502                                 namedclass
3503                                     = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3504                             }
3505                             break;
3506                         }
3507                         break;
3508                     case 6:
3509                         if (memEQ(posixcc, "xdigit", 6)) {
3510                             namedclass
3511                                 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3512                         }
3513                         break;
3514                     }
3515
3516                     if (namedclass == OOB_NAMEDCLASS)
3517                     {
3518                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3519                                       t - s - 1, s + 1);
3520                     }
3521                     assert (posixcc[skip] == ':');
3522                     assert (posixcc[skip+1] == ']');
3523                 } else if (!SIZE_ONLY) {
3524                     /* [[=foo=]] and [[.foo.]] are still future. */
3525
3526                     /* adjust RExC_parse so the warning shows after
3527                        the class closes */
3528                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3529                         RExC_parse++;
3530                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3531                 }
3532             } else {
3533                 /* Maternal grandfather:
3534                  * "[:" ending in ":" but not in ":]" */
3535                 RExC_parse = s;
3536             }
3537         }
3538     }
3539
3540     return namedclass;
3541 }
3542
3543 STATIC void
3544 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3545 {
3546     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3547         const char *s = RExC_parse;
3548         const char  c = *s++;
3549
3550         while(*s && isALNUM(*s))
3551             s++;
3552         if (*s && c == *s && s[1] == ']') {
3553             if (ckWARN(WARN_REGEXP))
3554                 vWARN3(s+2,
3555                         "POSIX syntax [%c %c] belongs inside character classes",
3556                         c, c);
3557
3558             /* [[=foo=]] and [[.foo.]] are still future. */
3559             if (POSIXCC_NOTYET(c)) {
3560                 /* adjust RExC_parse so the error shows after
3561                    the class closes */
3562                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3563                     ;
3564                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3565             }
3566         }
3567     }
3568 }
3569
3570 STATIC regnode *
3571 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3572 {
3573     register UV value;
3574     register UV nextvalue;
3575     register IV prevvalue = OOB_UNICODE;
3576     register IV range = 0;
3577     register regnode *ret;
3578     STRLEN numlen;
3579     IV namedclass;
3580     char *rangebegin = 0;
3581     bool need_class = 0;
3582     SV *listsv = Nullsv;
3583     register char *e;
3584     UV n;
3585     bool optimize_invert   = TRUE;
3586     AV* unicode_alternate  = 0;
3587 #ifdef EBCDIC
3588     UV literal_endpoint = 0;
3589 #endif
3590
3591     ret = reganode(pRExC_state, ANYOF, 0);
3592
3593     if (!SIZE_ONLY)
3594         ANYOF_FLAGS(ret) = 0;
3595
3596     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3597         RExC_naughty++;
3598         RExC_parse++;
3599         if (!SIZE_ONLY)
3600             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3601     }
3602
3603     if (SIZE_ONLY)
3604         RExC_size += ANYOF_SKIP;
3605     else {
3606         RExC_emit += ANYOF_SKIP;
3607         if (FOLD)
3608             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3609         if (LOC)
3610             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3611         ANYOF_BITMAP_ZERO(ret);
3612         listsv = newSVpvn("# comment\n", 10);
3613     }
3614
3615     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3616
3617     if (!SIZE_ONLY && POSIXCC(nextvalue))
3618         checkposixcc(pRExC_state);
3619
3620     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3621     if (UCHARAT(RExC_parse) == ']')
3622         goto charclassloop;
3623
3624     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3625
3626     charclassloop:
3627
3628         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3629
3630         if (!range)
3631             rangebegin = RExC_parse;
3632         if (UTF) {
3633             value = utf8n_to_uvchr((U8*)RExC_parse,
3634                                    RExC_end - RExC_parse,
3635                                    &numlen, 0);
3636             RExC_parse += numlen;
3637         }
3638         else
3639             value = UCHARAT(RExC_parse++);
3640         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3641         if (value == '[' && POSIXCC(nextvalue))
3642             namedclass = regpposixcc(pRExC_state, value);
3643         else if (value == '\\') {
3644             if (UTF) {
3645                 value = utf8n_to_uvchr((U8*)RExC_parse,
3646                                    RExC_end - RExC_parse,
3647                                    &numlen, 0);
3648                 RExC_parse += numlen;
3649             }
3650             else
3651                 value = UCHARAT(RExC_parse++);
3652             /* Some compilers cannot handle switching on 64-bit integer
3653              * values, therefore value cannot be an UV.  Yes, this will
3654              * be a problem later if we want switch on Unicode.
3655              * A similar issue a little bit later when switching on
3656              * namedclass. --jhi */
3657             switch ((I32)value) {
3658             case 'w':   namedclass = ANYOF_ALNUM;       break;
3659             case 'W':   namedclass = ANYOF_NALNUM;      break;
3660             case 's':   namedclass = ANYOF_SPACE;       break;
3661             case 'S':   namedclass = ANYOF_NSPACE;      break;
3662             case 'd':   namedclass = ANYOF_DIGIT;       break;
3663             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3664             case 'p':
3665             case 'P':
3666                 if (RExC_parse >= RExC_end)
3667                     vFAIL2("Empty \\%c{}", (U8)value);
3668                 if (*RExC_parse == '{') {
3669                     const U8 c = (U8)value;
3670                     e = strchr(RExC_parse++, '}');
3671                     if (!e)
3672                         vFAIL2("Missing right brace on \\%c{}", c);
3673                     while (isSPACE(UCHARAT(RExC_parse)))
3674                         RExC_parse++;
3675                     if (e == RExC_parse)
3676                         vFAIL2("Empty \\%c{}", c);
3677                     n = e - RExC_parse;
3678                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3679                         n--;
3680                 }
3681                 else {
3682                     e = RExC_parse;
3683                     n = 1;
3684                 }
3685                 if (!SIZE_ONLY) {
3686                     if (UCHARAT(RExC_parse) == '^') {
3687                          RExC_parse++;
3688                          n--;
3689                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3690                          while (isSPACE(UCHARAT(RExC_parse))) {
3691                               RExC_parse++;
3692                               n--;
3693                          }
3694                     }
3695                     if (value == 'p')
3696                          Perl_sv_catpvf(aTHX_ listsv,
3697                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3698                     else
3699                          Perl_sv_catpvf(aTHX_ listsv,
3700                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3701                 }
3702                 RExC_parse = e + 1;
3703                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3704                 namedclass = ANYOF_MAX;  /* no official name, but it's named */
3705                 break;
3706             case 'n':   value = '\n';                   break;
3707             case 'r':   value = '\r';                   break;
3708             case 't':   value = '\t';                   break;
3709             case 'f':   value = '\f';                   break;
3710             case 'b':   value = '\b';                   break;
3711             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3712             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3713             case 'x':
3714                 if (*RExC_parse == '{') {
3715                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3716                         | PERL_SCAN_DISALLOW_PREFIX;
3717                     e = strchr(RExC_parse++, '}');
3718                     if (!e)
3719                         vFAIL("Missing right brace on \\x{}");
3720
3721                     numlen = e - RExC_parse;
3722                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3723                     RExC_parse = e + 1;
3724                 }
3725                 else {
3726                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3727                     numlen = 2;
3728                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3729                     RExC_parse += numlen;
3730                 }
3731                 break;
3732             case 'c':
3733                 value = UCHARAT(RExC_parse++);
3734                 value = toCTRL(value);
3735                 break;
3736             case '0': case '1': case '2': case '3': case '4':
3737             case '5': case '6': case '7': case '8': case '9':
3738             {
3739                 I32 flags = 0;
3740                 numlen = 3;
3741                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3742                 RExC_parse += numlen;
3743                 break;
3744             }
3745             default:
3746                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3747                     vWARN2(RExC_parse,
3748                            "Unrecognized escape \\%c in character class passed through",
3749                            (int)value);
3750                 break;
3751             }
3752         } /* end of \blah */
3753 #ifdef EBCDIC
3754         else
3755             literal_endpoint++;
3756 #endif
3757
3758         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3759
3760             if (!SIZE_ONLY && !need_class)
3761                 ANYOF_CLASS_ZERO(ret);
3762
3763             need_class = 1;
3764
3765             /* a bad range like a-\d, a-[:digit:] ? */
3766             if (range) {
3767                 if (!SIZE_ONLY) {
3768                     if (ckWARN(WARN_REGEXP))
3769                         vWARN4(RExC_parse,
3770                                "False [] range \"%*.*s\"",
3771                                RExC_parse - rangebegin,
3772                                RExC_parse - rangebegin,
3773                                rangebegin);
3774                     if (prevvalue < 256) {
3775                         ANYOF_BITMAP_SET(ret, prevvalue);
3776                         ANYOF_BITMAP_SET(ret, '-');
3777                     }
3778                     else {
3779                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3780                         Perl_sv_catpvf(aTHX_ listsv,
3781                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3782                     }
3783                 }
3784
3785                 range = 0; /* this was not a true range */
3786             }
3787
3788             if (!SIZE_ONLY) {
3789                 const char *what = NULL;
3790                 char yesno = 0;
3791
3792                 if (namedclass > OOB_NAMEDCLASS)
3793                     optimize_invert = FALSE;
3794                 /* Possible truncation here but in some 64-bit environments
3795                  * the compiler gets heartburn about switch on 64-bit values.
3796                  * A similar issue a little earlier when switching on value.
3797                  * --jhi */
3798                 switch ((I32)namedclass) {
3799                 case ANYOF_ALNUM:
3800                     if (LOC)
3801                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3802                     else {
3803                         for (value = 0; value < 256; value++)
3804                             if (isALNUM(value))
3805                                 ANYOF_BITMAP_SET(ret, value);
3806                     }
3807                     yesno = '+';
3808                     what = "Word";      
3809                     break;
3810                 case ANYOF_NALNUM:
3811                     if (LOC)
3812                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3813                     else {
3814                         for (value = 0; value < 256; value++)
3815                             if (!isALNUM(value))
3816                                 ANYOF_BITMAP_SET(ret, value);
3817                     }
3818                     yesno = '!';
3819                     what = "Word";
3820                     break;
3821                 case ANYOF_ALNUMC:
3822                     if (LOC)
3823                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3824                     else {
3825                         for (value = 0; value < 256; value++)
3826                             if (isALNUMC(value))
3827                                 ANYOF_BITMAP_SET(ret, value);
3828                     }
3829                     yesno = '+';
3830                     what = "Alnum";
3831                     break;
3832                 case ANYOF_NALNUMC:
3833                     if (LOC)
3834                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3835                     else {
3836                         for (value = 0; value < 256; value++)
3837                             if (!isALNUMC(value))
3838                                 ANYOF_BITMAP_SET(ret, value);
3839                     }
3840                     yesno = '!';
3841                     what = "Alnum";
3842                     break;
3843                 case ANYOF_ALPHA:
3844                     if (LOC)
3845                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3846                     else {
3847                         for (value = 0; value < 256; value++)
3848                             if (isALPHA(value))
3849                                 ANYOF_BITMAP_SET(ret, value);
3850                     }
3851                     yesno = '+';
3852                     what = "Alpha";
3853                     break;
3854                 case ANYOF_NALPHA:
3855                     if (LOC)
3856                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3857                     else {
3858                         for (value = 0; value < 256; value++)
3859                             if (!isALPHA(value))
3860                                 ANYOF_BITMAP_SET(ret, value);
3861                     }
3862                     yesno = '!';
3863                     what = "Alpha";
3864                     break;
3865                 case ANYOF_ASCII:
3866                     if (LOC)
3867                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3868                     else {
3869 #ifndef EBCDIC
3870                         for (value = 0; value < 128; value++)
3871                             ANYOF_BITMAP_SET(ret, value);
3872 #else  /* EBCDIC */
3873                         for (value = 0; value < 256; value++) {
3874                             if (isASCII(value))
3875                                 ANYOF_BITMAP_SET(ret, value);
3876                         }
3877 #endif /* EBCDIC */
3878                     }
3879                     yesno = '+';
3880                     what = "ASCII";
3881                     break;
3882                 case ANYOF_NASCII:
3883                     if (LOC)
3884                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3885                     else {
3886 #ifndef EBCDIC
3887                         for (value = 128; value < 256; value++)
3888                             ANYOF_BITMAP_SET(ret, value);
3889 #else  /* EBCDIC */
3890                         for (value = 0; value < 256; value++) {
3891                             if (!isASCII(value))
3892                                 ANYOF_BITMAP_SET(ret, value);
3893                         }
3894 #endif /* EBCDIC */
3895                     }
3896                     yesno = '!';
3897                     what = "ASCII";
3898                     break;
3899                 case ANYOF_BLANK:
3900                     if (LOC)
3901                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3902                     else {
3903                         for (value = 0; value < 256; value++)
3904                             if (isBLANK(value))
3905                                 ANYOF_BITMAP_SET(ret, value);
3906                     }
3907                     yesno = '+';
3908                     what = "Blank";
3909                     break;
3910                 case ANYOF_NBLANK:
3911                     if (LOC)
3912                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3913                     else {
3914                         for (value = 0; value < 256; value++)
3915                             if (!isBLANK(value))
3916                                 ANYOF_BITMAP_SET(ret, value);
3917                     }
3918                     yesno = '!';
3919                     what = "Blank";
3920                     break;
3921                 case ANYOF_CNTRL:
3922                     if (LOC)
3923                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3924                     else {
3925                         for (value = 0; value < 256; value++)
3926                             if (isCNTRL(value))
3927                                 ANYOF_BITMAP_SET(ret, value);
3928                     }
3929                     yesno = '+';
3930                     what = "Cntrl";
3931                     break;
3932                 case ANYOF_NCNTRL:
3933                     if (LOC)
3934                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3935                     else {
3936                         for (value = 0; value < 256; value++)
3937                             if (!isCNTRL(value))
3938                                 ANYOF_BITMAP_SET(ret, value);
3939                     }
3940                     yesno = '!';
3941                     what = "Cntrl";
3942                     break;
3943                 case ANYOF_DIGIT:
3944                     if (LOC)
3945                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3946                     else {
3947                         /* consecutive digits assumed */
3948                         for (value = '0'; value <= '9'; value++)
3949                             ANYOF_BITMAP_SET(ret, value);
3950                     }
3951                     yesno = '+';
3952                     what = "Digit";
3953                     break;
3954                 case ANYOF_NDIGIT:
3955                     if (LOC)
3956                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3957                     else {
3958                         /* consecutive digits assumed */
3959                         for (value = 0; value < '0'; value++)
3960                             ANYOF_BITMAP_SET(ret, value);
3961                         for (value = '9' + 1; value < 256; value++)
3962                             ANYOF_BITMAP_SET(ret, value);
3963                     }
3964                     yesno = '!';
3965                     what = "Digit";
3966                     break;
3967                 case ANYOF_GRAPH:
3968                     if (LOC)
3969                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3970                     else {
3971                         for (value = 0; value < 256; value++)
3972                             if (isGRAPH(value))
3973                                 ANYOF_BITMAP_SET(ret, value);
3974                     }
3975                     yesno = '+';
3976                     what = "Graph";
3977                     break;
3978                 case ANYOF_NGRAPH:
3979                     if (LOC)
3980                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3981                     else {
3982                         for (value = 0; value < 256; value++)
3983                             if (!isGRAPH(value))
3984                                 ANYOF_BITMAP_SET(ret, value);
3985                     }
3986                     yesno = '!';
3987                     what = "Graph";
3988                     break;
3989                 case ANYOF_LOWER:
3990                     if (LOC)
3991                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3992                     else {
3993                         for (value = 0; value < 256; value++)
3994                             if (isLOWER(value))
3995                                 ANYOF_BITMAP_SET(ret, value);
3996                     }
3997                     yesno = '+';
3998                     what = "Lower";
3999                     break;
4000                 case ANYOF_NLOWER:
4001                     if (LOC)
4002                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
4003                     else {
4004                         for (value = 0; value < 256; value++)
4005                             if (!isLOWER(value))
4006                                 ANYOF_BITMAP_SET(ret, value);
4007                     }
4008                     yesno = '!';
4009                     what = "Lower";
4010                     break;
4011                 case ANYOF_PRINT:
4012                     if (LOC)
4013                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4014                     else {
4015                         for (value = 0; value < 256; value++)
4016                             if (isPRINT(value))
4017                                 ANYOF_BITMAP_SET(ret, value);
4018                     }
4019                     yesno = '+';
4020                     what = "Print";
4021                     break;
4022                 case ANYOF_NPRINT:
4023                     if (LOC)
4024                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4025                     else {
4026                         for (value = 0; value < 256; value++)
4027                             if (!isPRINT(value))
4028                                 ANYOF_BITMAP_SET(ret, value);
4029                     }
4030                     yesno = '!';
4031                     what = "Print";
4032                     break;
4033                 case ANYOF_PSXSPC:
4034                     if (LOC)
4035                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4036                     else {
4037                         for (value = 0; value < 256; value++)
4038                             if (isPSXSPC(value))
4039                                 ANYOF_BITMAP_SET(ret, value);
4040                     }
4041                     yesno = '+';
4042                     what = "Space";
4043                     break;
4044                 case ANYOF_NPSXSPC:
4045                     if (LOC)
4046                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4047                     else {
4048                         for (value = 0; value < 256; value++)
4049                             if (!isPSXSPC(value))
4050                                 ANYOF_BITMAP_SET(ret, value);
4051                     }
4052                     yesno = '!';
4053                     what = "Space";
4054                     break;
4055                 case ANYOF_PUNCT:
4056                     if (LOC)
4057                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4058                     else {
4059                         for (value = 0; value < 256; value++)
4060                             if (isPUNCT(value))
4061                                 ANYOF_BITMAP_SET(ret, value);
4062                     }
4063                     yesno = '+';
4064                     what = "Punct";
4065                     break;
4066                 case ANYOF_NPUNCT:
4067                     if (LOC)
4068                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4069                     else {
4070                         for (value = 0; value < 256; value++)
4071                             if (!isPUNCT(value))
4072                                 ANYOF_BITMAP_SET(ret, value);
4073                     }
4074                     yesno = '!';
4075                     what = "Punct";
4076                     break;
4077                 case ANYOF_SPACE:
4078                     if (LOC)
4079                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4080                     else {
4081                         for (value = 0; value < 256; value++)
4082                             if (isSPACE(value))
4083                                 ANYOF_BITMAP_SET(ret, value);
4084                     }
4085                     yesno = '+';
4086                     what = "SpacePerl";
4087                     break;
4088                 case ANYOF_NSPACE:
4089                     if (LOC)
4090                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4091                     else {
4092                         for (value = 0; value < 256; value++)
4093                             if (!isSPACE(value))
4094                                 ANYOF_BITMAP_SET(ret, value);
4095                     }
4096                     yesno = '!';
4097                     what = "SpacePerl";
4098                     break;
4099                 case ANYOF_UPPER:
4100                     if (LOC)
4101                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4102                     else {
4103                         for (value = 0; value < 256; value++)
4104                             if (isUPPER(value))
4105                                 ANYOF_BITMAP_SET(ret, value);
4106                     }
4107                     yesno = '+';
4108                     what = "Upper";
4109                     break;
4110                 case ANYOF_NUPPER:
4111                     if (LOC)
4112                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4113                     else {
4114                         for (value = 0; value < 256; value++)
4115                             if (!isUPPER(value))
4116                                 ANYOF_BITMAP_SET(ret, value);
4117                     }
4118                     yesno = '!';
4119                     what = "Upper";
4120                     break;
4121                 case ANYOF_XDIGIT:
4122                     if (LOC)
4123                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4124                     else {
4125                         for (value = 0; value < 256; value++)
4126                             if (isXDIGIT(value))
4127                                 ANYOF_BITMAP_SET(ret, value);
4128                     }
4129                     yesno = '+';
4130                     what = "XDigit";
4131                     break;
4132                 case ANYOF_NXDIGIT:
4133                     if (LOC)
4134                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4135                     else {
4136                         for (value = 0; value < 256; value++)
4137                             if (!isXDIGIT(value))
4138                                 ANYOF_BITMAP_SET(ret, value);
4139                     }
4140                     yesno = '!';
4141                     what = "XDigit";
4142                     break;
4143                 case ANYOF_MAX:
4144                     /* this is to handle \p and \P */
4145                     break;
4146                 default:
4147                     vFAIL("Invalid [::] class");
4148                     break;
4149                 }
4150                 if (what) {
4151                     /* Strings such as "+utf8::isWord\n" */
4152                     Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4153                 }
4154                 if (LOC)
4155                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4156                 continue;
4157             }
4158         } /* end of namedclass \blah */
4159
4160         if (range) {
4161             if (prevvalue > (IV)value) /* b-a */ {
4162                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4163                               RExC_parse - rangebegin,
4164                               RExC_parse - rangebegin,
4165                               rangebegin);
4166                 range = 0; /* not a valid range */
4167             }
4168         }
4169         else {
4170             prevvalue = value; /* save the beginning of the range */
4171             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4172                 RExC_parse[1] != ']') {
4173                 RExC_parse++;
4174
4175                 /* a bad range like \w-, [:word:]- ? */
4176                 if (namedclass > OOB_NAMEDCLASS) {
4177                     if (ckWARN(WARN_REGEXP))
4178                         vWARN4(RExC_parse,
4179                                "False [] range \"%*.*s\"",
4180                                RExC_parse - rangebegin,
4181                                RExC_parse - rangebegin,
4182                                rangebegin);
4183                     if (!SIZE_ONLY)
4184                         ANYOF_BITMAP_SET(ret, '-');
4185                 } else
4186                     range = 1;  /* yeah, it's a range! */
4187                 continue;       /* but do it the next time */
4188             }
4189         }
4190
4191         /* now is the next time */
4192         if (!SIZE_ONLY) {
4193             IV i;
4194
4195             if (prevvalue < 256) {
4196                 const IV ceilvalue = value < 256 ? value : 255;
4197
4198 #ifdef EBCDIC
4199                 /* In EBCDIC [\x89-\x91] should include
4200                  * the \x8e but [i-j] should not. */
4201                 if (literal_endpoint == 2 &&
4202                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4203                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4204                 {
4205                     if (isLOWER(prevvalue)) {
4206                         for (i = prevvalue; i <= ceilvalue; i++)
4207                             if (isLOWER(i))
4208                                 ANYOF_BITMAP_SET(ret, i);
4209                     } else {
4210                         for (i = prevvalue; i <= ceilvalue; i++)
4211                             if (isUPPER(i))
4212                                 ANYOF_BITMAP_SET(ret, i);
4213                     }
4214                 }
4215                 else
4216 #endif
4217                       for (i = prevvalue; i <= ceilvalue; i++)
4218                           ANYOF_BITMAP_SET(ret, i);
4219           }
4220           if (value > 255 || UTF) {
4221                 const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4222                 const UV natvalue      = NATIVE_TO_UNI(value);
4223
4224                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4225                 if (prevnatvalue < natvalue) { /* what about > ? */
4226                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4227                                    prevnatvalue, natvalue);
4228                 }
4229                 else if (prevnatvalue == natvalue) {
4230                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4231                     if (FOLD) {
4232                          U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4233                          STRLEN foldlen;
4234                          const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4235
4236                          /* If folding and foldable and a single
4237                           * character, insert also the folded version
4238                           * to the charclass. */
4239                          if (f != value) {
4240                               if (foldlen == (STRLEN)UNISKIP(f))
4241                                   Perl_sv_catpvf(aTHX_ listsv,
4242                                                  "%04"UVxf"\n", f);
4243                               else {
4244                                   /* Any multicharacter foldings
4245                                    * require the following transform:
4246                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4247                                    * where E folds into "pq" and F folds
4248                                    * into "rst", all other characters
4249                                    * fold to single characters.  We save
4250                                    * away these multicharacter foldings,
4251                                    * to be later saved as part of the
4252                                    * additional "s" data. */
4253                                   SV *sv;
4254
4255                                   if (!unicode_alternate)
4256                                       unicode_alternate = newAV();
4257                                   sv = newSVpvn((char*)foldbuf, foldlen);
4258                                   SvUTF8_on(sv);
4259                                   av_push(unicode_alternate, sv);
4260                               }
4261                          }
4262
4263                          /* If folding and the value is one of the Greek
4264                           * sigmas insert a few more sigmas to make the
4265                           * folding rules of the sigmas to work right.
4266                           * Note that not all the possible combinations
4267                           * are handled here: some of them are handled
4268                           * by the standard folding rules, and some of
4269                           * them (literal or EXACTF cases) are handled
4270                           * during runtime in regexec.c:S_find_byclass(). */
4271                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4272                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4273                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4274                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4275                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4276                          }
4277                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4278                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4279                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4280                     }
4281                 }
4282             }
4283 #ifdef EBCDIC
4284             literal_endpoint = 0;
4285 #endif
4286         }
4287
4288         range = 0; /* this range (if it was one) is done now */
4289     }
4290
4291     if (need_class) {
4292         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4293         if (SIZE_ONLY)
4294             RExC_size += ANYOF_CLASS_ADD_SKIP;
4295         else
4296             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4297     }
4298
4299     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4300     if (!SIZE_ONLY &&
4301          /* If the only flag is folding (plus possibly inversion). */
4302         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4303        ) {
4304         for (value = 0; value < 256; ++value) {
4305             if (ANYOF_BITMAP_TEST(ret, value)) {
4306                 UV fold = PL_fold[value];
4307
4308                 if (fold != value)
4309                     ANYOF_BITMAP_SET(ret, fold);
4310             }
4311         }
4312         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4313     }
4314
4315     /* optimize inverted simple patterns (e.g. [^a-z]) */
4316     if (!SIZE_ONLY && optimize_invert &&
4317         /* If the only flag is inversion. */
4318         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4319         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4320             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4321         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4322     }
4323
4324     if (!SIZE_ONLY) {
4325         AV *av = newAV();
4326         SV *rv;
4327
4328         /* The 0th element stores the character class description
4329          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4330          * to initialize the appropriate swash (which gets stored in
4331          * the 1st element), and also useful for dumping the regnode.
4332          * The 2nd element stores the multicharacter foldings,
4333          * used later (regexec.c:S_reginclass()). */
4334         av_store(av, 0, listsv);
4335         av_store(av, 1, NULL);
4336         av_store(av, 2, (SV*)unicode_alternate);
4337         rv = newRV_noinc((SV*)av);
4338         n = add_data(pRExC_state, 1, "s");
4339         RExC_rx->data->data[n] = (void*)rv;
4340         ARG_SET(ret, n);
4341     }
4342
4343     return ret;
4344 }
4345
4346 STATIC char*
4347 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4348 {
4349     char* retval = RExC_parse++;
4350
4351     for (;;) {
4352         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4353                 RExC_parse[2] == '#') {
4354             while (*RExC_parse != ')') {
4355                 if (RExC_parse == RExC_end)
4356                     FAIL("Sequence (?#... not terminated");
4357                 RExC_parse++;
4358             }
4359             RExC_parse++;
4360             continue;
4361         }
4362         if (RExC_flags & PMf_EXTENDED) {
4363             if (isSPACE(*RExC_parse)) {
4364                 RExC_parse++;
4365                 continue;
4366             }
4367             else if (*RExC_parse == '#') {
4368                 while (RExC_parse < RExC_end)
4369                     if (*RExC_parse++ == '\n') break;
4370                 continue;
4371             }
4372         }
4373         return retval;
4374     }
4375 }
4376
4377 /*
4378 - reg_node - emit a node
4379 */
4380 STATIC regnode *                        /* Location. */
4381 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4382 {
4383     register regnode *ptr;
4384     regnode * const ret = RExC_emit;
4385
4386     if (SIZE_ONLY) {
4387         SIZE_ALIGN(RExC_size);
4388         RExC_size += 1;
4389         return(ret);
4390     }
4391
4392     NODE_ALIGN_FILL(ret);
4393     ptr = ret;
4394     FILL_ADVANCE_NODE(ptr, op);
4395     if (RExC_offsets) {         /* MJD */
4396         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4397               "reg_node", __LINE__, 
4398               reg_name[op],
4399               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4400               ? "Overwriting end of array!\n" : "OK",
4401               RExC_emit - RExC_emit_start,
4402               RExC_parse - RExC_start,
4403               RExC_offsets[0])); 
4404         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4405     }
4406             
4407     RExC_emit = ptr;
4408
4409     return(ret);
4410 }
4411
4412 /*
4413 - reganode - emit a node with an argument
4414 */
4415 STATIC regnode *                        /* Location. */
4416 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4417 {
4418     register regnode *ptr;
4419     regnode * const ret = RExC_emit;
4420
4421     if (SIZE_ONLY) {
4422         SIZE_ALIGN(RExC_size);
4423         RExC_size += 2;
4424         return(ret);
4425     }
4426
4427     NODE_ALIGN_FILL(ret);
4428     ptr = ret;
4429     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4430     if (RExC_offsets) {         /* MJD */
4431         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4432               "reganode",
4433               __LINE__,
4434               reg_name[op],
4435               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4436               "Overwriting end of array!\n" : "OK",
4437               RExC_emit - RExC_emit_start,
4438               RExC_parse - RExC_start,
4439               RExC_offsets[0])); 
4440         Set_Cur_Node_Offset;
4441     }
4442             
4443     RExC_emit = ptr;
4444
4445     return(ret);
4446 }
4447
4448 /*
4449 - reguni - emit (if appropriate) a Unicode character
4450 */
4451 STATIC void
4452 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4453 {
4454     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4455 }
4456
4457 /*
4458 - reginsert - insert an operator in front of already-emitted operand
4459 *
4460 * Means relocating the operand.
4461 */
4462 STATIC void
4463 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4464 {
4465     register regnode *src;
4466     register regnode *dst;
4467     register regnode *place;
4468     const int offset = regarglen[(U8)op];
4469
4470 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4471
4472     if (SIZE_ONLY) {
4473         RExC_size += NODE_STEP_REGNODE + offset;
4474         return;
4475     }
4476
4477     src = RExC_emit;
4478     RExC_emit += NODE_STEP_REGNODE + offset;
4479     dst = RExC_emit;
4480     while (src > opnd) {
4481         StructCopy(--src, --dst, regnode);
4482         if (RExC_offsets) {     /* MJD 20010112 */
4483             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4484                   "reg_insert",
4485                   __LINE__,
4486                   reg_name[op],
4487                   dst - RExC_emit_start > RExC_offsets[0] 
4488                   ? "Overwriting end of array!\n" : "OK",
4489                   src - RExC_emit_start,
4490                   dst - RExC_emit_start,
4491                   RExC_offsets[0])); 
4492             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4493             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4494         }
4495     }
4496     
4497
4498     place = opnd;               /* Op node, where operand used to be. */
4499     if (RExC_offsets) {         /* MJD */
4500         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4501               "reginsert",
4502               __LINE__,
4503               reg_name[op],
4504               place - RExC_emit_start > RExC_offsets[0] 
4505               ? "Overwriting end of array!\n" : "OK",
4506               place - RExC_emit_start,
4507               RExC_parse - RExC_start,
4508               RExC_offsets[0])); 
4509         Set_Node_Offset(place, RExC_parse);
4510         Set_Node_Length(place, 1);
4511     }
4512     src = NEXTOPER(place);
4513     FILL_ADVANCE_NODE(place, op);
4514     Zero(src, offset, regnode);
4515 }
4516
4517 /*
4518 - regtail - set the next-pointer at the end of a node chain of p to val.
4519 */
4520 STATIC void
4521 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4522 {
4523     register regnode *scan;
4524
4525     if (SIZE_ONLY)
4526         return;
4527
4528     /* Find last node. */
4529     scan = p;
4530     for (;;) {
4531         regnode * const temp = regnext(scan);
4532         if (temp == NULL)
4533             break;
4534         scan = temp;
4535     }
4536
4537     if (reg_off_by_arg[OP(scan)]) {
4538         ARG_SET(scan, val - scan);
4539     }
4540     else {
4541         NEXT_OFF(scan) = val - scan;
4542     }
4543 }
4544
4545 /*
4546 - regoptail - regtail on operand of first argument; nop if operandless
4547 */
4548 STATIC void
4549 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4550 {
4551     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4552     if (p == NULL || SIZE_ONLY)
4553         return;
4554     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4555         regtail(pRExC_state, NEXTOPER(p), val);
4556     }
4557     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4558         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4559     }
4560     else
4561         return;
4562 }
4563
4564 /*
4565  - regcurly - a little FSA that accepts {\d+,?\d*}
4566  */
4567 STATIC I32
4568 S_regcurly(pTHX_ register const char *s)
4569 {
4570     if (*s++ != '{')
4571         return FALSE;
4572     if (!isDIGIT(*s))
4573         return FALSE;
4574     while (isDIGIT(*s))
4575         s++;
4576     if (*s == ',')
4577         s++;
4578     while (isDIGIT(*s))
4579         s++;
4580     if (*s != '}')
4581         return FALSE;
4582     return TRUE;
4583 }
4584
4585
4586 #ifdef DEBUGGING
4587
4588 STATIC regnode *
4589 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4590 {
4591     register U8 op = EXACT;     /* Arbitrary non-END op. */
4592     register regnode *next;
4593
4594     while (op != END && (!last || node < last)) {
4595         /* While that wasn't END last time... */
4596
4597         NODE_ALIGN(node);
4598         op = OP(node);
4599         if (op == CLOSE)
4600             l--;        
4601         next = regnext(node);
4602         /* Where, what. */
4603         if (OP(node) == OPTIMIZED)
4604             goto after_print;
4605         regprop(sv, node);
4606         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4607                       (int)(2*l + 1), "", SvPVX_const(sv));
4608         if (next == NULL)               /* Next ptr. */
4609             PerlIO_printf(Perl_debug_log, "(0)");
4610         else
4611             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4612         (void)PerlIO_putc(Perl_debug_log, '\n');
4613       after_print:
4614         if (PL_regkind[(U8)op] == BRANCHJ) {
4615             register regnode *nnode = (OP(next) == LONGJMP
4616                                        ? regnext(next)
4617                                        : next);
4618             if (last && nnode > last)
4619                 nnode = last;
4620             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4621         }
4622         else if (PL_regkind[(U8)op] == BRANCH) {
4623             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4624         }
4625         else if ( op == CURLY) {   /* "next" might be very big: optimizer */
4626             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4627                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4628         }
4629         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4630             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4631                              next, sv, l + 1);
4632         }
4633         else if ( op == PLUS || op == STAR) {
4634             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4635         }
4636         else if (op == ANYOF) {
4637             /* arglen 1 + class block */
4638             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4639                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4640             node = NEXTOPER(node);
4641         }
4642         else if (PL_regkind[(U8)op] == EXACT) {
4643             /* Literal string, where present. */
4644             node += NODE_SZ_STR(node) - 1;
4645             node = NEXTOPER(node);
4646         }
4647         else {
4648             node = NEXTOPER(node);
4649             node += regarglen[(U8)op];
4650         }
4651         if (op == CURLYX || op == OPEN)
4652             l++;
4653         else if (op == WHILEM)
4654             l--;
4655     }
4656     return node;
4657 }
4658
4659 #endif  /* DEBUGGING */
4660
4661 /*
4662  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4663  */
4664 void
4665 Perl_regdump(pTHX_ regexp *r)
4666 {
4667 #ifdef DEBUGGING
4668     SV *sv = sv_newmortal();
4669
4670     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4671
4672     /* Header fields of interest. */
4673     if (r->anchored_substr)
4674         PerlIO_printf(Perl_debug_log,
4675                       "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4676                       PL_colors[0],
4677                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4678                       SvPVX_const(r->anchored_substr),
4679                       PL_colors[1],
4680                       SvTAIL(r->anchored_substr) ? "$" : "",
4681                       (IV)r->anchored_offset);
4682     else if (r->anchored_utf8)
4683         PerlIO_printf(Perl_debug_log,
4684                       "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4685                       PL_colors[0],
4686                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4687                       SvPVX_const(r->anchored_utf8),
4688                       PL_colors[1],
4689                       SvTAIL(r->anchored_utf8) ? "$" : "",
4690                       (IV)r->anchored_offset);
4691     if (r->float_substr)
4692         PerlIO_printf(Perl_debug_log,
4693                       "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4694                       PL_colors[0],
4695                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4696                       SvPVX_const(r->float_substr),
4697                       PL_colors[1],
4698                       SvTAIL(r->float_substr) ? "$" : "",
4699                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4700     else if (r->float_utf8)
4701         PerlIO_printf(Perl_debug_log,
4702                       "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4703                       PL_colors[0],
4704                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4705                       SvPVX_const(r->float_utf8),
4706                       PL_colors[1],
4707                       SvTAIL(r->float_utf8) ? "$" : "",
4708                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4709     if (r->check_substr || r->check_utf8)
4710         PerlIO_printf(Perl_debug_log,
4711                       r->check_substr == r->float_substr
4712                       && r->check_utf8 == r->float_utf8
4713                       ? "(checking floating" : "(checking anchored");
4714     if (r->reganch & ROPT_NOSCAN)
4715         PerlIO_printf(Perl_debug_log, " noscan");
4716     if (r->reganch & ROPT_CHECK_ALL)
4717         PerlIO_printf(Perl_debug_log, " isall");
4718     if (r->check_substr || r->check_utf8)
4719         PerlIO_printf(Perl_debug_log, ") ");
4720
4721     if (r->regstclass) {
4722         regprop(sv, r->regstclass);
4723         PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4724     }
4725     if (r->reganch & ROPT_ANCH) {
4726         PerlIO_printf(Perl_debug_log, "anchored");
4727         if (r->reganch & ROPT_ANCH_BOL)
4728             PerlIO_printf(Perl_debug_log, "(BOL)");
4729         if (r->reganch & ROPT_ANCH_MBOL)
4730             PerlIO_printf(Perl_debug_log, "(MBOL)");
4731         if (r->reganch & ROPT_ANCH_SBOL)
4732             PerlIO_printf(Perl_debug_log, "(SBOL)");
4733         if (r->reganch & ROPT_ANCH_GPOS)
4734             PerlIO_printf(Perl_debug_log, "(GPOS)");
4735         PerlIO_putc(Perl_debug_log, ' ');
4736     }
4737     if (r->reganch & ROPT_GPOS_SEEN)
4738         PerlIO_printf(Perl_debug_log, "GPOS ");
4739     if (r->reganch & ROPT_SKIP)
4740         PerlIO_printf(Perl_debug_log, "plus ");
4741     if (r->reganch & ROPT_IMPLICIT)
4742         PerlIO_printf(Perl_debug_log, "implicit ");
4743     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4744     if (r->reganch & ROPT_EVAL_SEEN)
4745         PerlIO_printf(Perl_debug_log, "with eval ");
4746     PerlIO_printf(Perl_debug_log, "\n");
4747     if (r->offsets) {
4748       U32 i;
4749       const U32 len = r->offsets[0];
4750       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4751       for (i = 1; i <= len; i++)
4752         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4753                       (UV)r->offsets[i*2-1], 
4754                       (UV)r->offsets[i*2]);
4755       PerlIO_printf(Perl_debug_log, "\n");
4756     }
4757 #endif  /* DEBUGGING */
4758 }
4759
4760 #ifdef DEBUGGING
4761
4762 STATIC void
4763 S_put_byte(pTHX_ SV *sv, int c)
4764 {
4765     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4766         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4767     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4768         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4769     else
4770         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4771 }
4772
4773 #endif  /* DEBUGGING */
4774
4775 /*
4776 - regprop - printable representation of opcode
4777 */
4778 void
4779 Perl_regprop(pTHX_ SV *sv, regnode *o)
4780 {
4781 #ifdef DEBUGGING
4782     register int k;
4783
4784     sv_setpvn(sv, "", 0);
4785     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4786         /* It would be nice to FAIL() here, but this may be called from
4787            regexec.c, and it would be hard to supply pRExC_state. */
4788         Perl_croak(aTHX_ "Corrupted regexp opcode");
4789     sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4790
4791     k = PL_regkind[(U8)OP(o)];
4792
4793     if (k == EXACT) {
4794         SV *dsv = sv_2mortal(newSVpvn("", 0));
4795         /* Using is_utf8_string() is a crude hack but it may
4796          * be the best for now since we have no flag "this EXACTish
4797          * node was UTF-8" --jhi */
4798         const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4799         const char *s = do_utf8 ?
4800           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4801                          UNI_DISPLAY_REGEX) :
4802           STRING(o);
4803         const int len = do_utf8 ?
4804           strlen(s) :
4805           STR_LEN(o);
4806         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4807                        PL_colors[0],
4808                        len, s,
4809                        PL_colors[1]);
4810     }
4811     else if (k == CURLY) {
4812         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4813             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4814         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4815     }
4816     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4817         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4818     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4819         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4820     else if (k == LOGICAL)
4821         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4822     else if (k == ANYOF) {
4823         int i, rangestart = -1;
4824         U8 flags = ANYOF_FLAGS(o);
4825         const char * const anyofs[] = { /* Should be synchronized with
4826                                          * ANYOF_ #xdefines in regcomp.h */
4827             "\\w",
4828             "\\W",
4829             "\\s",
4830             "\\S",
4831             "\\d",
4832             "\\D",
4833             "[:alnum:]",
4834             "[:^alnum:]",
4835             "[:alpha:]",
4836             "[:^alpha:]",
4837             "[:ascii:]",
4838             "[:^ascii:]",
4839             "[:ctrl:]",
4840             "[:^ctrl:]",
4841             "[:graph:]",
4842             "[:^graph:]",
4843             "[:lower:]",
4844             "[:^lower:]",
4845             "[:print:]",
4846             "[:^print:]",
4847             "[:punct:]",
4848             "[:^punct:]",
4849             "[:upper:]",
4850             "[:^upper:]",
4851             "[:xdigit:]",
4852             "[:^xdigit:]",
4853             "[:space:]",
4854             "[:^space:]",
4855             "[:blank:]",
4856             "[:^blank:]"
4857         };
4858
4859         if (flags & ANYOF_LOCALE)
4860             sv_catpv(sv, "{loc}");
4861         if (flags & ANYOF_FOLD)
4862             sv_catpv(sv, "{i}");
4863         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4864         if (flags & ANYOF_INVERT)
4865             sv_catpv(sv, "^");
4866         for (i = 0; i <= 256; i++) {
4867             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4868                 if (rangestart == -1)
4869                     rangestart = i;
4870             } else if (rangestart != -1) {
4871                 if (i <= rangestart + 3)
4872                     for (; rangestart < i; rangestart++)
4873                         put_byte(sv, rangestart);
4874                 else {
4875                     put_byte(sv, rangestart);
4876                     sv_catpv(sv, "-");
4877                     put_byte(sv, i - 1);
4878                 }
4879                 rangestart = -1;
4880             }
4881         }
4882
4883         if (o->flags & ANYOF_CLASS)
4884             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4885                 if (ANYOF_CLASS_TEST(o,i))
4886                     sv_catpv(sv, anyofs[i]);
4887
4888         if (flags & ANYOF_UNICODE)
4889             sv_catpv(sv, "{unicode}");
4890         else if (flags & ANYOF_UNICODE_ALL)
4891             sv_catpv(sv, "{unicode_all}");
4892
4893         {
4894             SV *lv;
4895             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4896         
4897             if (lv) {
4898                 if (sw) {
4899                     U8 s[UTF8_MAXBYTES_CASE+1];
4900                 
4901                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4902                         uvchr_to_utf8(s, i);
4903                         
4904                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4905                             if (rangestart == -1)
4906                                 rangestart = i;
4907                         } else if (rangestart != -1) {
4908                             U8 *p;
4909                         
4910                             if (i <= rangestart + 3)
4911                                 for (; rangestart < i; rangestart++) {
4912                                     U8 *e;
4913                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4914                                         put_byte(sv, *p);
4915                                 }
4916                             else {
4917                                 U8 *e;
4918                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4919                                     put_byte(sv, *p);
4920                                 sv_catpv(sv, "-");
4921                                 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4922                                     put_byte(sv, *p);
4923                                 }
4924                                 rangestart = -1;
4925                             }
4926                         }
4927                         
4928                     sv_catpv(sv, "..."); /* et cetera */
4929                 }
4930
4931                 {
4932                     char *s = savesvpv(lv);
4933                     char *origs = s;
4934                 
4935                     while(*s && *s != '\n') s++;
4936                 
4937                     if (*s == '\n') {
4938                         const char *t = ++s;
4939                         
4940                         while (*s) {
4941                             if (*s == '\n')
4942                                 *s = ' ';
4943                             s++;
4944                         }
4945                         if (s[-1] == ' ')
4946                             s[-1] = 0;
4947                         
4948                         sv_catpv(sv, t);
4949                     }
4950                 
4951                     Safefree(origs);
4952                 }
4953             }
4954         }
4955
4956         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4957     }
4958     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4959         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4960 #endif  /* DEBUGGING */
4961 }
4962
4963 SV *
4964 Perl_re_intuit_string(pTHX_ regexp *prog)
4965 {                               /* Assume that RE_INTUIT is set */
4966     DEBUG_r(
4967         {
4968             const char *s = SvPV_nolen_const(prog->check_substr
4969                       ? prog->check_substr : prog->check_utf8);
4970
4971             if (!PL_colorset) reginitcolors();
4972             PerlIO_printf(Perl_debug_log,
4973                       "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4974                       PL_colors[4],
4975                       prog->check_substr ? "" : "utf8 ",
4976                       PL_colors[5],PL_colors[0],
4977                       s,
4978                       PL_colors[1],
4979                       (strlen(s) > 60 ? "..." : ""));
4980         } );
4981
4982     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4983 }
4984
4985 void
4986 Perl_pregfree(pTHX_ struct regexp *r)
4987 {
4988 #ifdef DEBUGGING
4989     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4990 #endif
4991
4992     if (!r || (--r->refcnt > 0))
4993         return;
4994     DEBUG_r({
4995         const char *s = (r->reganch & ROPT_UTF8)
4996             ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4997             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4998         const int len = SvCUR(dsv);
4999          if (!PL_colorset)
5000               reginitcolors();
5001          PerlIO_printf(Perl_debug_log,
5002                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
5003                        PL_colors[4],PL_colors[5],PL_colors[0],
5004                        len, len, s,
5005                        PL_colors[1],
5006                        len > 60 ? "..." : "");
5007     });
5008
5009     if (r->precomp)
5010         Safefree(r->precomp);
5011     if (r->offsets)             /* 20010421 MJD */
5012         Safefree(r->offsets);
5013     if (RX_MATCH_COPIED(r))
5014         Safefree(r->subbeg);
5015     if (r->substrs) {
5016         if (r->anchored_substr)
5017             SvREFCNT_dec(r->anchored_substr);
5018         if (r->anchored_utf8)
5019             SvREFCNT_dec(r->anchored_utf8);
5020         if (r->float_substr)
5021             SvREFCNT_dec(r->float_substr);
5022         if (r->float_utf8)
5023             SvREFCNT_dec(r->float_utf8);
5024         Safefree(r->substrs);
5025     }
5026     if (r->data) {
5027         int n = r->data->count;
5028         PAD* new_comppad = NULL;
5029         PAD* old_comppad;
5030         PADOFFSET refcnt;
5031
5032         while (--n >= 0) {
5033           /* If you add a ->what type here, update the comment in regcomp.h */
5034             switch (r->data->what[n]) {
5035             case 's':
5036                 SvREFCNT_dec((SV*)r->data->data[n]);
5037                 break;
5038             case 'f':
5039                 Safefree(r->data->data[n]);
5040                 break;
5041             case 'p':
5042                 new_comppad = (AV*)r->data->data[n];
5043                 break;
5044             case 'o':
5045                 if (new_comppad == NULL)
5046                     Perl_croak(aTHX_ "panic: pregfree comppad");
5047                 PAD_SAVE_LOCAL(old_comppad,
5048                     /* Watch out for global destruction's random ordering. */
5049                     (SvTYPE(new_comppad) == SVt_PVAV) ?
5050                                 new_comppad : Null(PAD *)
5051                 );
5052                 OP_REFCNT_LOCK;
5053                 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
5054                 OP_REFCNT_UNLOCK;
5055                 if (!refcnt)
5056                     op_free((OP_4tree*)r->data->data[n]);
5057
5058                 PAD_RESTORE_LOCAL(old_comppad);
5059                 SvREFCNT_dec((SV*)new_comppad);
5060                 new_comppad = NULL;
5061                 break;
5062             case 'n':
5063                 break;
5064             default:
5065                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
5066             }
5067         }
5068         Safefree(r->data->what);
5069         Safefree(r->data);
5070     }
5071     Safefree(r->startp);
5072     Safefree(r->endp);
5073     Safefree(r);
5074 }
5075
5076 /*
5077  - regnext - dig the "next" pointer out of a node
5078  *
5079  * [Note, when REGALIGN is defined there are two places in regmatch()
5080  * that bypass this code for speed.]
5081  */
5082 regnode *
5083 Perl_regnext(pTHX_ register regnode *p)
5084 {
5085     register I32 offset;
5086
5087     if (p == &PL_regdummy)
5088         return(NULL);
5089
5090     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5091     if (offset == 0)
5092         return(NULL);
5093
5094     return(p+offset);
5095 }
5096
5097 STATIC void     
5098 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5099 {
5100     va_list args;
5101     STRLEN l1 = strlen(pat1);
5102     STRLEN l2 = strlen(pat2);
5103     char buf[512];
5104     SV *msv;
5105     const char *message;
5106
5107     if (l1 > 510)
5108         l1 = 510;
5109     if (l1 + l2 > 510)
5110         l2 = 510 - l1;
5111     Copy(pat1, buf, l1 , char);
5112     Copy(pat2, buf + l1, l2 , char);
5113     buf[l1 + l2] = '\n';
5114     buf[l1 + l2 + 1] = '\0';
5115 #ifdef I_STDARG
5116     /* ANSI variant takes additional second argument */
5117     va_start(args, pat2);
5118 #else
5119     va_start(args);
5120 #endif
5121     msv = vmess(buf, &args);
5122     va_end(args);
5123     message = SvPV(msv,l1);
5124     if (l1 > 512)
5125         l1 = 512;
5126     Copy(message, buf, l1 , char);
5127     buf[l1-1] = '\0';                   /* Overwrite \n */
5128     Perl_croak(aTHX_ "%s", buf);
5129 }
5130
5131 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5132
5133 void
5134 Perl_save_re_context(pTHX)
5135 {
5136     SAVEI32(PL_reg_flags);              /* from regexec.c */
5137     SAVEPPTR(PL_bostr);
5138     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5139     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5140     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5141     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5142     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5143     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5144     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5145     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5146     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5147     PL_reg_start_tmp = 0;
5148     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5149     PL_reg_start_tmpl = 0;
5150     SAVEVPTR(PL_regdata);
5151     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5152     SAVEI32(PL_regnarrate);             /* from regexec.c */
5153     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5154     SAVEINT(PL_regindent);              /* from regexec.c */
5155     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5156     SAVEVPTR(PL_curcop);
5157     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5158     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5159     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5160     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5161     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5162     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5163     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5164     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5165     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5166     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5167     PL_reg_oldsaved = Nullch;
5168     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5169     PL_reg_oldsavedlen = 0;
5170     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5171     PL_reg_maxiter = 0;
5172     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5173     PL_reg_leftiter = 0;
5174     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5175     PL_reg_poscache = Nullch;
5176     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5177     PL_reg_poscache_size = 0;
5178     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5179     SAVEI32(PL_regnpar);                /* () count. */
5180     SAVEI32(PL_regsize);                /* from regexec.c */
5181
5182     {
5183         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5184         REGEXP *rx;
5185
5186         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5187             U32 i;
5188             for (i = 1; i <= rx->nparens; i++) {
5189                 GV *mgv;
5190                 char digits[TYPE_CHARS(long)];
5191                 sprintf(digits, "%lu", (long)i);
5192                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5193                     save_scalar(mgv);
5194             }
5195         }
5196     }
5197
5198 #ifdef DEBUGGING
5199     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5200 #endif
5201 }
5202
5203 static void
5204 clear_re(pTHX_ void *r)
5205 {
5206     ReREFCNT_dec((regexp *)r);
5207 }
5208
5209 /*
5210  * Local variables:
5211  * c-indentation-style: bsd
5212  * c-basic-offset: 4
5213  * indent-tabs-mode: t
5214  * End:
5215  *
5216  * ex: set ts=8 sts=4 sw=4 noet:
5217  */