This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: [PATCH] Annoyingly unhelpful messages from lib/File/Temp/t/security.t
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73  ****    2000, 2001, 2002, 2003, by Larry Wall and others
74  ****
75  ****    You may distribute under the terms of either the GNU General Public
76  ****    License or the Artistic License, as specified in the README file.
77
78  *
79  * Beware that some of this code is subtly aware of the way operator
80  * precedence is structured in regular expressions.  Serious changes in
81  * regular-expression syntax might require a total rethink.
82  */
83 #include "EXTERN.h"
84 #define PERL_IN_REGCOMP_C
85 #include "perl.h"
86
87 #ifndef PERL_IN_XSUB_RE
88 #  include "INTERN.h"
89 #endif
90
91 #define REG_COMP_C
92 #include "regcomp.h"
93
94 #ifdef op
95 #undef op
96 #endif /* op */
97
98 #ifdef MSDOS
99 # if defined(BUGGY_MSC6)
100  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101  # pragma optimize("a",off)
102  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103  # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
105 #endif /* MSDOS */
106
107 #ifndef STATIC
108 #define STATIC  static
109 #endif
110
111 typedef struct RExC_state_t {
112     U32         flags;                  /* are we folding, multilining? */
113     char        *precomp;               /* uncompiled string. */
114     regexp      *rx;
115     char        *start;                 /* Start of input for compile */
116     char        *end;                   /* End of input for compile */
117     char        *parse;                 /* Input-scan pointer. */
118     I32         whilem_seen;            /* number of WHILEM in this expr */
119     regnode     *emit_start;            /* Start of emitted-code area */
120     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
121     I32         naughty;                /* How bad is this pattern? */
122     I32         sawback;                /* Did we see \1, ...? */
123     U32         seen;
124     I32         size;                   /* Code size. */
125     I32         npar;                   /* () count. */
126     I32         extralen;
127     I32         seen_zerolen;
128     I32         seen_evals;
129     I32         utf8;
130 #if ADD_TO_REGEXEC
131     char        *starttry;              /* -Dr: where regtry was called. */
132 #define RExC_starttry   (pRExC_state->starttry)
133 #endif
134 } RExC_state_t;
135
136 #define RExC_flags      (pRExC_state->flags)
137 #define RExC_precomp    (pRExC_state->precomp)
138 #define RExC_rx         (pRExC_state->rx)
139 #define RExC_start      (pRExC_state->start)
140 #define RExC_end        (pRExC_state->end)
141 #define RExC_parse      (pRExC_state->parse)
142 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
143 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit       (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty    (pRExC_state->naughty)
147 #define RExC_sawback    (pRExC_state->sawback)
148 #define RExC_seen       (pRExC_state->seen)
149 #define RExC_size       (pRExC_state->size)
150 #define RExC_npar       (pRExC_state->npar)
151 #define RExC_extralen   (pRExC_state->extralen)
152 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8       (pRExC_state->utf8)
155
156 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158         ((*s) == '{' && regcurly(s)))
159
160 #ifdef SPSTART
161 #undef SPSTART          /* dratted cpp namespace... */
162 #endif
163 /*
164  * Flags to be passed up and down.
165  */
166 #define WORST           0       /* Worst case. */
167 #define HASWIDTH        0x1     /* Known to match non-null strings. */
168 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART         0x4     /* Starts with * or +. */
170 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
171
172 /* Length of a variant. */
173
174 typedef struct scan_data_t {
175     I32 len_min;
176     I32 len_delta;
177     I32 pos_min;
178     I32 pos_delta;
179     SV *last_found;
180     I32 last_end;                       /* min value, <0 unless valid. */
181     I32 last_start_min;
182     I32 last_start_max;
183     SV **longest;                       /* Either &l_fixed, or &l_float. */
184     SV *longest_fixed;
185     I32 offset_fixed;
186     SV *longest_float;
187     I32 offset_float_min;
188     I32 offset_float_max;
189     I32 flags;
190     I32 whilem_c;
191     I32 *last_closep;
192     struct regnode_charclass_class *start_class;
193 } scan_data_t;
194
195 /*
196  * Forward declarations for pregcomp()'s friends.
197  */
198
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200                                       0, 0, 0, 0, 0, 0};
201
202 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL          0x1
204 #define SF_BEFORE_MEOL          0x2
205 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
208 #ifdef NO_UNARY_PLUS
209 #  define SF_FIX_SHIFT_EOL      (0+2)
210 #  define SF_FL_SHIFT_EOL               (0+4)
211 #else
212 #  define SF_FIX_SHIFT_EOL      (+2)
213 #  define SF_FL_SHIFT_EOL               (+4)
214 #endif
215
216 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF               0x40
222 #define SF_HAS_PAR              0x80
223 #define SF_IN_PAR               0x100
224 #define SF_HAS_EVAL             0x200
225 #define SCF_DO_SUBSTR           0x400
226 #define SCF_DO_STCLASS_AND      0x0800
227 #define SCF_DO_STCLASS_OR       0x1000
228 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS  0x2000
230
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234
235 #define OOB_UNICODE             12345678
236 #define OOB_NAMEDCLASS          -1
237
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
241
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
244
245 /*
246  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248  * op/pragma/warn/regcomp.
249  */
250 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
252
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
254
255 /*
256  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257  * arg. Show regex, up to a maximum length. If it's too long, chop and add
258  * "...".
259  */
260 #define FAIL(msg) STMT_START {                                          \
261     char *ellipses = "";                                                \
262     IV len = RExC_end - RExC_precomp;                                   \
263                                                                         \
264     if (!SIZE_ONLY)                                                     \
265         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
266     if (len > RegexLengthToShowInErrorMessages) {                       \
267         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
268         len = RegexLengthToShowInErrorMessages - 10;                    \
269         ellipses = "...";                                               \
270     }                                                                   \
271     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
272             msg, (int)len, RExC_precomp, ellipses);                     \
273 } STMT_END
274
275 /*
276  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277  * args. Show regex, up to a maximum length. If it's too long, chop and add
278  * "...".
279  */
280 #define FAIL2(pat,msg) STMT_START {                                     \
281     char *ellipses = "";                                                \
282     IV len = RExC_end - RExC_precomp;                                   \
283                                                                         \
284     if (!SIZE_ONLY)                                                     \
285         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
286     if (len > RegexLengthToShowInErrorMessages) {                       \
287         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
288         len = RegexLengthToShowInErrorMessages - 10;                    \
289         ellipses = "...";                                               \
290     }                                                                   \
291     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
292             msg, (int)len, RExC_precomp, ellipses);                     \
293 } STMT_END
294
295
296 /*
297  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298  */
299 #define Simple_vFAIL(m) STMT_START {                                    \
300     IV offset = RExC_parse - RExC_precomp;                              \
301     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
302             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
303 } STMT_END
304
305 /*
306  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307  */
308 #define vFAIL(m) STMT_START {                           \
309     if (!SIZE_ONLY)                                     \
310         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
311     Simple_vFAIL(m);                                    \
312 } STMT_END
313
314 /*
315  * Like Simple_vFAIL(), but accepts two arguments.
316  */
317 #define Simple_vFAIL2(m,a1) STMT_START {                        \
318     IV offset = RExC_parse - RExC_precomp;                      \
319     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
320             (int)offset, RExC_precomp, RExC_precomp + offset);  \
321 } STMT_END
322
323 /*
324  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325  */
326 #define vFAIL2(m,a1) STMT_START {                       \
327     if (!SIZE_ONLY)                                     \
328         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
329     Simple_vFAIL2(m, a1);                               \
330 } STMT_END
331
332
333 /*
334  * Like Simple_vFAIL(), but accepts three arguments.
335  */
336 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
337     IV offset = RExC_parse - RExC_precomp;                      \
338     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
339             (int)offset, RExC_precomp, RExC_precomp + offset);  \
340 } STMT_END
341
342 /*
343  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344  */
345 #define vFAIL3(m,a1,a2) STMT_START {                    \
346     if (!SIZE_ONLY)                                     \
347         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
348     Simple_vFAIL3(m, a1, a2);                           \
349 } STMT_END
350
351 /*
352  * Like Simple_vFAIL(), but accepts four arguments.
353  */
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
355     IV offset = RExC_parse - RExC_precomp;                      \
356     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
357             (int)offset, RExC_precomp, RExC_precomp + offset);  \
358 } STMT_END
359
360 /*
361  * Like Simple_vFAIL(), but accepts five arguments.
362  */
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
364     IV offset = RExC_parse - RExC_precomp;                      \
365     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
366             (int)offset, RExC_precomp, RExC_precomp + offset);  \
367 } STMT_END
368
369
370 #define vWARN(loc,m) STMT_START {                                       \
371     IV offset = loc - RExC_precomp;                                     \
372     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
373             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
374 } STMT_END
375
376 #define vWARNdep(loc,m) STMT_START {                                    \
377     IV offset = loc - RExC_precomp;                                     \
378     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
379             "%s" REPORT_LOCATION,                                       \
380             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
381 } STMT_END
382
383
384 #define vWARN2(loc, m, a1) STMT_START {                                 \
385     IV offset = loc - RExC_precomp;                                     \
386     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
387             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
388 } STMT_END
389
390 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
391     IV offset = loc - RExC_precomp;                                     \
392     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
393             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
394 } STMT_END
395
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
397     IV offset = loc - RExC_precomp;                                     \
398     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
399             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 } STMT_END
401
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
403     IV offset = loc - RExC_precomp;                                     \
404     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
405             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 } STMT_END
407
408
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START {                  \
411     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412 } STMT_END
413
414 /* Macros for recording node offsets.   20001227 mjd@plover.com 
415  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
416  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
417  * Element 0 holds the number n.
418  */
419
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
425     if (! SIZE_ONLY) {                                                  \
426         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
427                 __LINE__, (node), (byte)));                             \
428         if((node) < 0) {                                                \
429             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430         } else {                                                        \
431             RExC_offsets[2*(node)-1] = (byte);                          \
432         }                                                               \
433     }                                                                   \
434 } STMT_END
435
436 #define Set_Node_Offset(node,byte) \
437     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
441     if (! SIZE_ONLY) {                                                  \
442         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
443                 __LINE__, (node), (len)));                              \
444         if((node) < 0) {                                                \
445             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446         } else {                                                        \
447             RExC_offsets[2*(node)] = (len);                             \
448         }                                                               \
449     }                                                                   \
450 } STMT_END
451
452 #define Set_Node_Length(node,len) \
453     Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456     Set_Node_Length(node, RExC_parse - parse_start)
457
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
462 static void clear_re(pTHX_ void *r);
463
464 /* Mark that we cannot extend a found fixed substring at this point.
465    Updata the longest found anchored substring and the longest found
466    floating substrings if needed. */
467
468 STATIC void
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 {
471     STRLEN l = CHR_SVLEN(data->last_found);
472     STRLEN old_l = CHR_SVLEN(*data->longest);
473
474     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475         sv_setsv(*data->longest, data->last_found);
476         if (*data->longest == data->longest_fixed) {
477             data->offset_fixed = l ? data->last_start_min : data->pos_min;
478             if (data->flags & SF_BEFORE_EOL)
479                 data->flags
480                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481             else
482                 data->flags &= ~SF_FIX_BEFORE_EOL;
483         }
484         else {
485             data->offset_float_min = l ? data->last_start_min : data->pos_min;
486             data->offset_float_max = (l
487                                       ? data->last_start_max
488                                       : data->pos_min + data->pos_delta);
489             if ((U32)data->offset_float_max > (U32)I32_MAX)
490                 data->offset_float_max = I32_MAX;
491             if (data->flags & SF_BEFORE_EOL)
492                 data->flags
493                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494             else
495                 data->flags &= ~SF_FL_BEFORE_EOL;
496         }
497     }
498     SvCUR_set(data->last_found, 0);
499     {
500         SV * sv = data->last_found;
501         MAGIC *mg =
502             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503         if (mg && mg->mg_len > 0)
504             mg->mg_len = 0;
505     }
506     data->last_end = -1;
507     data->flags &= ~SF_BEFORE_EOL;
508 }
509
510 /* Can match anything (initialization) */
511 STATIC void
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 {
514     ANYOF_CLASS_ZERO(cl);
515     ANYOF_BITMAP_SETALL(cl);
516     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517     if (LOC)
518         cl->flags |= ANYOF_LOCALE;
519 }
520
521 /* Can match anything (initialization) */
522 STATIC int
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524 {
525     int value;
526
527     for (value = 0; value <= ANYOF_MAX; value += 2)
528         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529             return 1;
530     if (!(cl->flags & ANYOF_UNICODE_ALL))
531         return 0;
532     if (!ANYOF_BITMAP_TESTALLSET(cl))
533         return 0;
534     return 1;
535 }
536
537 /* Can match anything (initialization) */
538 STATIC void
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544 }
545
546 STATIC void
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 {
549     Zero(cl, 1, struct regnode_charclass_class);
550     cl->type = ANYOF;
551     cl_anything(pRExC_state, cl);
552     if (LOC)
553         cl->flags |= ANYOF_LOCALE;
554 }
555
556 /* 'And' a given class with another one.  Can create false positives */
557 /* We assume that cl is not inverted */
558 STATIC void
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560          struct regnode_charclass_class *and_with)
561 {
562     if (!(and_with->flags & ANYOF_CLASS)
563         && !(cl->flags & ANYOF_CLASS)
564         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565         && !(and_with->flags & ANYOF_FOLD)
566         && !(cl->flags & ANYOF_FOLD)) {
567         int i;
568
569         if (and_with->flags & ANYOF_INVERT)
570             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571                 cl->bitmap[i] &= ~and_with->bitmap[i];
572         else
573             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574                 cl->bitmap[i] &= and_with->bitmap[i];
575     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576     if (!(and_with->flags & ANYOF_EOS))
577         cl->flags &= ~ANYOF_EOS;
578
579     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580         !(and_with->flags & ANYOF_INVERT)) {
581         cl->flags &= ~ANYOF_UNICODE_ALL;
582         cl->flags |= ANYOF_UNICODE;
583         ARG_SET(cl, ARG(and_with));
584     }
585     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586         !(and_with->flags & ANYOF_INVERT))
587         cl->flags &= ~ANYOF_UNICODE_ALL;
588     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589         !(and_with->flags & ANYOF_INVERT))
590         cl->flags &= ~ANYOF_UNICODE;
591 }
592
593 /* 'OR' a given class with another one.  Can create false positives */
594 /* We assume that cl is not inverted */
595 STATIC void
596 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
597 {
598     if (or_with->flags & ANYOF_INVERT) {
599         /* We do not use
600          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601          *   <= (B1 | !B2) | (CL1 | !CL2)
602          * which is wasteful if CL2 is small, but we ignore CL2:
603          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604          * XXXX Can we handle case-fold?  Unclear:
605          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607          */
608         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609              && !(or_with->flags & ANYOF_FOLD)
610              && !(cl->flags & ANYOF_FOLD) ) {
611             int i;
612
613             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614                 cl->bitmap[i] |= ~or_with->bitmap[i];
615         } /* XXXX: logic is complicated otherwise */
616         else {
617             cl_anything(pRExC_state, cl);
618         }
619     } else {
620         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622              && (!(or_with->flags & ANYOF_FOLD)
623                  || (cl->flags & ANYOF_FOLD)) ) {
624             int i;
625
626             /* OR char bitmap and class bitmap separately */
627             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628                 cl->bitmap[i] |= or_with->bitmap[i];
629             if (or_with->flags & ANYOF_CLASS) {
630                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631                     cl->classflags[i] |= or_with->classflags[i];
632                 cl->flags |= ANYOF_CLASS;
633             }
634         }
635         else { /* XXXX: logic is complicated, leave it along for a moment. */
636             cl_anything(pRExC_state, cl);
637         }
638     }
639     if (or_with->flags & ANYOF_EOS)
640         cl->flags |= ANYOF_EOS;
641
642     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643         ARG(cl) != ARG(or_with)) {
644         cl->flags |= ANYOF_UNICODE_ALL;
645         cl->flags &= ~ANYOF_UNICODE;
646     }
647     if (or_with->flags & ANYOF_UNICODE_ALL) {
648         cl->flags |= ANYOF_UNICODE_ALL;
649         cl->flags &= ~ANYOF_UNICODE;
650     }
651 }
652
653 /*
654  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655  * These need to be revisited when a newer toolchain becomes available.
656  */
657 #if defined(__sparc64__) && defined(__GNUC__)
658 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659 #       undef  SPARC64_GCC_WORKAROUND
660 #       define SPARC64_GCC_WORKAROUND 1
661 #   endif
662 #endif
663
664 /* REx optimizer.  Converts nodes into quickier variants "in place".
665    Finds fixed substrings.  */
666
667 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668    to the position after last scanned or to NULL. */
669
670 STATIC I32
671 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
672                         /* scanp: Start here (read-write). */
673                         /* deltap: Write maxlen-minlen here. */
674                         /* last: Stop before this one. */
675 {
676     I32 min = 0, pars = 0, code;
677     regnode *scan = *scanp, *next;
678     I32 delta = 0;
679     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
680     int is_inf_internal = 0;            /* The studied chunk is infinite */
681     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682     scan_data_t data_fake;
683     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
684
685     while (scan && OP(scan) != END && scan < last) {
686         /* Peephole optimizer: */
687
688         if (PL_regkind[(U8)OP(scan)] == EXACT) {
689             /* Merge several consecutive EXACTish nodes into one. */
690             regnode *n = regnext(scan);
691             U32 stringok = 1;
692 #ifdef DEBUGGING
693             regnode *stop = scan;
694 #endif
695
696             next = scan + NODE_SZ_STR(scan);
697             /* Skip NOTHING, merge EXACT*. */
698             while (n &&
699                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
700                      (stringok && (OP(n) == OP(scan))))
701                    && NEXT_OFF(n)
702                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703                 if (OP(n) == TAIL || n > next)
704                     stringok = 0;
705                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
706                     NEXT_OFF(scan) += NEXT_OFF(n);
707                     next = n + NODE_STEP_REGNODE;
708 #ifdef DEBUGGING
709                     if (stringok)
710                         stop = n;
711 #endif
712                     n = regnext(n);
713                 }
714                 else if (stringok) {
715                     int oldl = STR_LEN(scan);
716                     regnode *nnext = regnext(n);
717
718                     if (oldl + STR_LEN(n) > U8_MAX)
719                         break;
720                     NEXT_OFF(scan) += NEXT_OFF(n);
721                     STR_LEN(scan) += STR_LEN(n);
722                     next = n + NODE_SZ_STR(n);
723                     /* Now we can overwrite *n : */
724                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
725 #ifdef DEBUGGING
726                     stop = next - 1;
727 #endif
728                     n = nnext;
729                 }
730             }
731
732             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
733 /*
734   Two problematic code points in Unicode casefolding of EXACT nodes:
735
736    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
738
739    which casefold to
740
741    Unicode                      UTF-8
742
743    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
744    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
745
746    This means that in case-insensitive matching (or "loose matching",
747    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748    length of the above casefolded versions) can match a target string
749    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750    This would rather mess up the minimum length computation.
751
752    What we'll do is to look for the tail four bytes, and then peek
753    at the preceding two bytes to see whether we need to decrease
754    the minimum length by four (six minus two).
755
756    Thanks to the design of UTF-8, there cannot be false matches:
757    A sequence of valid UTF-8 bytes cannot be a subsequence of
758    another valid sequence of UTF-8 bytes.
759
760 */
761                  char *s0 = STRING(scan), *s, *t;
762                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763                  char *t0 = "\xcc\x88\xcc\x81";
764                  char *t1 = t0 + 3;
765                  
766                  for (s = s0 + 2;
767                       s < s2 && (t = ninstr(s, s1, t0, t1));
768                       s = t + 4) {
769                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
771                            min -= 4;
772                  }
773             }
774
775 #ifdef DEBUGGING
776             /* Allow dumping */
777             n = scan + NODE_SZ_STR(scan);
778             while (n <= stop) {
779                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
780                     OP(n) = OPTIMIZED;
781                     NEXT_OFF(n) = 0;
782                 }
783                 n++;
784             }
785 #endif
786         }
787         /* Follow the next-chain of the current node and optimize
788            away all the NOTHINGs from it.  */
789         if (OP(scan) != CURLYX) {
790             int max = (reg_off_by_arg[OP(scan)]
791                        ? I32_MAX
792                        /* I32 may be smaller than U16 on CRAYs! */
793                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
794             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
795             int noff;
796             regnode *n = scan;
797         
798             /* Skip NOTHING and LONGJMP. */
799             while ((n = regnext(n))
800                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
801                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
802                    && off + noff < max)
803                 off += noff;
804             if (reg_off_by_arg[OP(scan)])
805                 ARG(scan) = off;
806             else
807                 NEXT_OFF(scan) = off;
808         }
809         /* The principal pseudo-switch.  Cannot be a switch, since we
810            look into several different things.  */
811         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
812                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813             next = regnext(scan);
814             code = OP(scan);
815         
816             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
817                 I32 max1 = 0, min1 = I32_MAX, num = 0;
818                 struct regnode_charclass_class accum;
819                 
820                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
821                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
822                 if (flags & SCF_DO_STCLASS)
823                     cl_init_zero(pRExC_state, &accum);
824                 while (OP(scan) == code) {
825                     I32 deltanext, minnext, f = 0, fake;
826                     struct regnode_charclass_class this_class;
827
828                     num++;
829                     data_fake.flags = 0;
830                     if (data) {         
831                         data_fake.whilem_c = data->whilem_c;
832                         data_fake.last_closep = data->last_closep;
833                     }
834                     else
835                         data_fake.last_closep = &fake;
836                     next = regnext(scan);
837                     scan = NEXTOPER(scan);
838                     if (code != BRANCH)
839                         scan = NEXTOPER(scan);
840                     if (flags & SCF_DO_STCLASS) {
841                         cl_init(pRExC_state, &this_class);
842                         data_fake.start_class = &this_class;
843                         f = SCF_DO_STCLASS_AND;
844                     }           
845                     if (flags & SCF_WHILEM_VISITED_POS)
846                         f |= SCF_WHILEM_VISITED_POS;
847                     /* we suppose the run is continuous, last=next...*/
848                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
849                                           next, &data_fake, f);
850                     if (min1 > minnext)
851                         min1 = minnext;
852                     if (max1 < minnext + deltanext)
853                         max1 = minnext + deltanext;
854                     if (deltanext == I32_MAX)
855                         is_inf = is_inf_internal = 1;
856                     scan = next;
857                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
858                         pars++;
859                     if (data && (data_fake.flags & SF_HAS_EVAL))
860                         data->flags |= SF_HAS_EVAL;
861                     if (data)
862                         data->whilem_c = data_fake.whilem_c;
863                     if (flags & SCF_DO_STCLASS)
864                         cl_or(pRExC_state, &accum, &this_class);
865                     if (code == SUSPEND)
866                         break;
867                 }
868                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
869                     min1 = 0;
870                 if (flags & SCF_DO_SUBSTR) {
871                     data->pos_min += min1;
872                     data->pos_delta += max1 - min1;
873                     if (max1 != min1 || is_inf)
874                         data->longest = &(data->longest_float);
875                 }
876                 min += min1;
877                 delta += max1 - min1;
878                 if (flags & SCF_DO_STCLASS_OR) {
879                     cl_or(pRExC_state, data->start_class, &accum);
880                     if (min1) {
881                         cl_and(data->start_class, &and_with);
882                         flags &= ~SCF_DO_STCLASS;
883                     }
884                 }
885                 else if (flags & SCF_DO_STCLASS_AND) {
886                     if (min1) {
887                         cl_and(data->start_class, &accum);
888                         flags &= ~SCF_DO_STCLASS;
889                     }
890                     else {
891                         /* Switch to OR mode: cache the old value of
892                          * data->start_class */
893                         StructCopy(data->start_class, &and_with,
894                                    struct regnode_charclass_class);
895                         flags &= ~SCF_DO_STCLASS_AND;
896                         StructCopy(&accum, data->start_class,
897                                    struct regnode_charclass_class);
898                         flags |= SCF_DO_STCLASS_OR;
899                         data->start_class->flags |= ANYOF_EOS;
900                     }
901                 }
902             }
903             else if (code == BRANCHJ)   /* single branch is optimized. */
904                 scan = NEXTOPER(NEXTOPER(scan));
905             else                        /* single branch is optimized. */
906                 scan = NEXTOPER(scan);
907             continue;
908         }
909         else if (OP(scan) == EXACT) {
910             I32 l = STR_LEN(scan);
911             UV uc = *((U8*)STRING(scan));
912             if (UTF) {
913                 U8 *s = (U8*)STRING(scan);
914                 l = utf8_length(s, s + l);
915                 uc = utf8_to_uvchr(s, NULL);
916             }
917             min += l;
918             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919                 /* The code below prefers earlier match for fixed
920                    offset, later match for variable offset.  */
921                 if (data->last_end == -1) { /* Update the start info. */
922                     data->last_start_min = data->pos_min;
923                     data->last_start_max = is_inf
924                         ? I32_MAX : data->pos_min + data->pos_delta;
925                 }
926                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
927                 {
928                     SV * sv = data->last_found;
929                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
931                     if (mg && mg->mg_len >= 0)
932                         mg->mg_len += utf8_length((U8*)STRING(scan),
933                                                   (U8*)STRING(scan)+STR_LEN(scan));
934                 }
935                 if (UTF)
936                     SvUTF8_on(data->last_found);
937                 data->last_end = data->pos_min + l;
938                 data->pos_min += l; /* As in the first entry. */
939                 data->flags &= ~SF_BEFORE_EOL;
940             }
941             if (flags & SCF_DO_STCLASS_AND) {
942                 /* Check whether it is compatible with what we know already! */
943                 int compat = 1;
944
945                 if (uc >= 0x100 ||
946                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
948                     && (!(data->start_class->flags & ANYOF_FOLD)
949                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
950                     )
951                     compat = 0;
952                 ANYOF_CLASS_ZERO(data->start_class);
953                 ANYOF_BITMAP_ZERO(data->start_class);
954                 if (compat)
955                     ANYOF_BITMAP_SET(data->start_class, uc);
956                 data->start_class->flags &= ~ANYOF_EOS;
957                 if (uc < 0x100)
958                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959             }
960             else if (flags & SCF_DO_STCLASS_OR) {
961                 /* false positive possible if the class is case-folded */
962                 if (uc < 0x100)
963                     ANYOF_BITMAP_SET(data->start_class, uc);
964                 else
965                     data->start_class->flags |= ANYOF_UNICODE_ALL;
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 cl_and(data->start_class, &and_with);
968             }
969             flags &= ~SCF_DO_STCLASS;
970         }
971         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972             I32 l = STR_LEN(scan);
973             UV uc = *((U8*)STRING(scan));
974
975             /* Search for fixed substrings supports EXACT only. */
976             if (flags & SCF_DO_SUBSTR)
977                 scan_commit(pRExC_state, data);
978             if (UTF) {
979                 U8 *s = (U8 *)STRING(scan);
980                 l = utf8_length(s, s + l);
981                 uc = utf8_to_uvchr(s, NULL);
982             }
983             min += l;
984             if (data && (flags & SCF_DO_SUBSTR))
985                 data->pos_min += l;
986             if (flags & SCF_DO_STCLASS_AND) {
987                 /* Check whether it is compatible with what we know already! */
988                 int compat = 1;
989
990                 if (uc >= 0x100 ||
991                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
993                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994                     compat = 0;
995                 ANYOF_CLASS_ZERO(data->start_class);
996                 ANYOF_BITMAP_ZERO(data->start_class);
997                 if (compat) {
998                     ANYOF_BITMAP_SET(data->start_class, uc);
999                     data->start_class->flags &= ~ANYOF_EOS;
1000                     data->start_class->flags |= ANYOF_FOLD;
1001                     if (OP(scan) == EXACTFL)
1002                         data->start_class->flags |= ANYOF_LOCALE;
1003                 }
1004             }
1005             else if (flags & SCF_DO_STCLASS_OR) {
1006                 if (data->start_class->flags & ANYOF_FOLD) {
1007                     /* false positive possible if the class is case-folded.
1008                        Assume that the locale settings are the same... */
1009                     if (uc < 0x100)
1010                         ANYOF_BITMAP_SET(data->start_class, uc);
1011                     data->start_class->flags &= ~ANYOF_EOS;
1012                 }
1013                 cl_and(data->start_class, &and_with);
1014             }
1015             flags &= ~SCF_DO_STCLASS;
1016         }
1017         else if (strchr((char*)PL_varies,OP(scan))) {
1018             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019             I32 f = flags, pos_before = 0;
1020             regnode *oscan = scan;
1021             struct regnode_charclass_class this_class;
1022             struct regnode_charclass_class *oclass = NULL;
1023             I32 next_is_eval = 0;
1024
1025             switch (PL_regkind[(U8)OP(scan)]) {
1026             case WHILEM:                /* End of (?:...)* . */
1027                 scan = NEXTOPER(scan);
1028                 goto finish;
1029             case PLUS:
1030                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031                     next = NEXTOPER(scan);
1032                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033                         mincount = 1;
1034                         maxcount = REG_INFTY;
1035                         next = regnext(scan);
1036                         scan = NEXTOPER(scan);
1037                         goto do_curly;
1038                     }
1039                 }
1040                 if (flags & SCF_DO_SUBSTR)
1041                     data->pos_min++;
1042                 min++;
1043                 /* Fall through. */
1044             case STAR:
1045                 if (flags & SCF_DO_STCLASS) {
1046                     mincount = 0;
1047                     maxcount = REG_INFTY;
1048                     next = regnext(scan);
1049                     scan = NEXTOPER(scan);
1050                     goto do_curly;
1051                 }
1052                 is_inf = is_inf_internal = 1;
1053                 scan = regnext(scan);
1054                 if (flags & SCF_DO_SUBSTR) {
1055                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056                     data->longest = &(data->longest_float);
1057                 }
1058                 goto optimize_curly_tail;
1059             case CURLY:
1060                 mincount = ARG1(scan);
1061                 maxcount = ARG2(scan);
1062                 next = regnext(scan);
1063                 if (OP(scan) == CURLYX) {
1064                     I32 lp = (data ? *(data->last_closep) : 0);
1065
1066                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067                 }
1068                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069                 next_is_eval = (OP(scan) == EVAL);
1070               do_curly:
1071                 if (flags & SCF_DO_SUBSTR) {
1072                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073                     pos_before = data->pos_min;
1074                 }
1075                 if (data) {
1076                     fl = data->flags;
1077                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078                     if (is_inf)
1079                         data->flags |= SF_IS_INF;
1080                 }
1081                 if (flags & SCF_DO_STCLASS) {
1082                     cl_init(pRExC_state, &this_class);
1083                     oclass = data->start_class;
1084                     data->start_class = &this_class;
1085                     f |= SCF_DO_STCLASS_AND;
1086                     f &= ~SCF_DO_STCLASS_OR;
1087                 }
1088                 /* These are the cases when once a subexpression
1089                    fails at a particular position, it cannot succeed
1090                    even after backtracking at the enclosing scope.
1091                 
1092                    XXXX what if minimal match and we are at the
1093                         initial run of {n,m}? */
1094                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095                     f &= ~SCF_WHILEM_VISITED_POS;
1096
1097                 /* This will finish on WHILEM, setting scan, or on NULL: */
1098                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099                                       mincount == 0
1100                                         ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102                 if (flags & SCF_DO_STCLASS)
1103                     data->start_class = oclass;
1104                 if (mincount == 0 || minnext == 0) {
1105                     if (flags & SCF_DO_STCLASS_OR) {
1106                         cl_or(pRExC_state, data->start_class, &this_class);
1107                     }
1108                     else if (flags & SCF_DO_STCLASS_AND) {
1109                         /* Switch to OR mode: cache the old value of
1110                          * data->start_class */
1111                         StructCopy(data->start_class, &and_with,
1112                                    struct regnode_charclass_class);
1113                         flags &= ~SCF_DO_STCLASS_AND;
1114                         StructCopy(&this_class, data->start_class,
1115                                    struct regnode_charclass_class);
1116                         flags |= SCF_DO_STCLASS_OR;
1117                         data->start_class->flags |= ANYOF_EOS;
1118                     }
1119                 } else {                /* Non-zero len */
1120                     if (flags & SCF_DO_STCLASS_OR) {
1121                         cl_or(pRExC_state, data->start_class, &this_class);
1122                         cl_and(data->start_class, &and_with);
1123                     }
1124                     else if (flags & SCF_DO_STCLASS_AND)
1125                         cl_and(data->start_class, &this_class);
1126                     flags &= ~SCF_DO_STCLASS;
1127                 }
1128                 if (!scan)              /* It was not CURLYX, but CURLY. */
1129                     scan = next;
1130                 if (ckWARN(WARN_REGEXP)
1131                        /* ? quantifier ok, except for (?{ ... }) */
1132                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1133                     && (minnext == 0) && (deltanext == 0)
1134                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1135                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1136                 {
1137                     vWARN(RExC_parse,
1138                           "Quantifier unexpected on zero-length expression");
1139                 }
1140
1141                 min += minnext * mincount;
1142                 is_inf_internal |= ((maxcount == REG_INFTY
1143                                      && (minnext + deltanext) > 0)
1144                                     || deltanext == I32_MAX);
1145                 is_inf |= is_inf_internal;
1146                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148                 /* Try powerful optimization CURLYX => CURLYN. */
1149                 if (  OP(oscan) == CURLYX && data
1150                       && data->flags & SF_IN_PAR
1151                       && !(data->flags & SF_HAS_EVAL)
1152                       && !deltanext && minnext == 1 ) {
1153                     /* Try to optimize to CURLYN.  */
1154                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155                     regnode *nxt1 = nxt;
1156 #ifdef DEBUGGING
1157                     regnode *nxt2;
1158 #endif
1159
1160                     /* Skip open. */
1161                     nxt = regnext(nxt);
1162                     if (!strchr((char*)PL_simple,OP(nxt))
1163                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164                              && STR_LEN(nxt) == 1))
1165                         goto nogo;
1166 #ifdef DEBUGGING
1167                     nxt2 = nxt;
1168 #endif
1169                     nxt = regnext(nxt);
1170                     if (OP(nxt) != CLOSE)
1171                         goto nogo;
1172                     /* Now we know that nxt2 is the only contents: */
1173                     oscan->flags = (U8)ARG(nxt);
1174                     OP(oscan) = CURLYN;
1175                     OP(nxt1) = NOTHING; /* was OPEN. */
1176 #ifdef DEBUGGING
1177                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1181                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1182                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183 #endif
1184                 }
1185               nogo:
1186
1187                 /* Try optimization CURLYX => CURLYM. */
1188                 if (  OP(oscan) == CURLYX && data
1189                       && !(data->flags & SF_HAS_PAR)
1190                       && !(data->flags & SF_HAS_EVAL)
1191                       && !deltanext  ) {
1192                     /* XXXX How to optimize if data == 0? */
1193                     /* Optimize to a simpler form.  */
1194                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1195                     regnode *nxt2;
1196
1197                     OP(oscan) = CURLYM;
1198                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1199                             && (OP(nxt2) != WHILEM))
1200                         nxt = nxt2;
1201                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1202                     /* Need to optimize away parenths. */
1203                     if (data->flags & SF_IN_PAR) {
1204                         /* Set the parenth number.  */
1205                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1206
1207                         if (OP(nxt) != CLOSE)
1208                             FAIL("Panic opt close");
1209                         oscan->flags = (U8)ARG(nxt);
1210                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1211                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1212 #ifdef DEBUGGING
1213                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1214                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1215                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1216                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1217 #endif
1218 #if 0
1219                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1220                             regnode *nnxt = regnext(nxt1);
1221                         
1222                             if (nnxt == nxt) {
1223                                 if (reg_off_by_arg[OP(nxt1)])
1224                                     ARG_SET(nxt1, nxt2 - nxt1);
1225                                 else if (nxt2 - nxt1 < U16_MAX)
1226                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1227                                 else
1228                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1229                             }
1230                             nxt1 = nnxt;
1231                         }
1232 #endif
1233                         /* Optimize again: */
1234                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1235                                     NULL, 0);
1236                     }
1237                     else
1238                         oscan->flags = 0;
1239                 }
1240                 else if ((OP(oscan) == CURLYX)
1241                          && (flags & SCF_WHILEM_VISITED_POS)
1242                          /* See the comment on a similar expression above.
1243                             However, this time it not a subexpression
1244                             we care about, but the expression itself. */
1245                          && (maxcount == REG_INFTY)
1246                          && data && ++data->whilem_c < 16) {
1247                     /* This stays as CURLYX, we can put the count/of pair. */
1248                     /* Find WHILEM (as in regexec.c) */
1249                     regnode *nxt = oscan + NEXT_OFF(oscan);
1250
1251                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1252                         nxt += ARG(nxt);
1253                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1254                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1255                 }
1256                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1257                     pars++;
1258                 if (flags & SCF_DO_SUBSTR) {
1259                     SV *last_str = Nullsv;
1260                     int counted = mincount != 0;
1261
1262                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1263 #if defined(SPARC64_GCC_WORKAROUND)
1264                         I32 b = 0;
1265                         STRLEN l = 0;
1266                         char *s = NULL;
1267                         I32 old = 0;
1268
1269                         if (pos_before >= data->last_start_min)
1270                             b = pos_before;
1271                         else
1272                             b = data->last_start_min;
1273
1274                         l = 0;
1275                         s = SvPV(data->last_found, l);
1276                         old = b - data->last_start_min;
1277
1278 #else
1279                         I32 b = pos_before >= data->last_start_min
1280                             ? pos_before : data->last_start_min;
1281                         STRLEN l;
1282                         char *s = SvPV(data->last_found, l);
1283                         I32 old = b - data->last_start_min;
1284 #endif
1285
1286                         if (UTF)
1287                             old = utf8_hop((U8*)s, old) - (U8*)s;
1288                         
1289                         l -= old;
1290                         /* Get the added string: */
1291                         last_str = newSVpvn(s  + old, l);
1292                         if (UTF)
1293                             SvUTF8_on(last_str);
1294                         if (deltanext == 0 && pos_before == b) {
1295                             /* What was added is a constant string */
1296                             if (mincount > 1) {
1297                                 SvGROW(last_str, (mincount * l) + 1);
1298                                 repeatcpy(SvPVX(last_str) + l,
1299                                           SvPVX(last_str), l, mincount - 1);
1300                                 SvCUR(last_str) *= mincount;
1301                                 /* Add additional parts. */
1302                                 SvCUR_set(data->last_found,
1303                                           SvCUR(data->last_found) - l);
1304                                 sv_catsv(data->last_found, last_str);
1305                                 {
1306                                     SV * sv = data->last_found;
1307                                     MAGIC *mg =
1308                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1309                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1310                                     if (mg && mg->mg_len >= 0)
1311                                         mg->mg_len += CHR_SVLEN(last_str);
1312                                 }
1313                                 data->last_end += l * (mincount - 1);
1314                             }
1315                         } else {
1316                             /* start offset must point into the last copy */
1317                             data->last_start_min += minnext * (mincount - 1);
1318                             data->last_start_max += is_inf ? I32_MAX
1319                                 : (maxcount - 1) * (minnext + data->pos_delta);
1320                         }
1321                     }
1322                     /* It is counted once already... */
1323                     data->pos_min += minnext * (mincount - counted);
1324                     data->pos_delta += - counted * deltanext +
1325                         (minnext + deltanext) * maxcount - minnext * mincount;
1326                     if (mincount != maxcount) {
1327                          /* Cannot extend fixed substrings found inside
1328                             the group.  */
1329                         scan_commit(pRExC_state,data);
1330                         if (mincount && last_str) {
1331                             sv_setsv(data->last_found, last_str);
1332                             data->last_end = data->pos_min;
1333                             data->last_start_min =
1334                                 data->pos_min - CHR_SVLEN(last_str);
1335                             data->last_start_max = is_inf
1336                                 ? I32_MAX
1337                                 : data->pos_min + data->pos_delta
1338                                 - CHR_SVLEN(last_str);
1339                         }
1340                         data->longest = &(data->longest_float);
1341                     }
1342                     SvREFCNT_dec(last_str);
1343                 }
1344                 if (data && (fl & SF_HAS_EVAL))
1345                     data->flags |= SF_HAS_EVAL;
1346               optimize_curly_tail:
1347                 if (OP(oscan) != CURLYX) {
1348                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1349                            && NEXT_OFF(next))
1350                         NEXT_OFF(oscan) += NEXT_OFF(next);
1351                 }
1352                 continue;
1353             default:                    /* REF and CLUMP only? */
1354                 if (flags & SCF_DO_SUBSTR) {
1355                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1356                     data->longest = &(data->longest_float);
1357                 }
1358                 is_inf = is_inf_internal = 1;
1359                 if (flags & SCF_DO_STCLASS_OR)
1360                     cl_anything(pRExC_state, data->start_class);
1361                 flags &= ~SCF_DO_STCLASS;
1362                 break;
1363             }
1364         }
1365         else if (strchr((char*)PL_simple,OP(scan))) {
1366             int value = 0;
1367
1368             if (flags & SCF_DO_SUBSTR) {
1369                 scan_commit(pRExC_state,data);
1370                 data->pos_min++;
1371             }
1372             min++;
1373             if (flags & SCF_DO_STCLASS) {
1374                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1375
1376                 /* Some of the logic below assumes that switching
1377                    locale on will only add false positives. */
1378                 switch (PL_regkind[(U8)OP(scan)]) {
1379                 case SANY:
1380                 default:
1381                   do_default:
1382                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1383                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1384                         cl_anything(pRExC_state, data->start_class);
1385                     break;
1386                 case REG_ANY:
1387                     if (OP(scan) == SANY)
1388                         goto do_default;
1389                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1390                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1391                                  || (data->start_class->flags & ANYOF_CLASS));
1392                         cl_anything(pRExC_state, data->start_class);
1393                     }
1394                     if (flags & SCF_DO_STCLASS_AND || !value)
1395                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1396                     break;
1397                 case ANYOF:
1398                     if (flags & SCF_DO_STCLASS_AND)
1399                         cl_and(data->start_class,
1400                                (struct regnode_charclass_class*)scan);
1401                     else
1402                         cl_or(pRExC_state, data->start_class,
1403                               (struct regnode_charclass_class*)scan);
1404                     break;
1405                 case ALNUM:
1406                     if (flags & SCF_DO_STCLASS_AND) {
1407                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1409                             for (value = 0; value < 256; value++)
1410                                 if (!isALNUM(value))
1411                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1412                         }
1413                     }
1414                     else {
1415                         if (data->start_class->flags & ANYOF_LOCALE)
1416                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1417                         else {
1418                             for (value = 0; value < 256; value++)
1419                                 if (isALNUM(value))
1420                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1421                         }
1422                     }
1423                     break;
1424                 case ALNUML:
1425                     if (flags & SCF_DO_STCLASS_AND) {
1426                         if (data->start_class->flags & ANYOF_LOCALE)
1427                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1428                     }
1429                     else {
1430                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1431                         data->start_class->flags |= ANYOF_LOCALE;
1432                     }
1433                     break;
1434                 case NALNUM:
1435                     if (flags & SCF_DO_STCLASS_AND) {
1436                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1438                             for (value = 0; value < 256; value++)
1439                                 if (isALNUM(value))
1440                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1441                         }
1442                     }
1443                     else {
1444                         if (data->start_class->flags & ANYOF_LOCALE)
1445                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1446                         else {
1447                             for (value = 0; value < 256; value++)
1448                                 if (!isALNUM(value))
1449                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1450                         }
1451                     }
1452                     break;
1453                 case NALNUML:
1454                     if (flags & SCF_DO_STCLASS_AND) {
1455                         if (data->start_class->flags & ANYOF_LOCALE)
1456                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1457                     }
1458                     else {
1459                         data->start_class->flags |= ANYOF_LOCALE;
1460                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1461                     }
1462                     break;
1463                 case SPACE:
1464                     if (flags & SCF_DO_STCLASS_AND) {
1465                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1467                             for (value = 0; value < 256; value++)
1468                                 if (!isSPACE(value))
1469                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1470                         }
1471                     }
1472                     else {
1473                         if (data->start_class->flags & ANYOF_LOCALE)
1474                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1475                         else {
1476                             for (value = 0; value < 256; value++)
1477                                 if (isSPACE(value))
1478                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1479                         }
1480                     }
1481                     break;
1482                 case SPACEL:
1483                     if (flags & SCF_DO_STCLASS_AND) {
1484                         if (data->start_class->flags & ANYOF_LOCALE)
1485                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1486                     }
1487                     else {
1488                         data->start_class->flags |= ANYOF_LOCALE;
1489                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1490                     }
1491                     break;
1492                 case NSPACE:
1493                     if (flags & SCF_DO_STCLASS_AND) {
1494                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1495                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1496                             for (value = 0; value < 256; value++)
1497                                 if (isSPACE(value))
1498                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1499                         }
1500                     }
1501                     else {
1502                         if (data->start_class->flags & ANYOF_LOCALE)
1503                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1504                         else {
1505                             for (value = 0; value < 256; value++)
1506                                 if (!isSPACE(value))
1507                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1508                         }
1509                     }
1510                     break;
1511                 case NSPACEL:
1512                     if (flags & SCF_DO_STCLASS_AND) {
1513                         if (data->start_class->flags & ANYOF_LOCALE) {
1514                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1515                             for (value = 0; value < 256; value++)
1516                                 if (!isSPACE(value))
1517                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1518                         }
1519                     }
1520                     else {
1521                         data->start_class->flags |= ANYOF_LOCALE;
1522                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1523                     }
1524                     break;
1525                 case DIGIT:
1526                     if (flags & SCF_DO_STCLASS_AND) {
1527                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1528                         for (value = 0; value < 256; value++)
1529                             if (!isDIGIT(value))
1530                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1531                     }
1532                     else {
1533                         if (data->start_class->flags & ANYOF_LOCALE)
1534                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1535                         else {
1536                             for (value = 0; value < 256; value++)
1537                                 if (isDIGIT(value))
1538                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1539                         }
1540                     }
1541                     break;
1542                 case NDIGIT:
1543                     if (flags & SCF_DO_STCLASS_AND) {
1544                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1545                         for (value = 0; value < 256; value++)
1546                             if (isDIGIT(value))
1547                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1548                     }
1549                     else {
1550                         if (data->start_class->flags & ANYOF_LOCALE)
1551                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1552                         else {
1553                             for (value = 0; value < 256; value++)
1554                                 if (!isDIGIT(value))
1555                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1556                         }
1557                     }
1558                     break;
1559                 }
1560                 if (flags & SCF_DO_STCLASS_OR)
1561                     cl_and(data->start_class, &and_with);
1562                 flags &= ~SCF_DO_STCLASS;
1563             }
1564         }
1565         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1566             data->flags |= (OP(scan) == MEOL
1567                             ? SF_BEFORE_MEOL
1568                             : SF_BEFORE_SEOL);
1569         }
1570         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1571                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1572                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1573                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1574             /* Lookahead/lookbehind */
1575             I32 deltanext, minnext, fake = 0;
1576             regnode *nscan;
1577             struct regnode_charclass_class intrnl;
1578             int f = 0;
1579
1580             data_fake.flags = 0;
1581             if (data) {         
1582                 data_fake.whilem_c = data->whilem_c;
1583                 data_fake.last_closep = data->last_closep;
1584             }
1585             else
1586                 data_fake.last_closep = &fake;
1587             if ( flags & SCF_DO_STCLASS && !scan->flags
1588                  && OP(scan) == IFMATCH ) { /* Lookahead */
1589                 cl_init(pRExC_state, &intrnl);
1590                 data_fake.start_class = &intrnl;
1591                 f |= SCF_DO_STCLASS_AND;
1592             }
1593             if (flags & SCF_WHILEM_VISITED_POS)
1594                 f |= SCF_WHILEM_VISITED_POS;
1595             next = regnext(scan);
1596             nscan = NEXTOPER(NEXTOPER(scan));
1597             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1598             if (scan->flags) {
1599                 if (deltanext) {
1600                     vFAIL("Variable length lookbehind not implemented");
1601                 }
1602                 else if (minnext > U8_MAX) {
1603                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1604                 }
1605                 scan->flags = (U8)minnext;
1606             }
1607             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1608                 pars++;
1609             if (data && (data_fake.flags & SF_HAS_EVAL))
1610                 data->flags |= SF_HAS_EVAL;
1611             if (data)
1612                 data->whilem_c = data_fake.whilem_c;
1613             if (f & SCF_DO_STCLASS_AND) {
1614                 int was = (data->start_class->flags & ANYOF_EOS);
1615
1616                 cl_and(data->start_class, &intrnl);
1617                 if (was)
1618                     data->start_class->flags |= ANYOF_EOS;
1619             }
1620         }
1621         else if (OP(scan) == OPEN) {
1622             pars++;
1623         }
1624         else if (OP(scan) == CLOSE) {
1625             if ((I32)ARG(scan) == is_par) {
1626                 next = regnext(scan);
1627
1628                 if ( next && (OP(next) != WHILEM) && next < last)
1629                     is_par = 0;         /* Disable optimization */
1630             }
1631             if (data)
1632                 *(data->last_closep) = ARG(scan);
1633         }
1634         else if (OP(scan) == EVAL) {
1635                 if (data)
1636                     data->flags |= SF_HAS_EVAL;
1637         }
1638         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1639                 if (flags & SCF_DO_SUBSTR) {
1640                     scan_commit(pRExC_state,data);
1641                     data->longest = &(data->longest_float);
1642                 }
1643                 is_inf = is_inf_internal = 1;
1644                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1645                     cl_anything(pRExC_state, data->start_class);
1646                 flags &= ~SCF_DO_STCLASS;
1647         }
1648         /* Else: zero-length, ignore. */
1649         scan = regnext(scan);
1650     }
1651
1652   finish:
1653     *scanp = scan;
1654     *deltap = is_inf_internal ? I32_MAX : delta;
1655     if (flags & SCF_DO_SUBSTR && is_inf)
1656         data->pos_delta = I32_MAX - data->pos_min;
1657     if (is_par > U8_MAX)
1658         is_par = 0;
1659     if (is_par && pars==1 && data) {
1660         data->flags |= SF_IN_PAR;
1661         data->flags &= ~SF_HAS_PAR;
1662     }
1663     else if (pars && data) {
1664         data->flags |= SF_HAS_PAR;
1665         data->flags &= ~SF_IN_PAR;
1666     }
1667     if (flags & SCF_DO_STCLASS_OR)
1668         cl_and(data->start_class, &and_with);
1669     return min;
1670 }
1671
1672 STATIC I32
1673 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1674 {
1675     if (RExC_rx->data) {
1676         Renewc(RExC_rx->data,
1677                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1678                char, struct reg_data);
1679         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1680         RExC_rx->data->count += n;
1681     }
1682     else {
1683         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1684              char, struct reg_data);
1685         New(1208, RExC_rx->data->what, n, U8);
1686         RExC_rx->data->count = n;
1687     }
1688     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1689     return RExC_rx->data->count - n;
1690 }
1691
1692 void
1693 Perl_reginitcolors(pTHX)
1694 {
1695     int i = 0;
1696     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1697         
1698     if (s) {
1699         PL_colors[0] = s = savepv(s);
1700         while (++i < 6) {
1701             s = strchr(s, '\t');
1702             if (s) {
1703                 *s = '\0';
1704                 PL_colors[i] = ++s;
1705             }
1706             else
1707                 PL_colors[i] = s = "";
1708         }
1709     } else {
1710         while (i < 6)
1711             PL_colors[i++] = "";
1712     }
1713     PL_colorset = 1;
1714 }
1715
1716
1717 /*
1718  - pregcomp - compile a regular expression into internal code
1719  *
1720  * We can't allocate space until we know how big the compiled form will be,
1721  * but we can't compile it (and thus know how big it is) until we've got a
1722  * place to put the code.  So we cheat:  we compile it twice, once with code
1723  * generation turned off and size counting turned on, and once "for real".
1724  * This also means that we don't allocate space until we are sure that the
1725  * thing really will compile successfully, and we never have to move the
1726  * code and thus invalidate pointers into it.  (Note that it has to be in
1727  * one piece because free() must be able to free it all.) [NB: not true in perl]
1728  *
1729  * Beware that the optimization-preparation code in here knows about some
1730  * of the structure of the compiled regexp.  [I'll say.]
1731  */
1732 regexp *
1733 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1734 {
1735     register regexp *r;
1736     regnode *scan;
1737     regnode *first;
1738     I32 flags;
1739     I32 minlen = 0;
1740     I32 sawplus = 0;
1741     I32 sawopen = 0;
1742     scan_data_t data;
1743     RExC_state_t RExC_state;
1744     RExC_state_t *pRExC_state = &RExC_state;
1745
1746     if (exp == NULL)
1747         FAIL("NULL regexp argument");
1748
1749     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1750
1751     RExC_precomp = exp;
1752     DEBUG_r({
1753          if (!PL_colorset) reginitcolors();
1754          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1755                        PL_colors[4],PL_colors[5],PL_colors[0],
1756                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1757     });
1758     RExC_flags = pm->op_pmflags;
1759     RExC_sawback = 0;
1760
1761     RExC_seen = 0;
1762     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1763     RExC_seen_evals = 0;
1764     RExC_extralen = 0;
1765
1766     /* First pass: determine size, legality. */
1767     RExC_parse = exp;
1768     RExC_start = exp;
1769     RExC_end = xend;
1770     RExC_naughty = 0;
1771     RExC_npar = 1;
1772     RExC_size = 0L;
1773     RExC_emit = &PL_regdummy;
1774     RExC_whilem_seen = 0;
1775 #if 0 /* REGC() is (currently) a NOP at the first pass.
1776        * Clever compilers notice this and complain. --jhi */
1777     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1778 #endif
1779     if (reg(pRExC_state, 0, &flags) == NULL) {
1780         RExC_precomp = Nullch;
1781         return(NULL);
1782     }
1783     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1784
1785     /* Small enough for pointer-storage convention?
1786        If extralen==0, this means that we will not need long jumps. */
1787     if (RExC_size >= 0x10000L && RExC_extralen)
1788         RExC_size += RExC_extralen;
1789     else
1790         RExC_extralen = 0;
1791     if (RExC_whilem_seen > 15)
1792         RExC_whilem_seen = 15;
1793
1794     /* Allocate space and initialize. */
1795     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1796          char, regexp);
1797     if (r == NULL)
1798         FAIL("Regexp out of space");
1799
1800 #ifdef DEBUGGING
1801     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1802     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1803 #endif
1804     r->refcnt = 1;
1805     r->prelen = xend - exp;
1806     r->precomp = savepvn(RExC_precomp, r->prelen);
1807     r->subbeg = NULL;
1808 #ifdef PERL_COPY_ON_WRITE
1809     r->saved_copy = Nullsv;
1810 #endif
1811     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1812     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1813
1814     r->substrs = 0;                     /* Useful during FAIL. */
1815     r->startp = 0;                      /* Useful during FAIL. */
1816     r->endp = 0;                        /* Useful during FAIL. */
1817
1818     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1819     if (r->offsets) {
1820       r->offsets[0] = RExC_size; 
1821     }
1822     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1823                           "%s %"UVuf" bytes for offset annotations.\n", 
1824                           r->offsets ? "Got" : "Couldn't get", 
1825                           (UV)((2*RExC_size+1) * sizeof(U32))));
1826
1827     RExC_rx = r;
1828
1829     /* Second pass: emit code. */
1830     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1831     RExC_parse = exp;
1832     RExC_end = xend;
1833     RExC_naughty = 0;
1834     RExC_npar = 1;
1835     RExC_emit_start = r->program;
1836     RExC_emit = r->program;
1837     /* Store the count of eval-groups for security checks: */
1838     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1839     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1840     r->data = 0;
1841     if (reg(pRExC_state, 0, &flags) == NULL)
1842         return(NULL);
1843
1844     /* Dig out information for optimizations. */
1845     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1846     pm->op_pmflags = RExC_flags;
1847     if (UTF)
1848         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1849     r->regstclass = NULL;
1850     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1851         r->reganch |= ROPT_NAUGHTY;
1852     scan = r->program + 1;              /* First BRANCH. */
1853
1854     /* XXXX To minimize changes to RE engine we always allocate
1855        3-units-long substrs field. */
1856     Newz(1004, r->substrs, 1, struct reg_substr_data);
1857
1858     StructCopy(&zero_scan_data, &data, scan_data_t);
1859     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1860     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1861         I32 fake;
1862         STRLEN longest_float_length, longest_fixed_length;
1863         struct regnode_charclass_class ch_class;
1864         int stclass_flag;
1865         I32 last_close = 0;
1866
1867         first = scan;
1868         /* Skip introductions and multiplicators >= 1. */
1869         while ((OP(first) == OPEN && (sawopen = 1)) ||
1870                /* An OR of *one* alternative - should not happen now. */
1871             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1872             (OP(first) == PLUS) ||
1873             (OP(first) == MINMOD) ||
1874                /* An {n,m} with n>0 */
1875             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1876                 if (OP(first) == PLUS)
1877                     sawplus = 1;
1878                 else
1879                     first += regarglen[(U8)OP(first)];
1880                 first = NEXTOPER(first);
1881         }
1882
1883         /* Starting-point info. */
1884       again:
1885         if (PL_regkind[(U8)OP(first)] == EXACT) {
1886             if (OP(first) == EXACT)
1887                 ;       /* Empty, get anchored substr later. */
1888             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1889                 r->regstclass = first;
1890         }
1891         else if (strchr((char*)PL_simple,OP(first)))
1892             r->regstclass = first;
1893         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1894                  PL_regkind[(U8)OP(first)] == NBOUND)
1895             r->regstclass = first;
1896         else if (PL_regkind[(U8)OP(first)] == BOL) {
1897             r->reganch |= (OP(first) == MBOL
1898                            ? ROPT_ANCH_MBOL
1899                            : (OP(first) == SBOL
1900                               ? ROPT_ANCH_SBOL
1901                               : ROPT_ANCH_BOL));
1902             first = NEXTOPER(first);
1903             goto again;
1904         }
1905         else if (OP(first) == GPOS) {
1906             r->reganch |= ROPT_ANCH_GPOS;
1907             first = NEXTOPER(first);
1908             goto again;
1909         }
1910         else if (!sawopen && (OP(first) == STAR &&
1911             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1912             !(r->reganch & ROPT_ANCH) )
1913         {
1914             /* turn .* into ^.* with an implied $*=1 */
1915             int type = OP(NEXTOPER(first));
1916
1917             if (type == REG_ANY)
1918                 type = ROPT_ANCH_MBOL;
1919             else
1920                 type = ROPT_ANCH_SBOL;
1921
1922             r->reganch |= type | ROPT_IMPLICIT;
1923             first = NEXTOPER(first);
1924             goto again;
1925         }
1926         if (sawplus && (!sawopen || !RExC_sawback)
1927             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1928             /* x+ must match at the 1st pos of run of x's */
1929             r->reganch |= ROPT_SKIP;
1930
1931         /* Scan is after the zeroth branch, first is atomic matcher. */
1932         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1933                               (IV)(first - scan + 1)));
1934         /*
1935         * If there's something expensive in the r.e., find the
1936         * longest literal string that must appear and make it the
1937         * regmust.  Resolve ties in favor of later strings, since
1938         * the regstart check works with the beginning of the r.e.
1939         * and avoiding duplication strengthens checking.  Not a
1940         * strong reason, but sufficient in the absence of others.
1941         * [Now we resolve ties in favor of the earlier string if
1942         * it happens that c_offset_min has been invalidated, since the
1943         * earlier string may buy us something the later one won't.]
1944         */
1945         minlen = 0;
1946
1947         data.longest_fixed = newSVpvn("",0);
1948         data.longest_float = newSVpvn("",0);
1949         data.last_found = newSVpvn("",0);
1950         data.longest = &(data.longest_fixed);
1951         first = scan;
1952         if (!r->regstclass) {
1953             cl_init(pRExC_state, &ch_class);
1954             data.start_class = &ch_class;
1955             stclass_flag = SCF_DO_STCLASS_AND;
1956         } else                          /* XXXX Check for BOUND? */
1957             stclass_flag = 0;
1958         data.last_closep = &last_close;
1959
1960         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1961                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1962         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1963              && data.last_start_min == 0 && data.last_end > 0
1964              && !RExC_seen_zerolen
1965              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1966             r->reganch |= ROPT_CHECK_ALL;
1967         scan_commit(pRExC_state, &data);
1968         SvREFCNT_dec(data.last_found);
1969
1970         longest_float_length = CHR_SVLEN(data.longest_float);
1971         if (longest_float_length
1972             || (data.flags & SF_FL_BEFORE_EOL
1973                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1974                     || (RExC_flags & PMf_MULTILINE)))) {
1975             int t;
1976
1977             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1978                 && data.offset_fixed == data.offset_float_min
1979                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1980                     goto remove_float;          /* As in (a)+. */
1981
1982             if (SvUTF8(data.longest_float)) {
1983                 r->float_utf8 = data.longest_float;
1984                 r->float_substr = Nullsv;
1985             } else {
1986                 r->float_substr = data.longest_float;
1987                 r->float_utf8 = Nullsv;
1988             }
1989             r->float_min_offset = data.offset_float_min;
1990             r->float_max_offset = data.offset_float_max;
1991             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1992                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1993                            || (RExC_flags & PMf_MULTILINE)));
1994             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1995         }
1996         else {
1997           remove_float:
1998             r->float_substr = r->float_utf8 = Nullsv;
1999             SvREFCNT_dec(data.longest_float);
2000             longest_float_length = 0;
2001         }
2002
2003         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2004         if (longest_fixed_length
2005             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2006                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2007                     || (RExC_flags & PMf_MULTILINE)))) {
2008             int t;
2009
2010             if (SvUTF8(data.longest_fixed)) {
2011                 r->anchored_utf8 = data.longest_fixed;
2012                 r->anchored_substr = Nullsv;
2013             } else {
2014                 r->anchored_substr = data.longest_fixed;
2015                 r->anchored_utf8 = Nullsv;
2016             }
2017             r->anchored_offset = data.offset_fixed;
2018             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2019                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2020                      || (RExC_flags & PMf_MULTILINE)));
2021             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2022         }
2023         else {
2024             r->anchored_substr = r->anchored_utf8 = Nullsv;
2025             SvREFCNT_dec(data.longest_fixed);
2026             longest_fixed_length = 0;
2027         }
2028         if (r->regstclass
2029             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2030             r->regstclass = NULL;
2031         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2032             && stclass_flag
2033             && !(data.start_class->flags & ANYOF_EOS)
2034             && !cl_is_anything(data.start_class))
2035         {
2036             I32 n = add_data(pRExC_state, 1, "f");
2037
2038             New(1006, RExC_rx->data->data[n], 1,
2039                 struct regnode_charclass_class);
2040             StructCopy(data.start_class,
2041                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2042                        struct regnode_charclass_class);
2043             r->regstclass = (regnode*)RExC_rx->data->data[n];
2044             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2045             PL_regdata = r->data; /* for regprop() */
2046             DEBUG_r({ SV *sv = sv_newmortal();
2047                       regprop(sv, (regnode*)data.start_class);
2048                       PerlIO_printf(Perl_debug_log,
2049                                     "synthetic stclass `%s'.\n",
2050                                     SvPVX(sv));});
2051         }
2052
2053         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2054         if (longest_fixed_length > longest_float_length) {
2055             r->check_substr = r->anchored_substr;
2056             r->check_utf8 = r->anchored_utf8;
2057             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2058             if (r->reganch & ROPT_ANCH_SINGLE)
2059                 r->reganch |= ROPT_NOSCAN;
2060         }
2061         else {
2062             r->check_substr = r->float_substr;
2063             r->check_utf8 = r->float_utf8;
2064             r->check_offset_min = data.offset_float_min;
2065             r->check_offset_max = data.offset_float_max;
2066         }
2067         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2068            This should be changed ASAP!  */
2069         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2070             r->reganch |= RE_USE_INTUIT;
2071             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2072                 r->reganch |= RE_INTUIT_TAIL;
2073         }
2074     }
2075     else {
2076         /* Several toplevels. Best we can is to set minlen. */
2077         I32 fake;
2078         struct regnode_charclass_class ch_class;
2079         I32 last_close = 0;
2080         
2081         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2082         scan = r->program + 1;
2083         cl_init(pRExC_state, &ch_class);
2084         data.start_class = &ch_class;
2085         data.last_closep = &last_close;
2086         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2087         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2088                 = r->float_substr = r->float_utf8 = Nullsv;
2089         if (!(data.start_class->flags & ANYOF_EOS)
2090             && !cl_is_anything(data.start_class))
2091         {
2092             I32 n = add_data(pRExC_state, 1, "f");
2093
2094             New(1006, RExC_rx->data->data[n], 1,
2095                 struct regnode_charclass_class);
2096             StructCopy(data.start_class,
2097                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2098                        struct regnode_charclass_class);
2099             r->regstclass = (regnode*)RExC_rx->data->data[n];
2100             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2101             DEBUG_r({ SV* sv = sv_newmortal();
2102                       regprop(sv, (regnode*)data.start_class);
2103                       PerlIO_printf(Perl_debug_log,
2104                                     "synthetic stclass `%s'.\n",
2105                                     SvPVX(sv));});
2106         }
2107     }
2108
2109     r->minlen = minlen;
2110     if (RExC_seen & REG_SEEN_GPOS)
2111         r->reganch |= ROPT_GPOS_SEEN;
2112     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2113         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2114     if (RExC_seen & REG_SEEN_EVAL)
2115         r->reganch |= ROPT_EVAL_SEEN;
2116     if (RExC_seen & REG_SEEN_CANY)
2117         r->reganch |= ROPT_CANY_SEEN;
2118     Newz(1002, r->startp, RExC_npar, I32);
2119     Newz(1002, r->endp, RExC_npar, I32);
2120     PL_regdata = r->data; /* for regprop() */
2121     DEBUG_r(regdump(r));
2122     return(r);
2123 }
2124
2125 /*
2126  - reg - regular expression, i.e. main body or parenthesized thing
2127  *
2128  * Caller must absorb opening parenthesis.
2129  *
2130  * Combining parenthesis handling with the base level of regular expression
2131  * is a trifle forced, but the need to tie the tails of the branches to what
2132  * follows makes it hard to avoid.
2133  */
2134 STATIC regnode *
2135 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2136     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2137 {
2138     register regnode *ret;              /* Will be the head of the group. */
2139     register regnode *br;
2140     register regnode *lastbr;
2141     register regnode *ender = 0;
2142     register I32 parno = 0;
2143     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2144
2145     /* for (?g), (?gc), and (?o) warnings; warning
2146        about (?c) will warn about (?g) -- japhy    */
2147
2148     I32 wastedflags = 0x00,
2149         wasted_o    = 0x01,
2150         wasted_g    = 0x02,
2151         wasted_gc   = 0x02 | 0x04,
2152         wasted_c    = 0x04;
2153
2154     char * parse_start = RExC_parse; /* MJD */
2155     char *oregcomp_parse = RExC_parse;
2156     char c;
2157
2158     *flagp = 0;                         /* Tentatively. */
2159
2160
2161     /* Make an OPEN node, if parenthesized. */
2162     if (paren) {
2163         if (*RExC_parse == '?') { /* (?...) */
2164             U32 posflags = 0, negflags = 0;
2165             U32 *flagsp = &posflags;
2166             int logical = 0;
2167             char *seqstart = RExC_parse;
2168
2169             RExC_parse++;
2170             paren = *RExC_parse++;
2171             ret = NULL;                 /* For look-ahead/behind. */
2172             switch (paren) {
2173             case '<':           /* (?<...) */
2174                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2175                 if (*RExC_parse == '!')
2176                     paren = ',';
2177                 if (*RExC_parse != '=' && *RExC_parse != '!')
2178                     goto unknown;
2179                 RExC_parse++;
2180             case '=':           /* (?=...) */
2181             case '!':           /* (?!...) */
2182                 RExC_seen_zerolen++;
2183             case ':':           /* (?:...) */
2184             case '>':           /* (?>...) */
2185                 break;
2186             case '$':           /* (?$...) */
2187             case '@':           /* (?@...) */
2188                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2189                 break;
2190             case '#':           /* (?#...) */
2191                 while (*RExC_parse && *RExC_parse != ')')
2192                     RExC_parse++;
2193                 if (*RExC_parse != ')')
2194                     FAIL("Sequence (?#... not terminated");
2195                 nextchar(pRExC_state);
2196                 *flagp = TRYAGAIN;
2197                 return NULL;
2198             case 'p':           /* (?p...) */
2199                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2200                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2201                 /* FALL THROUGH*/
2202             case '?':           /* (??...) */
2203                 logical = 1;
2204                 if (*RExC_parse != '{')
2205                     goto unknown;
2206                 paren = *RExC_parse++;
2207                 /* FALL THROUGH */
2208             case '{':           /* (?{...}) */
2209             {
2210                 I32 count = 1, n = 0;
2211                 char c;
2212                 char *s = RExC_parse;
2213                 SV *sv;
2214                 OP_4tree *sop, *rop;
2215
2216                 RExC_seen_zerolen++;
2217                 RExC_seen |= REG_SEEN_EVAL;
2218                 while (count && (c = *RExC_parse)) {
2219                     if (c == '\\' && RExC_parse[1])
2220                         RExC_parse++;
2221                     else if (c == '{')
2222                         count++;
2223                     else if (c == '}')
2224                         count--;
2225                     RExC_parse++;
2226                 }
2227                 if (*RExC_parse != ')')
2228                 {
2229                     RExC_parse = s;             
2230                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2231                 }
2232                 if (!SIZE_ONLY) {
2233                     PAD *pad;
2234                 
2235                     if (RExC_parse - 1 - s)
2236                         sv = newSVpvn(s, RExC_parse - 1 - s);
2237                     else
2238                         sv = newSVpvn("", 0);
2239
2240                     ENTER;
2241                     Perl_save_re_context(aTHX);
2242                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2243                     sop->op_private |= OPpREFCOUNTED;
2244                     /* re_dup will OpREFCNT_inc */
2245                     OpREFCNT_set(sop, 1);
2246                     LEAVE;
2247
2248                     n = add_data(pRExC_state, 3, "nop");
2249                     RExC_rx->data->data[n] = (void*)rop;
2250                     RExC_rx->data->data[n+1] = (void*)sop;
2251                     RExC_rx->data->data[n+2] = (void*)pad;
2252                     SvREFCNT_dec(sv);
2253                 }
2254                 else {                                          /* First pass */
2255                     if (PL_reginterp_cnt < ++RExC_seen_evals
2256                         && PL_curcop != &PL_compiling)
2257                         /* No compiled RE interpolated, has runtime
2258                            components ===> unsafe.  */
2259                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2260                     if (PL_tainting && PL_tainted)
2261                         FAIL("Eval-group in insecure regular expression");
2262                     if (PL_curcop == &PL_compiling)
2263                         PL_cv_has_eval = 1;
2264                 }
2265
2266                 nextchar(pRExC_state);
2267                 if (logical) {
2268                     ret = reg_node(pRExC_state, LOGICAL);
2269                     if (!SIZE_ONLY)
2270                         ret->flags = 2;
2271                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2272                     /* deal with the length of this later - MJD */
2273                     return ret;
2274                 }
2275                 ret = reganode(pRExC_state, EVAL, n);
2276                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2277                 Set_Node_Offset(ret, parse_start);
2278                 return ret;
2279             }
2280             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2281             {
2282                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2283                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2284                         || RExC_parse[1] == '<'
2285                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2286                         I32 flag;
2287                         
2288                         ret = reg_node(pRExC_state, LOGICAL);
2289                         if (!SIZE_ONLY)
2290                             ret->flags = 1;
2291                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2292                         goto insert_if;
2293                     }
2294                 }
2295                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2296                     /* (?(1)...) */
2297                     parno = atoi(RExC_parse++);
2298
2299                     while (isDIGIT(*RExC_parse))
2300                         RExC_parse++;
2301                     ret = reganode(pRExC_state, GROUPP, parno);
2302                     
2303                     if ((c = *nextchar(pRExC_state)) != ')')
2304                         vFAIL("Switch condition not recognized");
2305                   insert_if:
2306                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2307                     br = regbranch(pRExC_state, &flags, 1);
2308                     if (br == NULL)
2309                         br = reganode(pRExC_state, LONGJMP, 0);
2310                     else
2311                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2312                     c = *nextchar(pRExC_state);
2313                     if (flags&HASWIDTH)
2314                         *flagp |= HASWIDTH;
2315                     if (c == '|') {
2316                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2317                         regbranch(pRExC_state, &flags, 1);
2318                         regtail(pRExC_state, ret, lastbr);
2319                         if (flags&HASWIDTH)
2320                             *flagp |= HASWIDTH;
2321                         c = *nextchar(pRExC_state);
2322                     }
2323                     else
2324                         lastbr = NULL;
2325                     if (c != ')')
2326                         vFAIL("Switch (?(condition)... contains too many branches");
2327                     ender = reg_node(pRExC_state, TAIL);
2328                     regtail(pRExC_state, br, ender);
2329                     if (lastbr) {
2330                         regtail(pRExC_state, lastbr, ender);
2331                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2332                     }
2333                     else
2334                         regtail(pRExC_state, ret, ender);
2335                     return ret;
2336                 }
2337                 else {
2338                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2339                 }
2340             }
2341             case 0:
2342                 RExC_parse--; /* for vFAIL to print correctly */
2343                 vFAIL("Sequence (? incomplete");
2344                 break;
2345             default:
2346                 --RExC_parse;
2347               parse_flags:      /* (?i) */
2348                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2349                     /* (?g), (?gc) and (?o) are useless here
2350                        and must be globally applied -- japhy */
2351
2352                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2353                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2354                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2355                             if (! (wastedflags & wflagbit) ) {
2356                                 wastedflags |= wflagbit;
2357                                 vWARN5(
2358                                     RExC_parse + 1,
2359                                     "Useless (%s%c) - %suse /%c modifier",
2360                                     flagsp == &negflags ? "?-" : "?",
2361                                     *RExC_parse,
2362                                     flagsp == &negflags ? "don't " : "",
2363                                     *RExC_parse
2364                                 );
2365                             }
2366                         }
2367                     }
2368                     else if (*RExC_parse == 'c') {
2369                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2370                             if (! (wastedflags & wasted_c) ) {
2371                                 wastedflags |= wasted_gc;
2372                                 vWARN3(
2373                                     RExC_parse + 1,
2374                                     "Useless (%sc) - %suse /gc modifier",
2375                                     flagsp == &negflags ? "?-" : "?",
2376                                     flagsp == &negflags ? "don't " : ""
2377                                 );
2378                             }
2379                         }
2380                     }
2381                     else { pmflag(flagsp, *RExC_parse); }
2382
2383                     ++RExC_parse;
2384                 }
2385                 if (*RExC_parse == '-') {
2386                     flagsp = &negflags;
2387                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2388                     ++RExC_parse;
2389                     goto parse_flags;
2390                 }
2391                 RExC_flags |= posflags;
2392                 RExC_flags &= ~negflags;
2393                 if (*RExC_parse == ':') {
2394                     RExC_parse++;
2395                     paren = ':';
2396                     break;
2397                 }               
2398               unknown:
2399                 if (*RExC_parse != ')') {
2400                     RExC_parse++;
2401                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2402                 }
2403                 nextchar(pRExC_state);
2404                 *flagp = TRYAGAIN;
2405                 return NULL;
2406             }
2407         }
2408         else {                  /* (...) */
2409             parno = RExC_npar;
2410             RExC_npar++;
2411             ret = reganode(pRExC_state, OPEN, parno);
2412             Set_Node_Length(ret, 1); /* MJD */
2413             Set_Node_Offset(ret, RExC_parse); /* MJD */
2414             open = 1;
2415         }
2416     }
2417     else                        /* ! paren */
2418         ret = NULL;
2419
2420     /* Pick up the branches, linking them together. */
2421     parse_start = RExC_parse;   /* MJD */
2422     br = regbranch(pRExC_state, &flags, 1);
2423     /*     branch_len = (paren != 0); */
2424     
2425     if (br == NULL)
2426         return(NULL);
2427     if (*RExC_parse == '|') {
2428         if (!SIZE_ONLY && RExC_extralen) {
2429             reginsert(pRExC_state, BRANCHJ, br);
2430         }
2431         else {                  /* MJD */
2432             reginsert(pRExC_state, BRANCH, br);
2433             Set_Node_Length(br, paren != 0);
2434             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2435         }
2436         have_branch = 1;
2437         if (SIZE_ONLY)
2438             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2439     }
2440     else if (paren == ':') {
2441         *flagp |= flags&SIMPLE;
2442     }
2443     if (open) {                         /* Starts with OPEN. */
2444         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2445     }
2446     else if (paren != '?')              /* Not Conditional */
2447         ret = br;
2448     *flagp |= flags & (SPSTART | HASWIDTH);
2449     lastbr = br;
2450     while (*RExC_parse == '|') {
2451         if (!SIZE_ONLY && RExC_extralen) {
2452             ender = reganode(pRExC_state, LONGJMP,0);
2453             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2454         }
2455         if (SIZE_ONLY)
2456             RExC_extralen += 2;         /* Account for LONGJMP. */
2457         nextchar(pRExC_state);
2458         br = regbranch(pRExC_state, &flags, 0);
2459         
2460         if (br == NULL)
2461             return(NULL);
2462         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2463         lastbr = br;
2464         if (flags&HASWIDTH)
2465             *flagp |= HASWIDTH;
2466         *flagp |= flags&SPSTART;
2467     }
2468
2469     if (have_branch || paren != ':') {
2470         /* Make a closing node, and hook it on the end. */
2471         switch (paren) {
2472         case ':':
2473             ender = reg_node(pRExC_state, TAIL);
2474             break;
2475         case 1:
2476             ender = reganode(pRExC_state, CLOSE, parno);
2477             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2478             Set_Node_Length(ender,1); /* MJD */
2479             break;
2480         case '<':
2481         case ',':
2482         case '=':
2483         case '!':
2484             *flagp &= ~HASWIDTH;
2485             /* FALL THROUGH */
2486         case '>':
2487             ender = reg_node(pRExC_state, SUCCEED);
2488             break;
2489         case 0:
2490             ender = reg_node(pRExC_state, END);
2491             break;
2492         }
2493         regtail(pRExC_state, lastbr, ender);
2494
2495         if (have_branch) {
2496             /* Hook the tails of the branches to the closing node. */
2497             for (br = ret; br != NULL; br = regnext(br)) {
2498                 regoptail(pRExC_state, br, ender);
2499             }
2500         }
2501     }
2502
2503     {
2504         char *p;
2505         static char parens[] = "=!<,>";
2506
2507         if (paren && (p = strchr(parens, paren))) {
2508             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2509             int flag = (p - parens) > 1;
2510
2511             if (paren == '>')
2512                 node = SUSPEND, flag = 0;
2513             reginsert(pRExC_state, node,ret);
2514             Set_Node_Offset(ret, oregcomp_parse);
2515             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2516             ret->flags = flag;
2517             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2518         }
2519     }
2520
2521     /* Check for proper termination. */
2522     if (paren) {
2523         RExC_flags = oregflags;
2524         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2525             RExC_parse = oregcomp_parse;
2526             vFAIL("Unmatched (");
2527         }
2528     }
2529     else if (!paren && RExC_parse < RExC_end) {
2530         if (*RExC_parse == ')') {
2531             RExC_parse++;
2532             vFAIL("Unmatched )");
2533         }
2534         else
2535             FAIL("Junk on end of regexp");      /* "Can't happen". */
2536         /* NOTREACHED */
2537     }
2538
2539     return(ret);
2540 }
2541
2542 /*
2543  - regbranch - one alternative of an | operator
2544  *
2545  * Implements the concatenation operator.
2546  */
2547 STATIC regnode *
2548 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2549 {
2550     register regnode *ret;
2551     register regnode *chain = NULL;
2552     register regnode *latest;
2553     I32 flags = 0, c = 0;
2554
2555     if (first)
2556         ret = NULL;
2557     else {
2558         if (!SIZE_ONLY && RExC_extralen)
2559             ret = reganode(pRExC_state, BRANCHJ,0);
2560         else {
2561             ret = reg_node(pRExC_state, BRANCH);
2562             Set_Node_Length(ret, 1);
2563         }
2564     }
2565         
2566     if (!first && SIZE_ONLY)
2567         RExC_extralen += 1;                     /* BRANCHJ */
2568
2569     *flagp = WORST;                     /* Tentatively. */
2570
2571     RExC_parse--;
2572     nextchar(pRExC_state);
2573     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2574         flags &= ~TRYAGAIN;
2575         latest = regpiece(pRExC_state, &flags);
2576         if (latest == NULL) {
2577             if (flags & TRYAGAIN)
2578                 continue;
2579             return(NULL);
2580         }
2581         else if (ret == NULL)
2582             ret = latest;
2583         *flagp |= flags&HASWIDTH;
2584         if (chain == NULL)      /* First piece. */
2585             *flagp |= flags&SPSTART;
2586         else {
2587             RExC_naughty++;
2588             regtail(pRExC_state, chain, latest);
2589         }
2590         chain = latest;
2591         c++;
2592     }
2593     if (chain == NULL) {        /* Loop ran zero times. */
2594         chain = reg_node(pRExC_state, NOTHING);
2595         if (ret == NULL)
2596             ret = chain;
2597     }
2598     if (c == 1) {
2599         *flagp |= flags&SIMPLE;
2600     }
2601
2602     return(ret);
2603 }
2604
2605 /*
2606  - regpiece - something followed by possible [*+?]
2607  *
2608  * Note that the branching code sequences used for ? and the general cases
2609  * of * and + are somewhat optimized:  they use the same NOTHING node as
2610  * both the endmarker for their branch list and the body of the last branch.
2611  * It might seem that this node could be dispensed with entirely, but the
2612  * endmarker role is not redundant.
2613  */
2614 STATIC regnode *
2615 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2616 {
2617     register regnode *ret;
2618     register char op;
2619     register char *next;
2620     I32 flags;
2621     char *origparse = RExC_parse;
2622     char *maxpos;
2623     I32 min;
2624     I32 max = REG_INFTY;
2625     char *parse_start;
2626
2627     ret = regatom(pRExC_state, &flags);
2628     if (ret == NULL) {
2629         if (flags & TRYAGAIN)
2630             *flagp |= TRYAGAIN;
2631         return(NULL);
2632     }
2633
2634     op = *RExC_parse;
2635
2636     if (op == '{' && regcurly(RExC_parse)) {
2637         parse_start = RExC_parse; /* MJD */
2638         next = RExC_parse + 1;
2639         maxpos = Nullch;
2640         while (isDIGIT(*next) || *next == ',') {
2641             if (*next == ',') {
2642                 if (maxpos)
2643                     break;
2644                 else
2645                     maxpos = next;
2646             }
2647             next++;
2648         }
2649         if (*next == '}') {             /* got one */
2650             if (!maxpos)
2651                 maxpos = next;
2652             RExC_parse++;
2653             min = atoi(RExC_parse);
2654             if (*maxpos == ',')
2655                 maxpos++;
2656             else
2657                 maxpos = RExC_parse;
2658             max = atoi(maxpos);
2659             if (!max && *maxpos != '0')
2660                 max = REG_INFTY;                /* meaning "infinity" */
2661             else if (max >= REG_INFTY)
2662                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2663             RExC_parse = next;
2664             nextchar(pRExC_state);
2665
2666         do_curly:
2667             if ((flags&SIMPLE)) {
2668                 RExC_naughty += 2 + RExC_naughty / 2;
2669                 reginsert(pRExC_state, CURLY, ret);
2670                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2671                 Set_Node_Cur_Length(ret);
2672             }
2673             else {
2674                 regnode *w = reg_node(pRExC_state, WHILEM);
2675
2676                 w->flags = 0;
2677                 regtail(pRExC_state, ret, w);
2678                 if (!SIZE_ONLY && RExC_extralen) {
2679                     reginsert(pRExC_state, LONGJMP,ret);
2680                     reginsert(pRExC_state, NOTHING,ret);
2681                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2682                 }
2683                 reginsert(pRExC_state, CURLYX,ret);
2684                                 /* MJD hk */
2685                 Set_Node_Offset(ret, parse_start+1);
2686                 Set_Node_Length(ret, 
2687                                 op == '{' ? (RExC_parse - parse_start) : 1);
2688                 
2689                 if (!SIZE_ONLY && RExC_extralen)
2690                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2691                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2692                 if (SIZE_ONLY)
2693                     RExC_whilem_seen++, RExC_extralen += 3;
2694                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2695             }
2696             ret->flags = 0;
2697
2698             if (min > 0)
2699                 *flagp = WORST;
2700             if (max > 0)
2701                 *flagp |= HASWIDTH;
2702             if (max && max < min)
2703                 vFAIL("Can't do {n,m} with n > m");
2704             if (!SIZE_ONLY) {
2705                 ARG1_SET(ret, (U16)min);
2706                 ARG2_SET(ret, (U16)max);
2707             }
2708
2709             goto nest_check;
2710         }
2711     }
2712
2713     if (!ISMULT1(op)) {
2714         *flagp = flags;
2715         return(ret);
2716     }
2717
2718 #if 0                           /* Now runtime fix should be reliable. */
2719
2720     /* if this is reinstated, don't forget to put this back into perldiag:
2721
2722             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2723
2724            (F) The part of the regexp subject to either the * or + quantifier
2725            could match an empty string. The {#} shows in the regular
2726            expression about where the problem was discovered.
2727
2728     */
2729
2730     if (!(flags&HASWIDTH) && op != '?')
2731       vFAIL("Regexp *+ operand could be empty");
2732 #endif
2733
2734     parse_start = RExC_parse;
2735     nextchar(pRExC_state);
2736
2737     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2738
2739     if (op == '*' && (flags&SIMPLE)) {
2740         reginsert(pRExC_state, STAR, ret);
2741         ret->flags = 0;
2742         RExC_naughty += 4;
2743     }
2744     else if (op == '*') {
2745         min = 0;
2746         goto do_curly;
2747     }
2748     else if (op == '+' && (flags&SIMPLE)) {
2749         reginsert(pRExC_state, PLUS, ret);
2750         ret->flags = 0;
2751         RExC_naughty += 3;
2752     }
2753     else if (op == '+') {
2754         min = 1;
2755         goto do_curly;
2756     }
2757     else if (op == '?') {
2758         min = 0; max = 1;
2759         goto do_curly;
2760     }
2761   nest_check:
2762     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2763         vWARN3(RExC_parse,
2764                "%.*s matches null string many times",
2765                RExC_parse - origparse,
2766                origparse);
2767     }
2768
2769     if (*RExC_parse == '?') {
2770         nextchar(pRExC_state);
2771         reginsert(pRExC_state, MINMOD, ret);
2772         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2773     }
2774     if (ISMULT2(RExC_parse)) {
2775         RExC_parse++;
2776         vFAIL("Nested quantifiers");
2777     }
2778
2779     return(ret);
2780 }
2781
2782 /*
2783  - regatom - the lowest level
2784  *
2785  * Optimization:  gobbles an entire sequence of ordinary characters so that
2786  * it can turn them into a single node, which is smaller to store and
2787  * faster to run.  Backslashed characters are exceptions, each becoming a
2788  * separate node; the code is simpler that way and it's not worth fixing.
2789  *
2790  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2791 STATIC regnode *
2792 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2793 {
2794     register regnode *ret = 0;
2795     I32 flags;
2796     char *parse_start = 0;
2797
2798     *flagp = WORST;             /* Tentatively. */
2799
2800 tryagain:
2801     switch (*RExC_parse) {
2802     case '^':
2803         RExC_seen_zerolen++;
2804         nextchar(pRExC_state);
2805         if (RExC_flags & PMf_MULTILINE)
2806             ret = reg_node(pRExC_state, MBOL);
2807         else if (RExC_flags & PMf_SINGLELINE)
2808             ret = reg_node(pRExC_state, SBOL);
2809         else
2810             ret = reg_node(pRExC_state, BOL);
2811         Set_Node_Length(ret, 1); /* MJD */
2812         break;
2813     case '$':
2814         nextchar(pRExC_state);
2815         if (*RExC_parse)
2816             RExC_seen_zerolen++;
2817         if (RExC_flags & PMf_MULTILINE)
2818             ret = reg_node(pRExC_state, MEOL);
2819         else if (RExC_flags & PMf_SINGLELINE)
2820             ret = reg_node(pRExC_state, SEOL);
2821         else
2822             ret = reg_node(pRExC_state, EOL);
2823         Set_Node_Length(ret, 1); /* MJD */
2824         break;
2825     case '.':
2826         nextchar(pRExC_state);
2827         if (RExC_flags & PMf_SINGLELINE)
2828             ret = reg_node(pRExC_state, SANY);
2829         else
2830             ret = reg_node(pRExC_state, REG_ANY);
2831         *flagp |= HASWIDTH|SIMPLE;
2832         RExC_naughty++;
2833         Set_Node_Length(ret, 1); /* MJD */
2834         break;
2835     case '[':
2836     {
2837         char *oregcomp_parse = ++RExC_parse;
2838         ret = regclass(pRExC_state);
2839         if (*RExC_parse != ']') {
2840             RExC_parse = oregcomp_parse;
2841             vFAIL("Unmatched [");
2842         }
2843         nextchar(pRExC_state);
2844         *flagp |= HASWIDTH|SIMPLE;
2845         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2846         break;
2847     }
2848     case '(':
2849         nextchar(pRExC_state);
2850         ret = reg(pRExC_state, 1, &flags);
2851         if (ret == NULL) {
2852                 if (flags & TRYAGAIN) {
2853                     if (RExC_parse == RExC_end) {
2854                          /* Make parent create an empty node if needed. */
2855                         *flagp |= TRYAGAIN;
2856                         return(NULL);
2857                     }
2858                     goto tryagain;
2859                 }
2860                 return(NULL);
2861         }
2862         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2863         break;
2864     case '|':
2865     case ')':
2866         if (flags & TRYAGAIN) {
2867             *flagp |= TRYAGAIN;
2868             return NULL;
2869         }
2870         vFAIL("Internal urp");
2871                                 /* Supposed to be caught earlier. */
2872         break;
2873     case '{':
2874         if (!regcurly(RExC_parse)) {
2875             RExC_parse++;
2876             goto defchar;
2877         }
2878         /* FALL THROUGH */
2879     case '?':
2880     case '+':
2881     case '*':
2882         RExC_parse++;
2883         vFAIL("Quantifier follows nothing");
2884         break;
2885     case '\\':
2886         switch (*++RExC_parse) {
2887         case 'A':
2888             RExC_seen_zerolen++;
2889             ret = reg_node(pRExC_state, SBOL);
2890             *flagp |= SIMPLE;
2891             nextchar(pRExC_state);
2892             Set_Node_Length(ret, 2); /* MJD */
2893             break;
2894         case 'G':
2895             ret = reg_node(pRExC_state, GPOS);
2896             RExC_seen |= REG_SEEN_GPOS;
2897             *flagp |= SIMPLE;
2898             nextchar(pRExC_state);
2899             Set_Node_Length(ret, 2); /* MJD */
2900             break;
2901         case 'Z':
2902             ret = reg_node(pRExC_state, SEOL);
2903             *flagp |= SIMPLE;
2904             RExC_seen_zerolen++;                /* Do not optimize RE away */
2905             nextchar(pRExC_state);
2906             break;
2907         case 'z':
2908             ret = reg_node(pRExC_state, EOS);
2909             *flagp |= SIMPLE;
2910             RExC_seen_zerolen++;                /* Do not optimize RE away */
2911             nextchar(pRExC_state);
2912             Set_Node_Length(ret, 2); /* MJD */
2913             break;
2914         case 'C':
2915             ret = reg_node(pRExC_state, CANY);
2916             RExC_seen |= REG_SEEN_CANY;
2917             *flagp |= HASWIDTH|SIMPLE;
2918             nextchar(pRExC_state);
2919             Set_Node_Length(ret, 2); /* MJD */
2920             break;
2921         case 'X':
2922             ret = reg_node(pRExC_state, CLUMP);
2923             *flagp |= HASWIDTH;
2924             nextchar(pRExC_state);
2925             Set_Node_Length(ret, 2); /* MJD */
2926             break;
2927         case 'w':
2928             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2929             *flagp |= HASWIDTH|SIMPLE;
2930             nextchar(pRExC_state);
2931             Set_Node_Length(ret, 2); /* MJD */
2932             break;
2933         case 'W':
2934             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2935             *flagp |= HASWIDTH|SIMPLE;
2936             nextchar(pRExC_state);
2937             Set_Node_Length(ret, 2); /* MJD */
2938             break;
2939         case 'b':
2940             RExC_seen_zerolen++;
2941             RExC_seen |= REG_SEEN_LOOKBEHIND;
2942             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2943             *flagp |= SIMPLE;
2944             nextchar(pRExC_state);
2945             Set_Node_Length(ret, 2); /* MJD */
2946             break;
2947         case 'B':
2948             RExC_seen_zerolen++;
2949             RExC_seen |= REG_SEEN_LOOKBEHIND;
2950             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2951             *flagp |= SIMPLE;
2952             nextchar(pRExC_state);
2953             Set_Node_Length(ret, 2); /* MJD */
2954             break;
2955         case 's':
2956             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2957             *flagp |= HASWIDTH|SIMPLE;
2958             nextchar(pRExC_state);
2959             Set_Node_Length(ret, 2); /* MJD */
2960             break;
2961         case 'S':
2962             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2963             *flagp |= HASWIDTH|SIMPLE;
2964             nextchar(pRExC_state);
2965             Set_Node_Length(ret, 2); /* MJD */
2966             break;
2967         case 'd':
2968             ret = reg_node(pRExC_state, DIGIT);
2969             *flagp |= HASWIDTH|SIMPLE;
2970             nextchar(pRExC_state);
2971             Set_Node_Length(ret, 2); /* MJD */
2972             break;
2973         case 'D':
2974             ret = reg_node(pRExC_state, NDIGIT);
2975             *flagp |= HASWIDTH|SIMPLE;
2976             nextchar(pRExC_state);
2977             Set_Node_Length(ret, 2); /* MJD */
2978             break;
2979         case 'p':
2980         case 'P':
2981             {   
2982                 char* oldregxend = RExC_end;
2983                 char* parse_start = RExC_parse - 2;
2984
2985                 if (RExC_parse[1] == '{') {
2986                   /* a lovely hack--pretend we saw [\pX] instead */
2987                     RExC_end = strchr(RExC_parse, '}');
2988                     if (!RExC_end) {
2989                         U8 c = (U8)*RExC_parse;
2990                         RExC_parse += 2;
2991                         RExC_end = oldregxend;
2992                         vFAIL2("Missing right brace on \\%c{}", c);
2993                     }
2994                     RExC_end++;
2995                 }
2996                 else {
2997                     RExC_end = RExC_parse + 2;
2998                     if (RExC_end > oldregxend)
2999                         RExC_end = oldregxend;
3000                 }
3001                 RExC_parse--;
3002
3003                 ret = regclass(pRExC_state);
3004
3005                 RExC_end = oldregxend;
3006                 RExC_parse--;
3007
3008                 Set_Node_Offset(ret, parse_start + 2);
3009                 Set_Node_Cur_Length(ret);
3010                 nextchar(pRExC_state);
3011                 *flagp |= HASWIDTH|SIMPLE;
3012             }
3013             break;
3014         case 'n':
3015         case 'r':
3016         case 't':
3017         case 'f':
3018         case 'e':
3019         case 'a':
3020         case 'x':
3021         case 'c':
3022         case '0':
3023             goto defchar;
3024         case '1': case '2': case '3': case '4':
3025         case '5': case '6': case '7': case '8': case '9':
3026             {
3027                 I32 num = atoi(RExC_parse);
3028
3029                 if (num > 9 && num >= RExC_npar)
3030                     goto defchar;
3031                 else {
3032                     char * parse_start = RExC_parse - 1; /* MJD */
3033                     while (isDIGIT(*RExC_parse))
3034                         RExC_parse++;
3035
3036                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3037                         vFAIL("Reference to nonexistent group");
3038                     RExC_sawback = 1;
3039                     ret = reganode(pRExC_state,
3040                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3041                                    num);
3042                     *flagp |= HASWIDTH;
3043                     
3044                     /* override incorrect value set in reganode MJD */
3045                     Set_Node_Offset(ret, parse_start+1); 
3046                     Set_Node_Cur_Length(ret); /* MJD */
3047                     RExC_parse--;
3048                     nextchar(pRExC_state);
3049                 }
3050             }
3051             break;
3052         case '\0':
3053             if (RExC_parse >= RExC_end)
3054                 FAIL("Trailing \\");
3055             /* FALL THROUGH */
3056         default:
3057             /* Do not generate `unrecognized' warnings here, we fall
3058                back into the quick-grab loop below */
3059             goto defchar;
3060         }
3061         break;
3062
3063     case '#':
3064         if (RExC_flags & PMf_EXTENDED) {
3065             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3066             if (RExC_parse < RExC_end)
3067                 goto tryagain;
3068         }
3069         /* FALL THROUGH */
3070
3071     default: {
3072             register STRLEN len;
3073             register UV ender;
3074             register char *p;
3075             char *oldp, *s;
3076             STRLEN numlen;
3077             STRLEN foldlen;
3078             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3079
3080             parse_start = RExC_parse - 1;
3081
3082             RExC_parse++;
3083
3084         defchar:
3085             ender = 0;
3086             ret = reg_node(pRExC_state,
3087                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3088             s = STRING(ret);
3089             for (len = 0, p = RExC_parse - 1;
3090               len < 127 && p < RExC_end;
3091               len++)
3092             {
3093                 oldp = p;
3094
3095                 if (RExC_flags & PMf_EXTENDED)
3096                     p = regwhite(p, RExC_end);
3097                 switch (*p) {
3098                 case '^':
3099                 case '$':
3100                 case '.':
3101                 case '[':
3102                 case '(':
3103                 case ')':
3104                 case '|':
3105                     goto loopdone;
3106                 case '\\':
3107                     switch (*++p) {
3108                     case 'A':
3109                     case 'C':
3110                     case 'X':
3111                     case 'G':
3112                     case 'Z':
3113                     case 'z':
3114                     case 'w':
3115                     case 'W':
3116                     case 'b':
3117                     case 'B':
3118                     case 's':
3119                     case 'S':
3120                     case 'd':
3121                     case 'D':
3122                     case 'p':
3123                     case 'P':
3124                         --p;
3125                         goto loopdone;
3126                     case 'n':
3127                         ender = '\n';
3128                         p++;
3129                         break;
3130                     case 'r':
3131                         ender = '\r';
3132                         p++;
3133                         break;
3134                     case 't':
3135                         ender = '\t';
3136                         p++;
3137                         break;
3138                     case 'f':
3139                         ender = '\f';
3140                         p++;
3141                         break;
3142                     case 'e':
3143                           ender = ASCII_TO_NATIVE('\033');
3144                         p++;
3145                         break;
3146                     case 'a':
3147                           ender = ASCII_TO_NATIVE('\007');
3148                         p++;
3149                         break;
3150                     case 'x':
3151                         if (*++p == '{') {
3152                             char* e = strchr(p, '}');
3153         
3154                             if (!e) {
3155                                 RExC_parse = p + 1;
3156                                 vFAIL("Missing right brace on \\x{}");
3157                             }
3158                             else {
3159                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3160                                     | PERL_SCAN_DISALLOW_PREFIX;
3161                                 numlen = e - p - 1;
3162                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3163                                 if (ender > 0xff)
3164                                     RExC_utf8 = 1;
3165                                 /* numlen is generous */
3166                                 if (numlen + len >= 127) {
3167                                     p--;
3168                                     goto loopdone;
3169                                 }
3170                                 p = e + 1;
3171                             }
3172                         }
3173                         else {
3174                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3175                             numlen = 2;
3176                             ender = grok_hex(p, &numlen, &flags, NULL);
3177                             p += numlen;
3178                         }
3179                         break;
3180                     case 'c':
3181                         p++;
3182                         ender = UCHARAT(p++);
3183                         ender = toCTRL(ender);
3184                         break;
3185                     case '0': case '1': case '2': case '3':case '4':
3186                     case '5': case '6': case '7': case '8':case '9':
3187                         if (*p == '0' ||
3188                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3189                             I32 flags = 0;
3190                             numlen = 3;
3191                             ender = grok_oct(p, &numlen, &flags, NULL);
3192                             p += numlen;
3193                         }
3194                         else {
3195                             --p;
3196                             goto loopdone;
3197                         }
3198                         break;
3199                     case '\0':
3200                         if (p >= RExC_end)
3201                             FAIL("Trailing \\");
3202                         /* FALL THROUGH */
3203                     default:
3204                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3205                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3206                         goto normal_default;
3207                     }
3208                     break;
3209                 default:
3210                   normal_default:
3211                     if (UTF8_IS_START(*p) && UTF) {
3212                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3213                                                &numlen, 0);
3214                         p += numlen;
3215                     }
3216                     else
3217                         ender = *p++;
3218                     break;
3219                 }
3220                 if (RExC_flags & PMf_EXTENDED)
3221                     p = regwhite(p, RExC_end);
3222                 if (UTF && FOLD) {
3223                     /* Prime the casefolded buffer. */
3224                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3225                 }
3226                 if (ISMULT2(p)) { /* Back off on ?+*. */
3227                     if (len)
3228                         p = oldp;
3229                     else if (UTF) {
3230                          STRLEN unilen;
3231
3232                          if (FOLD) {
3233                               /* Emit all the Unicode characters. */
3234                               for (foldbuf = tmpbuf;
3235                                    foldlen;
3236                                    foldlen -= numlen) {
3237                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3238                                    if (numlen > 0) {
3239                                         reguni(pRExC_state, ender, s, &unilen);
3240                                         s       += unilen;
3241                                         len     += unilen;
3242                                         /* In EBCDIC the numlen
3243                                          * and unilen can differ. */
3244                                         foldbuf += numlen;
3245                                         if (numlen >= foldlen)
3246                                              break;
3247                                    }
3248                                    else
3249                                         break; /* "Can't happen." */
3250                               }
3251                          }
3252                          else {
3253                               reguni(pRExC_state, ender, s, &unilen);
3254                               if (unilen > 0) {
3255                                    s   += unilen;
3256                                    len += unilen;
3257                               }
3258                          }
3259                     }
3260                     else {
3261                         len++;
3262                         REGC((char)ender, s++);
3263                     }
3264                     break;
3265                 }
3266                 if (UTF) {
3267                      STRLEN unilen;
3268
3269                      if (FOLD) {
3270                           /* Emit all the Unicode characters. */
3271                           for (foldbuf = tmpbuf;
3272                                foldlen;
3273                                foldlen -= numlen) {
3274                                ender = utf8_to_uvchr(foldbuf, &numlen);
3275                                if (numlen > 0) {
3276                                     reguni(pRExC_state, ender, s, &unilen);
3277                                     len     += unilen;
3278                                     s       += unilen;
3279                                     /* In EBCDIC the numlen
3280                                      * and unilen can differ. */
3281                                     foldbuf += numlen;
3282                                     if (numlen >= foldlen)
3283                                          break;
3284                                }
3285                                else
3286                                     break;
3287                           }
3288                      }
3289                      else {
3290                           reguni(pRExC_state, ender, s, &unilen);
3291                           if (unilen > 0) {
3292                                s   += unilen;
3293                                len += unilen;
3294                           }
3295                      }
3296                      len--;
3297                 }
3298                 else
3299                     REGC((char)ender, s++);
3300             }
3301         loopdone:
3302             RExC_parse = p - 1;
3303             Set_Node_Cur_Length(ret); /* MJD */
3304             nextchar(pRExC_state);
3305             {
3306                 /* len is STRLEN which is unsigned, need to copy to signed */
3307                 IV iv = len;
3308                 if (iv < 0)
3309                     vFAIL("Internal disaster");
3310             }
3311             if (len > 0)
3312                 *flagp |= HASWIDTH;
3313             if (len == 1)
3314                 *flagp |= SIMPLE;
3315             if (!SIZE_ONLY)
3316                 STR_LEN(ret) = len;
3317             if (SIZE_ONLY)
3318                 RExC_size += STR_SZ(len);
3319             else
3320                 RExC_emit += STR_SZ(len);
3321         }
3322         break;
3323     }
3324
3325     /* If the encoding pragma is in effect recode the text of
3326      * any EXACT-kind nodes. */
3327     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3328         STRLEN oldlen = STR_LEN(ret);
3329         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3330
3331         if (RExC_utf8)
3332             SvUTF8_on(sv);
3333         if (sv_utf8_downgrade(sv, TRUE)) {
3334             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3335             STRLEN newlen = SvCUR(sv);
3336
3337             if (SvUTF8(sv))
3338                 RExC_utf8 = 1;
3339             if (!SIZE_ONLY) {
3340                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3341                                       (int)oldlen, STRING(ret),
3342                                       (int)newlen, s));
3343                 Copy(s, STRING(ret), newlen, char);
3344                 STR_LEN(ret) += newlen - oldlen;
3345                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3346             } else
3347                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3348         }
3349     }
3350
3351     return(ret);
3352 }
3353
3354 STATIC char *
3355 S_regwhite(pTHX_ char *p, char *e)
3356 {
3357     while (p < e) {
3358         if (isSPACE(*p))
3359             ++p;
3360         else if (*p == '#') {
3361             do {
3362                 p++;
3363             } while (p < e && *p != '\n');
3364         }
3365         else
3366             break;
3367     }
3368     return p;
3369 }
3370
3371 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3372    Character classes ([:foo:]) can also be negated ([:^foo:]).
3373    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3374    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3375    but trigger failures because they are currently unimplemented. */
3376
3377 #define POSIXCC_DONE(c)   ((c) == ':')
3378 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3379 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3380
3381 STATIC I32
3382 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3383 {
3384     char *posixcc = 0;
3385     I32 namedclass = OOB_NAMEDCLASS;
3386
3387     if (value == '[' && RExC_parse + 1 < RExC_end &&
3388         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3389         POSIXCC(UCHARAT(RExC_parse))) {
3390         char  c = UCHARAT(RExC_parse);
3391         char* s = RExC_parse++;
3392         
3393         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3394             RExC_parse++;
3395         if (RExC_parse == RExC_end)
3396             /* Grandfather lone [:, [=, [. */
3397             RExC_parse = s;
3398         else {
3399             char* t = RExC_parse++; /* skip over the c */
3400
3401             if (UCHARAT(RExC_parse) == ']') {
3402                 RExC_parse++; /* skip over the ending ] */
3403                 posixcc = s + 1;
3404                 if (*s == ':') {
3405                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3406                     I32 skip = 5; /* the most common skip */
3407
3408                     switch (*posixcc) {
3409                     case 'a':
3410                         if (strnEQ(posixcc, "alnum", 5))
3411                             namedclass =
3412                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3413                         else if (strnEQ(posixcc, "alpha", 5))
3414                             namedclass =
3415                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3416                         else if (strnEQ(posixcc, "ascii", 5))
3417                             namedclass =
3418                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3419                         break;
3420                     case 'b':
3421                         if (strnEQ(posixcc, "blank", 5))
3422                             namedclass =
3423                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3424                         break;
3425                     case 'c':
3426                         if (strnEQ(posixcc, "cntrl", 5))
3427                             namedclass =
3428                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3429                         break;
3430                     case 'd':
3431                         if (strnEQ(posixcc, "digit", 5))
3432                             namedclass =
3433                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3434                         break;
3435                     case 'g':
3436                         if (strnEQ(posixcc, "graph", 5))
3437                             namedclass =
3438                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3439                         break;
3440                     case 'l':
3441                         if (strnEQ(posixcc, "lower", 5))
3442                             namedclass =
3443                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3444                         break;
3445                     case 'p':
3446                         if (strnEQ(posixcc, "print", 5))
3447                             namedclass =
3448                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3449                         else if (strnEQ(posixcc, "punct", 5))
3450                             namedclass =
3451                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3452                         break;
3453                     case 's':
3454                         if (strnEQ(posixcc, "space", 5))
3455                             namedclass =
3456                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3457                         break;
3458                     case 'u':
3459                         if (strnEQ(posixcc, "upper", 5))
3460                             namedclass =
3461                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3462                         break;
3463                     case 'w': /* this is not POSIX, this is the Perl \w */
3464                         if (strnEQ(posixcc, "word", 4)) {
3465                             namedclass =
3466                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3467                             skip = 4;
3468                         }
3469                         break;
3470                     case 'x':
3471                         if (strnEQ(posixcc, "xdigit", 6)) {
3472                             namedclass =
3473                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3474                             skip = 6;
3475                         }
3476                         break;
3477                     }
3478                     if (namedclass == OOB_NAMEDCLASS ||
3479                         posixcc[skip] != ':' ||
3480                         posixcc[skip+1] != ']')
3481                     {
3482                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3483                                       t - s - 1, s + 1);
3484                     }
3485                 } else if (!SIZE_ONLY) {
3486                     /* [[=foo=]] and [[.foo.]] are still future. */
3487
3488                     /* adjust RExC_parse so the warning shows after
3489                        the class closes */
3490                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3491                         RExC_parse++;
3492                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3493                 }
3494             } else {
3495                 /* Maternal grandfather:
3496                  * "[:" ending in ":" but not in ":]" */
3497                 RExC_parse = s;
3498             }
3499         }
3500     }
3501
3502     return namedclass;
3503 }
3504
3505 STATIC void
3506 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3507 {
3508     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3509         char *s = RExC_parse;
3510         char  c = *s++;
3511
3512         while(*s && isALNUM(*s))
3513             s++;
3514         if (*s && c == *s && s[1] == ']') {
3515             if (ckWARN(WARN_REGEXP))
3516                 vWARN3(s+2,
3517                         "POSIX syntax [%c %c] belongs inside character classes",
3518                         c, c);
3519
3520             /* [[=foo=]] and [[.foo.]] are still future. */
3521             if (POSIXCC_NOTYET(c)) {
3522                 /* adjust RExC_parse so the error shows after
3523                    the class closes */
3524                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3525                     ;
3526                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3527             }
3528         }
3529     }
3530 }
3531
3532 STATIC regnode *
3533 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3534 {
3535     register UV value;
3536     register UV nextvalue;
3537     register IV prevvalue = OOB_UNICODE;
3538     register IV range = 0;
3539     register regnode *ret;
3540     STRLEN numlen;
3541     IV namedclass;
3542     char *rangebegin = 0;
3543     bool need_class = 0;
3544     SV *listsv = Nullsv;
3545     register char *e;
3546     UV n;
3547     bool optimize_invert   = TRUE;
3548     AV* unicode_alternate  = 0;
3549 #ifdef EBCDIC
3550     UV literal_endpoint = 0;
3551 #endif
3552
3553     ret = reganode(pRExC_state, ANYOF, 0);
3554
3555     if (!SIZE_ONLY)
3556         ANYOF_FLAGS(ret) = 0;
3557
3558     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3559         RExC_naughty++;
3560         RExC_parse++;
3561         if (!SIZE_ONLY)
3562             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3563     }
3564
3565     if (SIZE_ONLY)
3566         RExC_size += ANYOF_SKIP;
3567     else {
3568         RExC_emit += ANYOF_SKIP;
3569         if (FOLD)
3570             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3571         if (LOC)
3572             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3573         ANYOF_BITMAP_ZERO(ret);
3574         listsv = newSVpvn("# comment\n", 10);
3575     }
3576
3577     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3578
3579     if (!SIZE_ONLY && POSIXCC(nextvalue))
3580         checkposixcc(pRExC_state);
3581
3582     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3583     if (UCHARAT(RExC_parse) == ']')
3584         goto charclassloop;
3585
3586     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3587
3588     charclassloop:
3589
3590         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3591
3592         if (!range)
3593             rangebegin = RExC_parse;
3594         if (UTF) {
3595             value = utf8n_to_uvchr((U8*)RExC_parse,
3596                                    RExC_end - RExC_parse,
3597                                    &numlen, 0);
3598             RExC_parse += numlen;
3599         }
3600         else
3601             value = UCHARAT(RExC_parse++);
3602         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3603         if (value == '[' && POSIXCC(nextvalue))
3604             namedclass = regpposixcc(pRExC_state, value);
3605         else if (value == '\\') {
3606             if (UTF) {
3607                 value = utf8n_to_uvchr((U8*)RExC_parse,
3608                                    RExC_end - RExC_parse,
3609                                    &numlen, 0);
3610                 RExC_parse += numlen;
3611             }
3612             else
3613                 value = UCHARAT(RExC_parse++);
3614             /* Some compilers cannot handle switching on 64-bit integer
3615              * values, therefore value cannot be an UV.  Yes, this will
3616              * be a problem later if we want switch on Unicode.
3617              * A similar issue a little bit later when switching on
3618              * namedclass. --jhi */
3619             switch ((I32)value) {
3620             case 'w':   namedclass = ANYOF_ALNUM;       break;
3621             case 'W':   namedclass = ANYOF_NALNUM;      break;
3622             case 's':   namedclass = ANYOF_SPACE;       break;
3623             case 'S':   namedclass = ANYOF_NSPACE;      break;
3624             case 'd':   namedclass = ANYOF_DIGIT;       break;
3625             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3626             case 'p':
3627             case 'P':
3628                 if (RExC_parse >= RExC_end)
3629                     vFAIL2("Empty \\%c{}", (U8)value);
3630                 if (*RExC_parse == '{') {
3631                     U8 c = (U8)value;
3632                     e = strchr(RExC_parse++, '}');
3633                     if (!e)
3634                         vFAIL2("Missing right brace on \\%c{}", c);
3635                     while (isSPACE(UCHARAT(RExC_parse)))
3636                         RExC_parse++;
3637                     if (e == RExC_parse)
3638                         vFAIL2("Empty \\%c{}", c);
3639                     n = e - RExC_parse;
3640                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3641                         n--;
3642                 }
3643                 else {
3644                     e = RExC_parse;
3645                     n = 1;
3646                 }
3647                 if (!SIZE_ONLY) {
3648                     if (UCHARAT(RExC_parse) == '^') {
3649                          RExC_parse++;
3650                          n--;
3651                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3652                          while (isSPACE(UCHARAT(RExC_parse))) {
3653                               RExC_parse++;
3654                               n--;
3655                          }
3656                     }
3657                     if (value == 'p')
3658                          Perl_sv_catpvf(aTHX_ listsv,
3659                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3660                     else
3661                          Perl_sv_catpvf(aTHX_ listsv,
3662                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3663                 }
3664                 RExC_parse = e + 1;
3665                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3666                 continue;
3667             case 'n':   value = '\n';                   break;
3668             case 'r':   value = '\r';                   break;
3669             case 't':   value = '\t';                   break;
3670             case 'f':   value = '\f';                   break;
3671             case 'b':   value = '\b';                   break;
3672             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3673             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3674             case 'x':
3675                 if (*RExC_parse == '{') {
3676                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3677                         | PERL_SCAN_DISALLOW_PREFIX;
3678                     e = strchr(RExC_parse++, '}');
3679                     if (!e)
3680                         vFAIL("Missing right brace on \\x{}");
3681
3682                     numlen = e - RExC_parse;
3683                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3684                     RExC_parse = e + 1;
3685                 }
3686                 else {
3687                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3688                     numlen = 2;
3689                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3690                     RExC_parse += numlen;
3691                 }
3692                 break;
3693             case 'c':
3694                 value = UCHARAT(RExC_parse++);
3695                 value = toCTRL(value);
3696                 break;
3697             case '0': case '1': case '2': case '3': case '4':
3698             case '5': case '6': case '7': case '8': case '9':
3699             {
3700                 I32 flags = 0;
3701                 numlen = 3;
3702                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3703                 RExC_parse += numlen;
3704                 break;
3705             }
3706             default:
3707                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3708                     vWARN2(RExC_parse,
3709                            "Unrecognized escape \\%c in character class passed through",
3710                            (int)value);
3711                 break;
3712             }
3713         } /* end of \blah */
3714 #ifdef EBCDIC
3715         else
3716             literal_endpoint++;
3717 #endif
3718
3719         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3720
3721             if (!SIZE_ONLY && !need_class)
3722                 ANYOF_CLASS_ZERO(ret);
3723
3724             need_class = 1;
3725
3726             /* a bad range like a-\d, a-[:digit:] ? */
3727             if (range) {
3728                 if (!SIZE_ONLY) {
3729                     if (ckWARN(WARN_REGEXP))
3730                         vWARN4(RExC_parse,
3731                                "False [] range \"%*.*s\"",
3732                                RExC_parse - rangebegin,
3733                                RExC_parse - rangebegin,
3734                                rangebegin);
3735                     if (prevvalue < 256) {
3736                         ANYOF_BITMAP_SET(ret, prevvalue);
3737                         ANYOF_BITMAP_SET(ret, '-');
3738                     }
3739                     else {
3740                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3741                         Perl_sv_catpvf(aTHX_ listsv,
3742                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3743                     }
3744                 }
3745
3746                 range = 0; /* this was not a true range */
3747             }
3748
3749             if (!SIZE_ONLY) {
3750                 if (namedclass > OOB_NAMEDCLASS)
3751                     optimize_invert = FALSE;
3752                 /* Possible truncation here but in some 64-bit environments
3753                  * the compiler gets heartburn about switch on 64-bit values.
3754                  * A similar issue a little earlier when switching on value.
3755                  * --jhi */
3756                 switch ((I32)namedclass) {
3757                 case ANYOF_ALNUM:
3758                     if (LOC)
3759                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3760                     else {
3761                         for (value = 0; value < 256; value++)
3762                             if (isALNUM(value))
3763                                 ANYOF_BITMAP_SET(ret, value);
3764                     }
3765                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3766                     break;
3767                 case ANYOF_NALNUM:
3768                     if (LOC)
3769                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3770                     else {
3771                         for (value = 0; value < 256; value++)
3772                             if (!isALNUM(value))
3773                                 ANYOF_BITMAP_SET(ret, value);
3774                     }
3775                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3776                     break;
3777                 case ANYOF_ALNUMC:
3778                     if (LOC)
3779                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3780                     else {
3781                         for (value = 0; value < 256; value++)
3782                             if (isALNUMC(value))
3783                                 ANYOF_BITMAP_SET(ret, value);
3784                     }
3785                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3786                     break;
3787                 case ANYOF_NALNUMC:
3788                     if (LOC)
3789                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3790                     else {
3791                         for (value = 0; value < 256; value++)
3792                             if (!isALNUMC(value))
3793                                 ANYOF_BITMAP_SET(ret, value);
3794                     }
3795                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3796                     break;
3797                 case ANYOF_ALPHA:
3798                     if (LOC)
3799                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3800                     else {
3801                         for (value = 0; value < 256; value++)
3802                             if (isALPHA(value))
3803                                 ANYOF_BITMAP_SET(ret, value);
3804                     }
3805                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3806                     break;
3807                 case ANYOF_NALPHA:
3808                     if (LOC)
3809                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3810                     else {
3811                         for (value = 0; value < 256; value++)
3812                             if (!isALPHA(value))
3813                                 ANYOF_BITMAP_SET(ret, value);
3814                     }
3815                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3816                     break;
3817                 case ANYOF_ASCII:
3818                     if (LOC)
3819                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3820                     else {
3821 #ifndef EBCDIC
3822                         for (value = 0; value < 128; value++)
3823                             ANYOF_BITMAP_SET(ret, value);
3824 #else  /* EBCDIC */
3825                         for (value = 0; value < 256; value++) {
3826                             if (isASCII(value))
3827                                 ANYOF_BITMAP_SET(ret, value);
3828                         }
3829 #endif /* EBCDIC */
3830                     }
3831                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3832                     break;
3833                 case ANYOF_NASCII:
3834                     if (LOC)
3835                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3836                     else {
3837 #ifndef EBCDIC
3838                         for (value = 128; value < 256; value++)
3839                             ANYOF_BITMAP_SET(ret, value);
3840 #else  /* EBCDIC */
3841                         for (value = 0; value < 256; value++) {
3842                             if (!isASCII(value))
3843                                 ANYOF_BITMAP_SET(ret, value);
3844                         }
3845 #endif /* EBCDIC */
3846                     }
3847                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3848                     break;
3849                 case ANYOF_BLANK:
3850                     if (LOC)
3851                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3852                     else {
3853                         for (value = 0; value < 256; value++)
3854                             if (isBLANK(value))
3855                                 ANYOF_BITMAP_SET(ret, value);
3856                     }
3857                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3858                     break;
3859                 case ANYOF_NBLANK:
3860                     if (LOC)
3861                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3862                     else {
3863                         for (value = 0; value < 256; value++)
3864                             if (!isBLANK(value))
3865                                 ANYOF_BITMAP_SET(ret, value);
3866                     }
3867                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3868                     break;
3869                 case ANYOF_CNTRL:
3870                     if (LOC)
3871                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3872                     else {
3873                         for (value = 0; value < 256; value++)
3874                             if (isCNTRL(value))
3875                                 ANYOF_BITMAP_SET(ret, value);
3876                     }
3877                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3878                     break;
3879                 case ANYOF_NCNTRL:
3880                     if (LOC)
3881                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3882                     else {
3883                         for (value = 0; value < 256; value++)
3884                             if (!isCNTRL(value))
3885                                 ANYOF_BITMAP_SET(ret, value);
3886                     }
3887                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3888                     break;
3889                 case ANYOF_DIGIT:
3890                     if (LOC)
3891                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3892                     else {
3893                         /* consecutive digits assumed */
3894                         for (value = '0'; value <= '9'; value++)
3895                             ANYOF_BITMAP_SET(ret, value);
3896                     }
3897                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3898                     break;
3899                 case ANYOF_NDIGIT:
3900                     if (LOC)
3901                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3902                     else {
3903                         /* consecutive digits assumed */
3904                         for (value = 0; value < '0'; value++)
3905                             ANYOF_BITMAP_SET(ret, value);
3906                         for (value = '9' + 1; value < 256; value++)
3907                             ANYOF_BITMAP_SET(ret, value);
3908                     }
3909                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3910                     break;
3911                 case ANYOF_GRAPH:
3912                     if (LOC)
3913                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3914                     else {
3915                         for (value = 0; value < 256; value++)
3916                             if (isGRAPH(value))
3917                                 ANYOF_BITMAP_SET(ret, value);
3918                     }
3919                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3920                     break;
3921                 case ANYOF_NGRAPH:
3922                     if (LOC)
3923                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3924                     else {
3925                         for (value = 0; value < 256; value++)
3926                             if (!isGRAPH(value))
3927                                 ANYOF_BITMAP_SET(ret, value);
3928                     }
3929                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3930                     break;
3931                 case ANYOF_LOWER:
3932                     if (LOC)
3933                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3934                     else {
3935                         for (value = 0; value < 256; value++)
3936                             if (isLOWER(value))
3937                                 ANYOF_BITMAP_SET(ret, value);
3938                     }
3939                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3940                     break;
3941                 case ANYOF_NLOWER:
3942                     if (LOC)
3943                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3944                     else {
3945                         for (value = 0; value < 256; value++)
3946                             if (!isLOWER(value))
3947                                 ANYOF_BITMAP_SET(ret, value);
3948                     }
3949                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3950                     break;
3951                 case ANYOF_PRINT:
3952                     if (LOC)
3953                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3954                     else {
3955                         for (value = 0; value < 256; value++)
3956                             if (isPRINT(value))
3957                                 ANYOF_BITMAP_SET(ret, value);
3958                     }
3959                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3960                     break;
3961                 case ANYOF_NPRINT:
3962                     if (LOC)
3963                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3964                     else {
3965                         for (value = 0; value < 256; value++)
3966                             if (!isPRINT(value))
3967                                 ANYOF_BITMAP_SET(ret, value);
3968                     }
3969                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3970                     break;
3971                 case ANYOF_PSXSPC:
3972                     if (LOC)
3973                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3974                     else {
3975                         for (value = 0; value < 256; value++)
3976                             if (isPSXSPC(value))
3977                                 ANYOF_BITMAP_SET(ret, value);
3978                     }
3979                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3980                     break;
3981                 case ANYOF_NPSXSPC:
3982                     if (LOC)
3983                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3984                     else {
3985                         for (value = 0; value < 256; value++)
3986                             if (!isPSXSPC(value))
3987                                 ANYOF_BITMAP_SET(ret, value);
3988                     }
3989                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3990                     break;
3991                 case ANYOF_PUNCT:
3992                     if (LOC)
3993                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3994                     else {
3995                         for (value = 0; value < 256; value++)
3996                             if (isPUNCT(value))
3997                                 ANYOF_BITMAP_SET(ret, value);
3998                     }
3999                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4000                     break;
4001                 case ANYOF_NPUNCT:
4002                     if (LOC)
4003                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4004                     else {
4005                         for (value = 0; value < 256; value++)
4006                             if (!isPUNCT(value))
4007                                 ANYOF_BITMAP_SET(ret, value);
4008                     }
4009                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4010                     break;
4011                 case ANYOF_SPACE:
4012                     if (LOC)
4013                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4014                     else {
4015                         for (value = 0; value < 256; value++)
4016                             if (isSPACE(value))
4017                                 ANYOF_BITMAP_SET(ret, value);
4018                     }
4019                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4020                     break;
4021                 case ANYOF_NSPACE:
4022                     if (LOC)
4023                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4024                     else {
4025                         for (value = 0; value < 256; value++)
4026                             if (!isSPACE(value))
4027                                 ANYOF_BITMAP_SET(ret, value);
4028                     }
4029                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4030                     break;
4031                 case ANYOF_UPPER:
4032                     if (LOC)
4033                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4034                     else {
4035                         for (value = 0; value < 256; value++)
4036                             if (isUPPER(value))
4037                                 ANYOF_BITMAP_SET(ret, value);
4038                     }
4039                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4040                     break;
4041                 case ANYOF_NUPPER:
4042                     if (LOC)
4043                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4044                     else {
4045                         for (value = 0; value < 256; value++)
4046                             if (!isUPPER(value))
4047                                 ANYOF_BITMAP_SET(ret, value);
4048                     }
4049                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4050                     break;
4051                 case ANYOF_XDIGIT:
4052                     if (LOC)
4053                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4054                     else {
4055                         for (value = 0; value < 256; value++)
4056                             if (isXDIGIT(value))
4057                                 ANYOF_BITMAP_SET(ret, value);
4058                     }
4059                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4060                     break;
4061                 case ANYOF_NXDIGIT:
4062                     if (LOC)
4063                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4064                     else {
4065                         for (value = 0; value < 256; value++)
4066                             if (!isXDIGIT(value))
4067                                 ANYOF_BITMAP_SET(ret, value);
4068                     }
4069                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4070                     break;
4071                 default:
4072                     vFAIL("Invalid [::] class");
4073                     break;
4074                 }
4075                 if (LOC)
4076                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4077                 continue;
4078             }
4079         } /* end of namedclass \blah */
4080
4081         if (range) {
4082             if (prevvalue > (IV)value) /* b-a */ {
4083                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4084                               RExC_parse - rangebegin,
4085                               RExC_parse - rangebegin,
4086                               rangebegin);
4087                 range = 0; /* not a valid range */
4088             }
4089         }
4090         else {
4091             prevvalue = value; /* save the beginning of the range */
4092             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4093                 RExC_parse[1] != ']') {
4094                 RExC_parse++;
4095
4096                 /* a bad range like \w-, [:word:]- ? */
4097                 if (namedclass > OOB_NAMEDCLASS) {
4098                     if (ckWARN(WARN_REGEXP))
4099                         vWARN4(RExC_parse,
4100                                "False [] range \"%*.*s\"",
4101                                RExC_parse - rangebegin,
4102                                RExC_parse - rangebegin,
4103                                rangebegin);
4104                     if (!SIZE_ONLY)
4105                         ANYOF_BITMAP_SET(ret, '-');
4106                 } else
4107                     range = 1;  /* yeah, it's a range! */
4108                 continue;       /* but do it the next time */
4109             }
4110         }
4111
4112         /* now is the next time */
4113         if (!SIZE_ONLY) {
4114             IV i;
4115
4116             if (prevvalue < 256) {
4117                 IV ceilvalue = value < 256 ? value : 255;
4118
4119 #ifdef EBCDIC
4120                 /* In EBCDIC [\x89-\x91] should include
4121                  * the \x8e but [i-j] should not. */
4122                 if (literal_endpoint == 2 &&
4123                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4124                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4125                 {
4126                     if (isLOWER(prevvalue)) {
4127                         for (i = prevvalue; i <= ceilvalue; i++)
4128                             if (isLOWER(i))
4129                                 ANYOF_BITMAP_SET(ret, i);
4130                     } else {
4131                         for (i = prevvalue; i <= ceilvalue; i++)
4132                             if (isUPPER(i))
4133                                 ANYOF_BITMAP_SET(ret, i);
4134                     }
4135                 }
4136                 else
4137 #endif
4138                       for (i = prevvalue; i <= ceilvalue; i++)
4139                           ANYOF_BITMAP_SET(ret, i);
4140           }
4141           if (value > 255 || UTF) {
4142                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4143                 UV natvalue      = NATIVE_TO_UNI(value);
4144
4145                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4146                 if (prevnatvalue < natvalue) { /* what about > ? */
4147                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4148                                    prevnatvalue, natvalue);
4149                 }
4150                 else if (prevnatvalue == natvalue) {
4151                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4152                     if (FOLD) {
4153                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4154                          STRLEN foldlen;
4155                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4156
4157                          /* If folding and foldable and a single
4158                           * character, insert also the folded version
4159                           * to the charclass. */
4160                          if (f != value) {
4161                               if (foldlen == (STRLEN)UNISKIP(f))
4162                                   Perl_sv_catpvf(aTHX_ listsv,
4163                                                  "%04"UVxf"\n", f);
4164                               else {
4165                                   /* Any multicharacter foldings
4166                                    * require the following transform:
4167                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4168                                    * where E folds into "pq" and F folds
4169                                    * into "rst", all other characters
4170                                    * fold to single characters.  We save
4171                                    * away these multicharacter foldings,
4172                                    * to be later saved as part of the
4173                                    * additional "s" data. */
4174                                   SV *sv;
4175
4176                                   if (!unicode_alternate)
4177                                       unicode_alternate = newAV();
4178                                   sv = newSVpvn((char*)foldbuf, foldlen);
4179                                   SvUTF8_on(sv);
4180                                   av_push(unicode_alternate, sv);
4181                               }
4182                          }
4183
4184                          /* If folding and the value is one of the Greek
4185                           * sigmas insert a few more sigmas to make the
4186                           * folding rules of the sigmas to work right.
4187                           * Note that not all the possible combinations
4188                           * are handled here: some of them are handled
4189                           * by the standard folding rules, and some of
4190                           * them (literal or EXACTF cases) are handled
4191                           * during runtime in regexec.c:S_find_byclass(). */
4192                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4193                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4194                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4195                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4196                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4197                          }
4198                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4199                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4200                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4201                     }
4202                 }
4203             }
4204 #ifdef EBCDIC
4205             literal_endpoint = 0;
4206 #endif
4207         }
4208
4209         range = 0; /* this range (if it was one) is done now */
4210     }
4211
4212     if (need_class) {
4213         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4214         if (SIZE_ONLY)
4215             RExC_size += ANYOF_CLASS_ADD_SKIP;
4216         else
4217             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4218     }
4219
4220     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4221     if (!SIZE_ONLY &&
4222          /* If the only flag is folding (plus possibly inversion). */
4223         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4224        ) {
4225         for (value = 0; value < 256; ++value) {
4226             if (ANYOF_BITMAP_TEST(ret, value)) {
4227                 UV fold = PL_fold[value];
4228
4229                 if (fold != value)
4230                     ANYOF_BITMAP_SET(ret, fold);
4231             }
4232         }
4233         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4234     }
4235
4236     /* optimize inverted simple patterns (e.g. [^a-z]) */
4237     if (!SIZE_ONLY && optimize_invert &&
4238         /* If the only flag is inversion. */
4239         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4240         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4241             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4242         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4243     }
4244
4245     if (!SIZE_ONLY) {
4246         AV *av = newAV();
4247         SV *rv;
4248
4249         /* The 0th element stores the character class description
4250          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4251          * to initialize the appropriate swash (which gets stored in
4252          * the 1st element), and also useful for dumping the regnode.
4253          * The 2nd element stores the multicharacter foldings,
4254          * used later (regexec.c:S_reginclass()). */
4255         av_store(av, 0, listsv);
4256         av_store(av, 1, NULL);
4257         av_store(av, 2, (SV*)unicode_alternate);
4258         rv = newRV_noinc((SV*)av);
4259         n = add_data(pRExC_state, 1, "s");
4260         RExC_rx->data->data[n] = (void*)rv;
4261         ARG_SET(ret, n);
4262     }
4263
4264     return ret;
4265 }
4266
4267 STATIC char*
4268 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4269 {
4270     char* retval = RExC_parse++;
4271
4272     for (;;) {
4273         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4274                 RExC_parse[2] == '#') {
4275             while (*RExC_parse && *RExC_parse != ')')
4276                 RExC_parse++;
4277             RExC_parse++;
4278             continue;
4279         }
4280         if (RExC_flags & PMf_EXTENDED) {
4281             if (isSPACE(*RExC_parse)) {
4282                 RExC_parse++;
4283                 continue;
4284             }
4285             else if (*RExC_parse == '#') {
4286                 while (*RExC_parse && *RExC_parse != '\n')
4287                     RExC_parse++;
4288                 RExC_parse++;
4289                 continue;
4290             }
4291         }
4292         return retval;
4293     }
4294 }
4295
4296 /*
4297 - reg_node - emit a node
4298 */
4299 STATIC regnode *                        /* Location. */
4300 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4301 {
4302     register regnode *ret;
4303     register regnode *ptr;
4304
4305     ret = RExC_emit;
4306     if (SIZE_ONLY) {
4307         SIZE_ALIGN(RExC_size);
4308         RExC_size += 1;
4309         return(ret);
4310     }
4311
4312     NODE_ALIGN_FILL(ret);
4313     ptr = ret;
4314     FILL_ADVANCE_NODE(ptr, op);
4315     if (RExC_offsets) {         /* MJD */
4316         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4317               "reg_node", __LINE__, 
4318               reg_name[op],
4319               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4320               ? "Overwriting end of array!\n" : "OK",
4321               RExC_emit - RExC_emit_start,
4322               RExC_parse - RExC_start,
4323               RExC_offsets[0])); 
4324         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4325     }
4326             
4327     RExC_emit = ptr;
4328
4329     return(ret);
4330 }
4331
4332 /*
4333 - reganode - emit a node with an argument
4334 */
4335 STATIC regnode *                        /* Location. */
4336 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4337 {
4338     register regnode *ret;
4339     register regnode *ptr;
4340
4341     ret = RExC_emit;
4342     if (SIZE_ONLY) {
4343         SIZE_ALIGN(RExC_size);
4344         RExC_size += 2;
4345         return(ret);
4346     }
4347
4348     NODE_ALIGN_FILL(ret);
4349     ptr = ret;
4350     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4351     if (RExC_offsets) {         /* MJD */
4352         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4353               "reganode",
4354               __LINE__,
4355               reg_name[op],
4356               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4357               "Overwriting end of array!\n" : "OK",
4358               RExC_emit - RExC_emit_start,
4359               RExC_parse - RExC_start,
4360               RExC_offsets[0])); 
4361         Set_Cur_Node_Offset;
4362     }
4363             
4364     RExC_emit = ptr;
4365
4366     return(ret);
4367 }
4368
4369 /*
4370 - reguni - emit (if appropriate) a Unicode character
4371 */
4372 STATIC void
4373 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4374 {
4375     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4376 }
4377
4378 /*
4379 - reginsert - insert an operator in front of already-emitted operand
4380 *
4381 * Means relocating the operand.
4382 */
4383 STATIC void
4384 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4385 {
4386     register regnode *src;
4387     register regnode *dst;
4388     register regnode *place;
4389     register int offset = regarglen[(U8)op];
4390
4391 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4392
4393     if (SIZE_ONLY) {
4394         RExC_size += NODE_STEP_REGNODE + offset;
4395         return;
4396     }
4397
4398     src = RExC_emit;
4399     RExC_emit += NODE_STEP_REGNODE + offset;
4400     dst = RExC_emit;
4401     while (src > opnd) {
4402         StructCopy(--src, --dst, regnode);
4403         if (RExC_offsets) {     /* MJD 20010112 */
4404             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4405                   "reg_insert",
4406                   __LINE__,
4407                   reg_name[op],
4408                   dst - RExC_emit_start > RExC_offsets[0] 
4409                   ? "Overwriting end of array!\n" : "OK",
4410                   src - RExC_emit_start,
4411                   dst - RExC_emit_start,
4412                   RExC_offsets[0])); 
4413             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4414             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4415         }
4416     }
4417     
4418
4419     place = opnd;               /* Op node, where operand used to be. */
4420     if (RExC_offsets) {         /* MJD */
4421         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4422               "reginsert",
4423               __LINE__,
4424               reg_name[op],
4425               place - RExC_emit_start > RExC_offsets[0] 
4426               ? "Overwriting end of array!\n" : "OK",
4427               place - RExC_emit_start,
4428               RExC_parse - RExC_start,
4429               RExC_offsets[0])); 
4430         Set_Node_Offset(place, RExC_parse);
4431     }
4432     src = NEXTOPER(place);
4433     FILL_ADVANCE_NODE(place, op);
4434     Zero(src, offset, regnode);
4435 }
4436
4437 /*
4438 - regtail - set the next-pointer at the end of a node chain of p to val.
4439 */
4440 STATIC void
4441 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4442 {
4443     register regnode *scan;
4444     register regnode *temp;
4445
4446     if (SIZE_ONLY)
4447         return;
4448
4449     /* Find last node. */
4450     scan = p;
4451     for (;;) {
4452         temp = regnext(scan);
4453         if (temp == NULL)
4454             break;
4455         scan = temp;
4456     }
4457
4458     if (reg_off_by_arg[OP(scan)]) {
4459         ARG_SET(scan, val - scan);
4460     }
4461     else {
4462         NEXT_OFF(scan) = val - scan;
4463     }
4464 }
4465
4466 /*
4467 - regoptail - regtail on operand of first argument; nop if operandless
4468 */
4469 STATIC void
4470 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4471 {
4472     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4473     if (p == NULL || SIZE_ONLY)
4474         return;
4475     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4476         regtail(pRExC_state, NEXTOPER(p), val);
4477     }
4478     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4479         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4480     }
4481     else
4482         return;
4483 }
4484
4485 /*
4486  - regcurly - a little FSA that accepts {\d+,?\d*}
4487  */
4488 STATIC I32
4489 S_regcurly(pTHX_ register char *s)
4490 {
4491     if (*s++ != '{')
4492         return FALSE;
4493     if (!isDIGIT(*s))
4494         return FALSE;
4495     while (isDIGIT(*s))
4496         s++;
4497     if (*s == ',')
4498         s++;
4499     while (isDIGIT(*s))
4500         s++;
4501     if (*s != '}')
4502         return FALSE;
4503     return TRUE;
4504 }
4505
4506
4507 #ifdef DEBUGGING
4508
4509 STATIC regnode *
4510 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4511 {
4512     register U8 op = EXACT;     /* Arbitrary non-END op. */
4513     register regnode *next;
4514
4515     while (op != END && (!last || node < last)) {
4516         /* While that wasn't END last time... */
4517
4518         NODE_ALIGN(node);
4519         op = OP(node);
4520         if (op == CLOSE)
4521             l--;        
4522         next = regnext(node);
4523         /* Where, what. */
4524         if (OP(node) == OPTIMIZED)
4525             goto after_print;
4526         regprop(sv, node);
4527         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4528                       (int)(2*l + 1), "", SvPVX(sv));
4529         if (next == NULL)               /* Next ptr. */
4530             PerlIO_printf(Perl_debug_log, "(0)");
4531         else
4532             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4533         (void)PerlIO_putc(Perl_debug_log, '\n');
4534       after_print:
4535         if (PL_regkind[(U8)op] == BRANCHJ) {
4536             register regnode *nnode = (OP(next) == LONGJMP
4537                                        ? regnext(next)
4538                                        : next);
4539             if (last && nnode > last)
4540                 nnode = last;
4541             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4542         }
4543         else if (PL_regkind[(U8)op] == BRANCH) {
4544             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4545         }
4546         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4547             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4548                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4549         }
4550         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4551             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4552                              next, sv, l + 1);
4553         }
4554         else if ( op == PLUS || op == STAR) {
4555             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4556         }
4557         else if (op == ANYOF) {
4558             /* arglen 1 + class block */
4559             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4560                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4561             node = NEXTOPER(node);
4562         }
4563         else if (PL_regkind[(U8)op] == EXACT) {
4564             /* Literal string, where present. */
4565             node += NODE_SZ_STR(node) - 1;
4566             node = NEXTOPER(node);
4567         }
4568         else {
4569             node = NEXTOPER(node);
4570             node += regarglen[(U8)op];
4571         }
4572         if (op == CURLYX || op == OPEN)
4573             l++;
4574         else if (op == WHILEM)
4575             l--;
4576     }
4577     return node;
4578 }
4579
4580 #endif  /* DEBUGGING */
4581
4582 /*
4583  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4584  */
4585 void
4586 Perl_regdump(pTHX_ regexp *r)
4587 {
4588 #ifdef DEBUGGING
4589     SV *sv = sv_newmortal();
4590
4591     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4592
4593     /* Header fields of interest. */
4594     if (r->anchored_substr)
4595         PerlIO_printf(Perl_debug_log,
4596                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4597                       PL_colors[0],
4598                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4599                       SvPVX(r->anchored_substr),
4600                       PL_colors[1],
4601                       SvTAIL(r->anchored_substr) ? "$" : "",
4602                       (IV)r->anchored_offset);
4603     else if (r->anchored_utf8)
4604         PerlIO_printf(Perl_debug_log,
4605                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4606                       PL_colors[0],
4607                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4608                       SvPVX(r->anchored_utf8),
4609                       PL_colors[1],
4610                       SvTAIL(r->anchored_utf8) ? "$" : "",
4611                       (IV)r->anchored_offset);
4612     if (r->float_substr)
4613         PerlIO_printf(Perl_debug_log,
4614                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4615                       PL_colors[0],
4616                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4617                       SvPVX(r->float_substr),
4618                       PL_colors[1],
4619                       SvTAIL(r->float_substr) ? "$" : "",
4620                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4621     else if (r->float_utf8)
4622         PerlIO_printf(Perl_debug_log,
4623                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4624                       PL_colors[0],
4625                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4626                       SvPVX(r->float_utf8),
4627                       PL_colors[1],
4628                       SvTAIL(r->float_utf8) ? "$" : "",
4629                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4630     if (r->check_substr || r->check_utf8)
4631         PerlIO_printf(Perl_debug_log,
4632                       r->check_substr == r->float_substr
4633                       && r->check_utf8 == r->float_utf8
4634                       ? "(checking floating" : "(checking anchored");
4635     if (r->reganch & ROPT_NOSCAN)
4636         PerlIO_printf(Perl_debug_log, " noscan");
4637     if (r->reganch & ROPT_CHECK_ALL)
4638         PerlIO_printf(Perl_debug_log, " isall");
4639     if (r->check_substr || r->check_utf8)
4640         PerlIO_printf(Perl_debug_log, ") ");
4641
4642     if (r->regstclass) {
4643         regprop(sv, r->regstclass);
4644         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4645     }
4646     if (r->reganch & ROPT_ANCH) {
4647         PerlIO_printf(Perl_debug_log, "anchored");
4648         if (r->reganch & ROPT_ANCH_BOL)
4649             PerlIO_printf(Perl_debug_log, "(BOL)");
4650         if (r->reganch & ROPT_ANCH_MBOL)
4651             PerlIO_printf(Perl_debug_log, "(MBOL)");
4652         if (r->reganch & ROPT_ANCH_SBOL)
4653             PerlIO_printf(Perl_debug_log, "(SBOL)");
4654         if (r->reganch & ROPT_ANCH_GPOS)
4655             PerlIO_printf(Perl_debug_log, "(GPOS)");
4656         PerlIO_putc(Perl_debug_log, ' ');
4657     }
4658     if (r->reganch & ROPT_GPOS_SEEN)
4659         PerlIO_printf(Perl_debug_log, "GPOS ");
4660     if (r->reganch & ROPT_SKIP)
4661         PerlIO_printf(Perl_debug_log, "plus ");
4662     if (r->reganch & ROPT_IMPLICIT)
4663         PerlIO_printf(Perl_debug_log, "implicit ");
4664     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4665     if (r->reganch & ROPT_EVAL_SEEN)
4666         PerlIO_printf(Perl_debug_log, "with eval ");
4667     PerlIO_printf(Perl_debug_log, "\n");
4668     if (r->offsets) {
4669       U32 i;
4670       U32 len = r->offsets[0];
4671       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4672       for (i = 1; i <= len; i++)
4673         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4674                       (UV)r->offsets[i*2-1], 
4675                       (UV)r->offsets[i*2]);
4676       PerlIO_printf(Perl_debug_log, "\n");
4677     }
4678 #endif  /* DEBUGGING */
4679 }
4680
4681 #ifdef DEBUGGING
4682
4683 STATIC void
4684 S_put_byte(pTHX_ SV *sv, int c)
4685 {
4686     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4687         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4688     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4689         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4690     else
4691         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4692 }
4693
4694 #endif  /* DEBUGGING */
4695
4696 /*
4697 - regprop - printable representation of opcode
4698 */
4699 void
4700 Perl_regprop(pTHX_ SV *sv, regnode *o)
4701 {
4702 #ifdef DEBUGGING
4703     register int k;
4704
4705     sv_setpvn(sv, "", 0);
4706     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4707         /* It would be nice to FAIL() here, but this may be called from
4708            regexec.c, and it would be hard to supply pRExC_state. */
4709         Perl_croak(aTHX_ "Corrupted regexp opcode");
4710     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4711
4712     k = PL_regkind[(U8)OP(o)];
4713
4714     if (k == EXACT) {
4715         SV *dsv = sv_2mortal(newSVpvn("", 0));
4716         /* Using is_utf8_string() is a crude hack but it may
4717          * be the best for now since we have no flag "this EXACTish
4718          * node was UTF-8" --jhi */
4719         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4720         char *s    = do_utf8 ?
4721           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4722                          UNI_DISPLAY_REGEX) :
4723           STRING(o);
4724         int len = do_utf8 ?
4725           strlen(s) :
4726           STR_LEN(o);
4727         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4728                        PL_colors[0],
4729                        len, s,
4730                        PL_colors[1]);
4731     }
4732     else if (k == CURLY) {
4733         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4734             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4735         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4736     }
4737     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4738         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4739     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4740         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4741     else if (k == LOGICAL)
4742         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4743     else if (k == ANYOF) {
4744         int i, rangestart = -1;
4745         U8 flags = ANYOF_FLAGS(o);
4746         const char * const anyofs[] = { /* Should be synchronized with
4747                                          * ANYOF_ #xdefines in regcomp.h */
4748             "\\w",
4749             "\\W",
4750             "\\s",
4751             "\\S",
4752             "\\d",
4753             "\\D",
4754             "[:alnum:]",
4755             "[:^alnum:]",
4756             "[:alpha:]",
4757             "[:^alpha:]",
4758             "[:ascii:]",
4759             "[:^ascii:]",
4760             "[:ctrl:]",
4761             "[:^ctrl:]",
4762             "[:graph:]",
4763             "[:^graph:]",
4764             "[:lower:]",
4765             "[:^lower:]",
4766             "[:print:]",
4767             "[:^print:]",
4768             "[:punct:]",
4769             "[:^punct:]",
4770             "[:upper:]",
4771             "[:^upper:]",
4772             "[:xdigit:]",
4773             "[:^xdigit:]",
4774             "[:space:]",
4775             "[:^space:]",
4776             "[:blank:]",
4777             "[:^blank:]"
4778         };
4779
4780         if (flags & ANYOF_LOCALE)
4781             sv_catpv(sv, "{loc}");
4782         if (flags & ANYOF_FOLD)
4783             sv_catpv(sv, "{i}");
4784         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4785         if (flags & ANYOF_INVERT)
4786             sv_catpv(sv, "^");
4787         for (i = 0; i <= 256; i++) {
4788             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4789                 if (rangestart == -1)
4790                     rangestart = i;
4791             } else if (rangestart != -1) {
4792                 if (i <= rangestart + 3)
4793                     for (; rangestart < i; rangestart++)
4794                         put_byte(sv, rangestart);
4795                 else {
4796                     put_byte(sv, rangestart);
4797                     sv_catpv(sv, "-");
4798                     put_byte(sv, i - 1);
4799                 }
4800                 rangestart = -1;
4801             }
4802         }
4803
4804         if (o->flags & ANYOF_CLASS)
4805             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4806                 if (ANYOF_CLASS_TEST(o,i))
4807                     sv_catpv(sv, anyofs[i]);
4808
4809         if (flags & ANYOF_UNICODE)
4810             sv_catpv(sv, "{unicode}");
4811         else if (flags & ANYOF_UNICODE_ALL)
4812             sv_catpv(sv, "{unicode_all}");
4813
4814         {
4815             SV *lv;
4816             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4817         
4818             if (lv) {
4819                 if (sw) {
4820                     U8 s[UTF8_MAXLEN+1];
4821                 
4822                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4823                         U8 *e = uvchr_to_utf8(s, i);
4824                         
4825                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4826                             if (rangestart == -1)
4827                                 rangestart = i;
4828                         } else if (rangestart != -1) {
4829                             U8 *p;
4830                         
4831                             if (i <= rangestart + 3)
4832                                 for (; rangestart < i; rangestart++) {
4833                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4834                                         put_byte(sv, *p);
4835                                 }
4836                             else {
4837                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4838                                     put_byte(sv, *p);
4839                                 sv_catpv(sv, "-");
4840                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4841                                         put_byte(sv, *p);
4842                                 }
4843                                 rangestart = -1;
4844                             }
4845                         }
4846                         
4847                     sv_catpv(sv, "..."); /* et cetera */
4848                 }
4849
4850                 {
4851                     char *s = savepv(SvPVX(lv));
4852                     char *origs = s;
4853                 
4854                     while(*s && *s != '\n') s++;
4855                 
4856                     if (*s == '\n') {
4857                         char *t = ++s;
4858                         
4859                         while (*s) {
4860                             if (*s == '\n')
4861                                 *s = ' ';
4862                             s++;
4863                         }
4864                         if (s[-1] == ' ')
4865                             s[-1] = 0;
4866                         
4867                         sv_catpv(sv, t);
4868                     }
4869                 
4870                     Safefree(origs);
4871                 }
4872             }
4873         }
4874
4875         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4876     }
4877     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4878         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4879 #endif  /* DEBUGGING */
4880 }
4881
4882 SV *
4883 Perl_re_intuit_string(pTHX_ regexp *prog)
4884 {                               /* Assume that RE_INTUIT is set */
4885     DEBUG_r(
4886         {   STRLEN n_a;
4887             char *s = SvPV(prog->check_substr
4888                       ? prog->check_substr : prog->check_utf8, n_a);
4889
4890             if (!PL_colorset) reginitcolors();
4891             PerlIO_printf(Perl_debug_log,
4892                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4893                       PL_colors[4],
4894                       prog->check_substr ? "" : "utf8 ",
4895                       PL_colors[5],PL_colors[0],
4896                       s,
4897                       PL_colors[1],
4898                       (strlen(s) > 60 ? "..." : ""));
4899         } );
4900
4901     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4902 }
4903
4904 void
4905 Perl_pregfree(pTHX_ struct regexp *r)
4906 {
4907 #ifdef DEBUGGING
4908     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4909 #endif
4910
4911     if (!r || (--r->refcnt > 0))
4912         return;
4913     DEBUG_r({
4914          int len;
4915          char *s;
4916
4917          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4918                 r->prelen, 60, UNI_DISPLAY_REGEX)
4919             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4920          len = SvCUR(dsv);
4921          if (!PL_colorset)
4922               reginitcolors();
4923          PerlIO_printf(Perl_debug_log,
4924                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4925                        PL_colors[4],PL_colors[5],PL_colors[0],
4926                        len, len, s,
4927                        PL_colors[1],
4928                        len > 60 ? "..." : "");
4929     });
4930
4931     if (r->precomp)
4932         Safefree(r->precomp);
4933     if (r->offsets)             /* 20010421 MJD */
4934         Safefree(r->offsets);
4935     RX_MATCH_COPY_FREE(r);
4936 #ifdef PERL_COPY_ON_WRITE
4937     if (r->saved_copy)
4938         SvREFCNT_dec(r->saved_copy);
4939 #endif
4940     if (r->substrs) {
4941         if (r->anchored_substr)
4942             SvREFCNT_dec(r->anchored_substr);
4943         if (r->anchored_utf8)
4944             SvREFCNT_dec(r->anchored_utf8);
4945         if (r->float_substr)
4946             SvREFCNT_dec(r->float_substr);
4947         if (r->float_utf8)
4948             SvREFCNT_dec(r->float_utf8);
4949         Safefree(r->substrs);
4950     }
4951     if (r->data) {
4952         int n = r->data->count;
4953         PAD* new_comppad = NULL;
4954         PAD* old_comppad;
4955
4956         while (--n >= 0) {
4957           /* If you add a ->what type here, update the comment in regcomp.h */
4958             switch (r->data->what[n]) {
4959             case 's':
4960                 SvREFCNT_dec((SV*)r->data->data[n]);
4961                 break;
4962             case 'f':
4963                 Safefree(r->data->data[n]);
4964                 break;
4965             case 'p':
4966                 new_comppad = (AV*)r->data->data[n];
4967                 break;
4968             case 'o':
4969                 if (new_comppad == NULL)
4970                     Perl_croak(aTHX_ "panic: pregfree comppad");
4971                 PAD_SAVE_LOCAL(old_comppad,
4972                     /* Watch out for global destruction's random ordering. */
4973                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4974                                 new_comppad : Null(PAD *)
4975                 );
4976                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4977                     op_free((OP_4tree*)r->data->data[n]);
4978                 }
4979
4980                 PAD_RESTORE_LOCAL(old_comppad);
4981                 SvREFCNT_dec((SV*)new_comppad);
4982                 new_comppad = NULL;
4983                 break;
4984             case 'n':
4985                 break;
4986             default:
4987                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4988             }
4989         }
4990         Safefree(r->data->what);
4991         Safefree(r->data);
4992     }
4993     Safefree(r->startp);
4994     Safefree(r->endp);
4995     Safefree(r);
4996 }
4997
4998 /*
4999  - regnext - dig the "next" pointer out of a node
5000  *
5001  * [Note, when REGALIGN is defined there are two places in regmatch()
5002  * that bypass this code for speed.]
5003  */
5004 regnode *
5005 Perl_regnext(pTHX_ register regnode *p)
5006 {
5007     register I32 offset;
5008
5009     if (p == &PL_regdummy)
5010         return(NULL);
5011
5012     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5013     if (offset == 0)
5014         return(NULL);
5015
5016     return(p+offset);
5017 }
5018
5019 STATIC void     
5020 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5021 {
5022     va_list args;
5023     STRLEN l1 = strlen(pat1);
5024     STRLEN l2 = strlen(pat2);
5025     char buf[512];
5026     SV *msv;
5027     char *message;
5028
5029     if (l1 > 510)
5030         l1 = 510;
5031     if (l1 + l2 > 510)
5032         l2 = 510 - l1;
5033     Copy(pat1, buf, l1 , char);
5034     Copy(pat2, buf + l1, l2 , char);
5035     buf[l1 + l2] = '\n';
5036     buf[l1 + l2 + 1] = '\0';
5037 #ifdef I_STDARG
5038     /* ANSI variant takes additional second argument */
5039     va_start(args, pat2);
5040 #else
5041     va_start(args);
5042 #endif
5043     msv = vmess(buf, &args);
5044     va_end(args);
5045     message = SvPV(msv,l1);
5046     if (l1 > 512)
5047         l1 = 512;
5048     Copy(message, buf, l1 , char);
5049     buf[l1] = '\0';                     /* Overwrite \n */
5050     Perl_croak(aTHX_ "%s", buf);
5051 }
5052
5053 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5054
5055 void
5056 Perl_save_re_context(pTHX)
5057 {
5058     SAVEI32(PL_reg_flags);              /* from regexec.c */
5059     SAVEPPTR(PL_bostr);
5060     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5061     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5062     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5063     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5064     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5065     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5066     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5067     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5068     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5069     PL_reg_start_tmp = 0;
5070     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5071     PL_reg_start_tmpl = 0;
5072     SAVEVPTR(PL_regdata);
5073     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5074     SAVEI32(PL_regnarrate);             /* from regexec.c */
5075     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5076     SAVEINT(PL_regindent);              /* from regexec.c */
5077     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5078     SAVEVPTR(PL_curcop);
5079     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5080     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5081     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5082     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5083     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5084     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5085     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5086     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5087     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5088     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5089     PL_reg_oldsaved = Nullch;
5090     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5091     PL_reg_oldsavedlen = 0;
5092 #ifdef PERL_COPY_ON_WRITE
5093     SAVESPTR(PL_nrs);
5094     PL_nrs = Nullsv;
5095 #endif
5096     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5097     PL_reg_maxiter = 0;
5098     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5099     PL_reg_leftiter = 0;
5100     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5101     PL_reg_poscache = Nullch;
5102     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5103     PL_reg_poscache_size = 0;
5104     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5105     SAVEI32(PL_regnpar);                /* () count. */
5106     SAVEI32(PL_regsize);                /* from regexec.c */
5107
5108     {
5109         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5110         U32 i;
5111         GV *mgv;
5112         REGEXP *rx;
5113         char digits[16];
5114
5115         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5116             for (i = 1; i <= rx->nparens; i++) {
5117                 sprintf(digits, "%lu", (long)i);
5118                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5119                     save_scalar(mgv);
5120             }
5121         }
5122     }
5123
5124 #ifdef DEBUGGING
5125     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5126 #endif
5127 }
5128
5129 static void
5130 clear_re(pTHX_ void *r)
5131 {
5132     ReREFCNT_dec((regexp *)r);
5133 }
5134