This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73  ****    2000, 2001, 2002, 2003, by Larry Wall and others
74  ****
75  ****    You may distribute under the terms of either the GNU General Public
76  ****    License or the Artistic License, as specified in the README file.
77
78  *
79  * Beware that some of this code is subtly aware of the way operator
80  * precedence is structured in regular expressions.  Serious changes in
81  * regular-expression syntax might require a total rethink.
82  */
83 #include "EXTERN.h"
84 #define PERL_IN_REGCOMP_C
85 #include "perl.h"
86
87 #ifndef PERL_IN_XSUB_RE
88 #  include "INTERN.h"
89 #endif
90
91 #define REG_COMP_C
92 #include "regcomp.h"
93
94 #ifdef op
95 #undef op
96 #endif /* op */
97
98 #ifdef MSDOS
99 # if defined(BUGGY_MSC6)
100  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101  # pragma optimize("a",off)
102  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103  # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
105 #endif /* MSDOS */
106
107 #ifndef STATIC
108 #define STATIC  static
109 #endif
110
111 typedef struct RExC_state_t {
112     U32         flags;                  /* are we folding, multilining? */
113     char        *precomp;               /* uncompiled string. */
114     regexp      *rx;
115     char        *start;                 /* Start of input for compile */
116     char        *end;                   /* End of input for compile */
117     char        *parse;                 /* Input-scan pointer. */
118     I32         whilem_seen;            /* number of WHILEM in this expr */
119     regnode     *emit_start;            /* Start of emitted-code area */
120     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
121     I32         naughty;                /* How bad is this pattern? */
122     I32         sawback;                /* Did we see \1, ...? */
123     U32         seen;
124     I32         size;                   /* Code size. */
125     I32         npar;                   /* () count. */
126     I32         extralen;
127     I32         seen_zerolen;
128     I32         seen_evals;
129     I32         utf8;
130 #if ADD_TO_REGEXEC
131     char        *starttry;              /* -Dr: where regtry was called. */
132 #define RExC_starttry   (pRExC_state->starttry)
133 #endif
134 } RExC_state_t;
135
136 #define RExC_flags      (pRExC_state->flags)
137 #define RExC_precomp    (pRExC_state->precomp)
138 #define RExC_rx         (pRExC_state->rx)
139 #define RExC_start      (pRExC_state->start)
140 #define RExC_end        (pRExC_state->end)
141 #define RExC_parse      (pRExC_state->parse)
142 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
143 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit       (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty    (pRExC_state->naughty)
147 #define RExC_sawback    (pRExC_state->sawback)
148 #define RExC_seen       (pRExC_state->seen)
149 #define RExC_size       (pRExC_state->size)
150 #define RExC_npar       (pRExC_state->npar)
151 #define RExC_extralen   (pRExC_state->extralen)
152 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8       (pRExC_state->utf8)
155
156 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158         ((*s) == '{' && regcurly(s)))
159
160 #ifdef SPSTART
161 #undef SPSTART          /* dratted cpp namespace... */
162 #endif
163 /*
164  * Flags to be passed up and down.
165  */
166 #define WORST           0       /* Worst case. */
167 #define HASWIDTH        0x1     /* Known to match non-null strings. */
168 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART         0x4     /* Starts with * or +. */
170 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
171
172 /* Length of a variant. */
173
174 typedef struct scan_data_t {
175     I32 len_min;
176     I32 len_delta;
177     I32 pos_min;
178     I32 pos_delta;
179     SV *last_found;
180     I32 last_end;                       /* min value, <0 unless valid. */
181     I32 last_start_min;
182     I32 last_start_max;
183     SV **longest;                       /* Either &l_fixed, or &l_float. */
184     SV *longest_fixed;
185     I32 offset_fixed;
186     SV *longest_float;
187     I32 offset_float_min;
188     I32 offset_float_max;
189     I32 flags;
190     I32 whilem_c;
191     I32 *last_closep;
192     struct regnode_charclass_class *start_class;
193 } scan_data_t;
194
195 /*
196  * Forward declarations for pregcomp()'s friends.
197  */
198
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200                                       0, 0, 0, 0, 0, 0};
201
202 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL          0x1
204 #define SF_BEFORE_MEOL          0x2
205 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
208 #ifdef NO_UNARY_PLUS
209 #  define SF_FIX_SHIFT_EOL      (0+2)
210 #  define SF_FL_SHIFT_EOL               (0+4)
211 #else
212 #  define SF_FIX_SHIFT_EOL      (+2)
213 #  define SF_FL_SHIFT_EOL               (+4)
214 #endif
215
216 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF               0x40
222 #define SF_HAS_PAR              0x80
223 #define SF_IN_PAR               0x100
224 #define SF_HAS_EVAL             0x200
225 #define SCF_DO_SUBSTR           0x400
226 #define SCF_DO_STCLASS_AND      0x0800
227 #define SCF_DO_STCLASS_OR       0x1000
228 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS  0x2000
230
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234
235 #define OOB_UNICODE             12345678
236 #define OOB_NAMEDCLASS          -1
237
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
241
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
244
245 /*
246  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248  * op/pragma/warn/regcomp.
249  */
250 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
252
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
254
255 /*
256  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257  * arg. Show regex, up to a maximum length. If it's too long, chop and add
258  * "...".
259  */
260 #define FAIL(msg) STMT_START {                                          \
261     char *ellipses = "";                                                \
262     IV len = RExC_end - RExC_precomp;                                   \
263                                                                         \
264     if (!SIZE_ONLY)                                                     \
265         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
266     if (len > RegexLengthToShowInErrorMessages) {                       \
267         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
268         len = RegexLengthToShowInErrorMessages - 10;                    \
269         ellipses = "...";                                               \
270     }                                                                   \
271     Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                           \
272             msg, (int)len, RExC_precomp, ellipses);                     \
273 } STMT_END
274
275 /*
276  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277  * args. Show regex, up to a maximum length. If it's too long, chop and add
278  * "...".
279  */
280 #define FAIL2(pat,msg) STMT_START {                                     \
281     char *ellipses = "";                                                \
282     IV len = RExC_end - RExC_precomp;                                   \
283                                                                         \
284     if (!SIZE_ONLY)                                                     \
285         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                      \
286     if (len > RegexLengthToShowInErrorMessages) {                       \
287         /* chop 10 shorter than the max, to ensure meaning of "..." */  \
288         len = RegexLengthToShowInErrorMessages - 10;                    \
289         ellipses = "...";                                               \
290     }                                                                   \
291     S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                       \
292             msg, (int)len, RExC_precomp, ellipses);                     \
293 } STMT_END
294
295
296 /*
297  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298  */
299 #define Simple_vFAIL(m) STMT_START {                                    \
300     IV offset = RExC_parse - RExC_precomp;                              \
301     Perl_croak(aTHX_ "%s" REPORT_LOCATION,                              \
302             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
303 } STMT_END
304
305 /*
306  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307  */
308 #define vFAIL(m) STMT_START {                           \
309     if (!SIZE_ONLY)                                     \
310         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
311     Simple_vFAIL(m);                                    \
312 } STMT_END
313
314 /*
315  * Like Simple_vFAIL(), but accepts two arguments.
316  */
317 #define Simple_vFAIL2(m,a1) STMT_START {                        \
318     IV offset = RExC_parse - RExC_precomp;                      \
319     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,                   \
320             (int)offset, RExC_precomp, RExC_precomp + offset);  \
321 } STMT_END
322
323 /*
324  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325  */
326 #define vFAIL2(m,a1) STMT_START {                       \
327     if (!SIZE_ONLY)                                     \
328         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
329     Simple_vFAIL2(m, a1);                               \
330 } STMT_END
331
332
333 /*
334  * Like Simple_vFAIL(), but accepts three arguments.
335  */
336 #define Simple_vFAIL3(m, a1, a2) STMT_START {                   \
337     IV offset = RExC_parse - RExC_precomp;                      \
338     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,               \
339             (int)offset, RExC_precomp, RExC_precomp + offset);  \
340 } STMT_END
341
342 /*
343  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344  */
345 #define vFAIL3(m,a1,a2) STMT_START {                    \
346     if (!SIZE_ONLY)                                     \
347         SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);      \
348     Simple_vFAIL3(m, a1, a2);                           \
349 } STMT_END
350
351 /*
352  * Like Simple_vFAIL(), but accepts four arguments.
353  */
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START {               \
355     IV offset = RExC_parse - RExC_precomp;                      \
356     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,           \
357             (int)offset, RExC_precomp, RExC_precomp + offset);  \
358 } STMT_END
359
360 /*
361  * Like Simple_vFAIL(), but accepts five arguments.
362  */
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START {           \
364     IV offset = RExC_parse - RExC_precomp;                      \
365     S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,       \
366             (int)offset, RExC_precomp, RExC_precomp + offset);  \
367 } STMT_END
368
369
370 #define vWARN(loc,m) STMT_START {                                       \
371     IV offset = loc - RExC_precomp;                                     \
372     Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,      \
373             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
374 } STMT_END
375
376 #define vWARNdep(loc,m) STMT_START {                                    \
377     IV offset = loc - RExC_precomp;                                     \
378     Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP),          \
379             "%s" REPORT_LOCATION,                                       \
380             m, (int)offset, RExC_precomp, RExC_precomp + offset);       \
381 } STMT_END
382
383
384 #define vWARN2(loc, m, a1) STMT_START {                                 \
385     IV offset = loc - RExC_precomp;                                     \
386     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
387             a1, (int)offset, RExC_precomp, RExC_precomp + offset);      \
388 } STMT_END
389
390 #define vWARN3(loc, m, a1, a2) STMT_START {                             \
391     IV offset = loc - RExC_precomp;                                     \
392     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
393             a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset);  \
394 } STMT_END
395
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START {                         \
397     IV offset = loc - RExC_precomp;                                     \
398     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
399             a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 } STMT_END
401
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START {                     \
403     IV offset = loc - RExC_precomp;                                     \
404     Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,         \
405             a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 } STMT_END
407
408
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START {                  \
411     if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412 } STMT_END
413
414 /* Macros for recording node offsets.   20001227 mjd@plover.com 
415  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
416  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
417  * Element 0 holds the number n.
418  */
419
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424 #define Set_Node_Offset_To_R(node,byte) STMT_START {                    \
425     if (! SIZE_ONLY) {                                                  \
426         MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n",         \
427                 __LINE__, (node), (byte)));                             \
428         if((node) < 0) {                                                \
429             Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430         } else {                                                        \
431             RExC_offsets[2*(node)-1] = (byte);                          \
432         }                                                               \
433     }                                                                   \
434 } STMT_END
435
436 #define Set_Node_Offset(node,byte) \
437     Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440 #define Set_Node_Length_To_R(node,len) STMT_START {                     \
441     if (! SIZE_ONLY) {                                                  \
442         MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n",           \
443                 __LINE__, (node), (len)));                              \
444         if((node) < 0) {                                                \
445             Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446         } else {                                                        \
447             RExC_offsets[2*(node)] = (len);                             \
448         }                                                               \
449     }                                                                   \
450 } STMT_END
451
452 #define Set_Node_Length(node,len) \
453     Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456     Set_Node_Length(node, RExC_parse - parse_start)
457
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
462 static void clear_re(pTHX_ void *r);
463
464 /* Mark that we cannot extend a found fixed substring at this point.
465    Updata the longest found anchored substring and the longest found
466    floating substrings if needed. */
467
468 STATIC void
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 {
471     STRLEN l = CHR_SVLEN(data->last_found);
472     STRLEN old_l = CHR_SVLEN(*data->longest);
473
474     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475         SvSetMagicSV(*data->longest, data->last_found);
476         if (*data->longest == data->longest_fixed) {
477             data->offset_fixed = l ? data->last_start_min : data->pos_min;
478             if (data->flags & SF_BEFORE_EOL)
479                 data->flags
480                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481             else
482                 data->flags &= ~SF_FIX_BEFORE_EOL;
483         }
484         else {
485             data->offset_float_min = l ? data->last_start_min : data->pos_min;
486             data->offset_float_max = (l
487                                       ? data->last_start_max
488                                       : data->pos_min + data->pos_delta);
489             if ((U32)data->offset_float_max > (U32)I32_MAX)
490                 data->offset_float_max = I32_MAX;
491             if (data->flags & SF_BEFORE_EOL)
492                 data->flags
493                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494             else
495                 data->flags &= ~SF_FL_BEFORE_EOL;
496         }
497     }
498     SvCUR_set(data->last_found, 0);
499     {
500         SV * sv = data->last_found;
501         MAGIC *mg =
502             SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503         if (mg && mg->mg_len > 0)
504             mg->mg_len = 0;
505     }
506     data->last_end = -1;
507     data->flags &= ~SF_BEFORE_EOL;
508 }
509
510 /* Can match anything (initialization) */
511 STATIC void
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 {
514     ANYOF_CLASS_ZERO(cl);
515     ANYOF_BITMAP_SETALL(cl);
516     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517     if (LOC)
518         cl->flags |= ANYOF_LOCALE;
519 }
520
521 /* Can match anything (initialization) */
522 STATIC int
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524 {
525     int value;
526
527     for (value = 0; value <= ANYOF_MAX; value += 2)
528         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529             return 1;
530     if (!(cl->flags & ANYOF_UNICODE_ALL))
531         return 0;
532     if (!ANYOF_BITMAP_TESTALLSET(cl))
533         return 0;
534     return 1;
535 }
536
537 /* Can match anything (initialization) */
538 STATIC void
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 {
541     Zero(cl, 1, struct regnode_charclass_class);
542     cl->type = ANYOF;
543     cl_anything(pRExC_state, cl);
544 }
545
546 STATIC void
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 {
549     Zero(cl, 1, struct regnode_charclass_class);
550     cl->type = ANYOF;
551     cl_anything(pRExC_state, cl);
552     if (LOC)
553         cl->flags |= ANYOF_LOCALE;
554 }
555
556 /* 'And' a given class with another one.  Can create false positives */
557 /* We assume that cl is not inverted */
558 STATIC void
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560          struct regnode_charclass_class *and_with)
561 {
562     if (!(and_with->flags & ANYOF_CLASS)
563         && !(cl->flags & ANYOF_CLASS)
564         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565         && !(and_with->flags & ANYOF_FOLD)
566         && !(cl->flags & ANYOF_FOLD)) {
567         int i;
568
569         if (and_with->flags & ANYOF_INVERT)
570             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571                 cl->bitmap[i] &= ~and_with->bitmap[i];
572         else
573             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574                 cl->bitmap[i] &= and_with->bitmap[i];
575     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576     if (!(and_with->flags & ANYOF_EOS))
577         cl->flags &= ~ANYOF_EOS;
578
579     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580         !(and_with->flags & ANYOF_INVERT)) {
581         cl->flags &= ~ANYOF_UNICODE_ALL;
582         cl->flags |= ANYOF_UNICODE;
583         ARG_SET(cl, ARG(and_with));
584     }
585     if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586         !(and_with->flags & ANYOF_INVERT))
587         cl->flags &= ~ANYOF_UNICODE_ALL;
588     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589         !(and_with->flags & ANYOF_INVERT))
590         cl->flags &= ~ANYOF_UNICODE;
591 }
592
593 /* 'OR' a given class with another one.  Can create false positives */
594 /* We assume that cl is not inverted */
595 STATIC void
596 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
597 {
598     if (or_with->flags & ANYOF_INVERT) {
599         /* We do not use
600          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601          *   <= (B1 | !B2) | (CL1 | !CL2)
602          * which is wasteful if CL2 is small, but we ignore CL2:
603          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604          * XXXX Can we handle case-fold?  Unclear:
605          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607          */
608         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609              && !(or_with->flags & ANYOF_FOLD)
610              && !(cl->flags & ANYOF_FOLD) ) {
611             int i;
612
613             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614                 cl->bitmap[i] |= ~or_with->bitmap[i];
615         } /* XXXX: logic is complicated otherwise */
616         else {
617             cl_anything(pRExC_state, cl);
618         }
619     } else {
620         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622              && (!(or_with->flags & ANYOF_FOLD)
623                  || (cl->flags & ANYOF_FOLD)) ) {
624             int i;
625
626             /* OR char bitmap and class bitmap separately */
627             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628                 cl->bitmap[i] |= or_with->bitmap[i];
629             if (or_with->flags & ANYOF_CLASS) {
630                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631                     cl->classflags[i] |= or_with->classflags[i];
632                 cl->flags |= ANYOF_CLASS;
633             }
634         }
635         else { /* XXXX: logic is complicated, leave it along for a moment. */
636             cl_anything(pRExC_state, cl);
637         }
638     }
639     if (or_with->flags & ANYOF_EOS)
640         cl->flags |= ANYOF_EOS;
641
642     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643         ARG(cl) != ARG(or_with)) {
644         cl->flags |= ANYOF_UNICODE_ALL;
645         cl->flags &= ~ANYOF_UNICODE;
646     }
647     if (or_with->flags & ANYOF_UNICODE_ALL) {
648         cl->flags |= ANYOF_UNICODE_ALL;
649         cl->flags &= ~ANYOF_UNICODE;
650     }
651 }
652
653 /*
654  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655  * These need to be revisited when a newer toolchain becomes available.
656  */
657 #if defined(__sparc64__) && defined(__GNUC__)
658 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659 #       undef  SPARC64_GCC_WORKAROUND
660 #       define SPARC64_GCC_WORKAROUND 1
661 #   endif
662 #endif
663
664 /* REx optimizer.  Converts nodes into quickier variants "in place".
665    Finds fixed substrings.  */
666
667 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668    to the position after last scanned or to NULL. */
669
670 STATIC I32
671 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
672                         /* scanp: Start here (read-write). */
673                         /* deltap: Write maxlen-minlen here. */
674                         /* last: Stop before this one. */
675 {
676     I32 min = 0, pars = 0, code;
677     regnode *scan = *scanp, *next;
678     I32 delta = 0;
679     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
680     int is_inf_internal = 0;            /* The studied chunk is infinite */
681     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682     scan_data_t data_fake;
683     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
684
685     while (scan && OP(scan) != END && scan < last) {
686         /* Peephole optimizer: */
687
688         if (PL_regkind[(U8)OP(scan)] == EXACT) {
689             /* Merge several consecutive EXACTish nodes into one. */
690             regnode *n = regnext(scan);
691             U32 stringok = 1;
692 #ifdef DEBUGGING
693             regnode *stop = scan;
694 #endif
695
696             next = scan + NODE_SZ_STR(scan);
697             /* Skip NOTHING, merge EXACT*. */
698             while (n &&
699                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
700                      (stringok && (OP(n) == OP(scan))))
701                    && NEXT_OFF(n)
702                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703                 if (OP(n) == TAIL || n > next)
704                     stringok = 0;
705                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
706                     NEXT_OFF(scan) += NEXT_OFF(n);
707                     next = n + NODE_STEP_REGNODE;
708 #ifdef DEBUGGING
709                     if (stringok)
710                         stop = n;
711 #endif
712                     n = regnext(n);
713                 }
714                 else if (stringok) {
715                     int oldl = STR_LEN(scan);
716                     regnode *nnext = regnext(n);
717
718                     if (oldl + STR_LEN(n) > U8_MAX)
719                         break;
720                     NEXT_OFF(scan) += NEXT_OFF(n);
721                     STR_LEN(scan) += STR_LEN(n);
722                     next = n + NODE_SZ_STR(n);
723                     /* Now we can overwrite *n : */
724                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
725 #ifdef DEBUGGING
726                     stop = next - 1;
727 #endif
728                     n = nnext;
729                 }
730             }
731
732             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
733 /*
734   Two problematic code points in Unicode casefolding of EXACT nodes:
735
736    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
738
739    which casefold to
740
741    Unicode                      UTF-8
742
743    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
744    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
745
746    This means that in case-insensitive matching (or "loose matching",
747    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748    length of the above casefolded versions) can match a target string
749    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750    This would rather mess up the minimum length computation.
751
752    What we'll do is to look for the tail four bytes, and then peek
753    at the preceding two bytes to see whether we need to decrease
754    the minimum length by four (six minus two).
755
756    Thanks to the design of UTF-8, there cannot be false matches:
757    A sequence of valid UTF-8 bytes cannot be a subsequence of
758    another valid sequence of UTF-8 bytes.
759
760 */
761                  char *s0 = STRING(scan), *s, *t;
762                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763                  char *t0 = "\xcc\x88\xcc\x81";
764                  char *t1 = t0 + 3;
765                  
766                  for (s = s0 + 2;
767                       s < s2 && (t = ninstr(s, s1, t0, t1));
768                       s = t + 4) {
769                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
771                            min -= 4;
772                  }
773             }
774
775 #ifdef DEBUGGING
776             /* Allow dumping */
777             n = scan + NODE_SZ_STR(scan);
778             while (n <= stop) {
779                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
780                     OP(n) = OPTIMIZED;
781                     NEXT_OFF(n) = 0;
782                 }
783                 n++;
784             }
785 #endif
786         }
787         /* Follow the next-chain of the current node and optimize
788            away all the NOTHINGs from it.  */
789         if (OP(scan) != CURLYX) {
790             int max = (reg_off_by_arg[OP(scan)]
791                        ? I32_MAX
792                        /* I32 may be smaller than U16 on CRAYs! */
793                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
794             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
795             int noff;
796             regnode *n = scan;
797         
798             /* Skip NOTHING and LONGJMP. */
799             while ((n = regnext(n))
800                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
801                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
802                    && off + noff < max)
803                 off += noff;
804             if (reg_off_by_arg[OP(scan)])
805                 ARG(scan) = off;
806             else
807                 NEXT_OFF(scan) = off;
808         }
809         /* The principal pseudo-switch.  Cannot be a switch, since we
810            look into several different things.  */
811         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
812                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813             next = regnext(scan);
814             code = OP(scan);
815         
816             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
817                 I32 max1 = 0, min1 = I32_MAX, num = 0;
818                 struct regnode_charclass_class accum;
819                 
820                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
821                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
822                 if (flags & SCF_DO_STCLASS)
823                     cl_init_zero(pRExC_state, &accum);
824                 while (OP(scan) == code) {
825                     I32 deltanext, minnext, f = 0, fake;
826                     struct regnode_charclass_class this_class;
827
828                     num++;
829                     data_fake.flags = 0;
830                     if (data) {         
831                         data_fake.whilem_c = data->whilem_c;
832                         data_fake.last_closep = data->last_closep;
833                     }
834                     else
835                         data_fake.last_closep = &fake;
836                     next = regnext(scan);
837                     scan = NEXTOPER(scan);
838                     if (code != BRANCH)
839                         scan = NEXTOPER(scan);
840                     if (flags & SCF_DO_STCLASS) {
841                         cl_init(pRExC_state, &this_class);
842                         data_fake.start_class = &this_class;
843                         f = SCF_DO_STCLASS_AND;
844                     }           
845                     if (flags & SCF_WHILEM_VISITED_POS)
846                         f |= SCF_WHILEM_VISITED_POS;
847                     /* we suppose the run is continuous, last=next...*/
848                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
849                                           next, &data_fake, f);
850                     if (min1 > minnext)
851                         min1 = minnext;
852                     if (max1 < minnext + deltanext)
853                         max1 = minnext + deltanext;
854                     if (deltanext == I32_MAX)
855                         is_inf = is_inf_internal = 1;
856                     scan = next;
857                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
858                         pars++;
859                     if (data && (data_fake.flags & SF_HAS_EVAL))
860                         data->flags |= SF_HAS_EVAL;
861                     if (data)
862                         data->whilem_c = data_fake.whilem_c;
863                     if (flags & SCF_DO_STCLASS)
864                         cl_or(pRExC_state, &accum, &this_class);
865                     if (code == SUSPEND)
866                         break;
867                 }
868                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
869                     min1 = 0;
870                 if (flags & SCF_DO_SUBSTR) {
871                     data->pos_min += min1;
872                     data->pos_delta += max1 - min1;
873                     if (max1 != min1 || is_inf)
874                         data->longest = &(data->longest_float);
875                 }
876                 min += min1;
877                 delta += max1 - min1;
878                 if (flags & SCF_DO_STCLASS_OR) {
879                     cl_or(pRExC_state, data->start_class, &accum);
880                     if (min1) {
881                         cl_and(data->start_class, &and_with);
882                         flags &= ~SCF_DO_STCLASS;
883                     }
884                 }
885                 else if (flags & SCF_DO_STCLASS_AND) {
886                     if (min1) {
887                         cl_and(data->start_class, &accum);
888                         flags &= ~SCF_DO_STCLASS;
889                     }
890                     else {
891                         /* Switch to OR mode: cache the old value of
892                          * data->start_class */
893                         StructCopy(data->start_class, &and_with,
894                                    struct regnode_charclass_class);
895                         flags &= ~SCF_DO_STCLASS_AND;
896                         StructCopy(&accum, data->start_class,
897                                    struct regnode_charclass_class);
898                         flags |= SCF_DO_STCLASS_OR;
899                         data->start_class->flags |= ANYOF_EOS;
900                     }
901                 }
902             }
903             else if (code == BRANCHJ)   /* single branch is optimized. */
904                 scan = NEXTOPER(NEXTOPER(scan));
905             else                        /* single branch is optimized. */
906                 scan = NEXTOPER(scan);
907             continue;
908         }
909         else if (OP(scan) == EXACT) {
910             I32 l = STR_LEN(scan);
911             UV uc = *((U8*)STRING(scan));
912             if (UTF) {
913                 U8 *s = (U8*)STRING(scan);
914                 l = utf8_length(s, s + l);
915                 uc = utf8_to_uvchr(s, NULL);
916             }
917             min += l;
918             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919                 /* The code below prefers earlier match for fixed
920                    offset, later match for variable offset.  */
921                 if (data->last_end == -1) { /* Update the start info. */
922                     data->last_start_min = data->pos_min;
923                     data->last_start_max = is_inf
924                         ? I32_MAX : data->pos_min + data->pos_delta;
925                 }
926                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
927                 {
928                     SV * sv = data->last_found;
929                     MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
931                     if (mg && mg->mg_len >= 0)
932                         mg->mg_len += utf8_length((U8*)STRING(scan),
933                                                   (U8*)STRING(scan)+STR_LEN(scan));
934                 }
935                 if (UTF)
936                     SvUTF8_on(data->last_found);
937                 data->last_end = data->pos_min + l;
938                 data->pos_min += l; /* As in the first entry. */
939                 data->flags &= ~SF_BEFORE_EOL;
940             }
941             if (flags & SCF_DO_STCLASS_AND) {
942                 /* Check whether it is compatible with what we know already! */
943                 int compat = 1;
944
945                 if (uc >= 0x100 ||
946                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
948                     && (!(data->start_class->flags & ANYOF_FOLD)
949                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
950                     )
951                     compat = 0;
952                 ANYOF_CLASS_ZERO(data->start_class);
953                 ANYOF_BITMAP_ZERO(data->start_class);
954                 if (compat)
955                     ANYOF_BITMAP_SET(data->start_class, uc);
956                 data->start_class->flags &= ~ANYOF_EOS;
957                 if (uc < 0x100)
958                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
959             }
960             else if (flags & SCF_DO_STCLASS_OR) {
961                 /* false positive possible if the class is case-folded */
962                 if (uc < 0x100)
963                     ANYOF_BITMAP_SET(data->start_class, uc);
964                 else
965                     data->start_class->flags |= ANYOF_UNICODE_ALL;
966                 data->start_class->flags &= ~ANYOF_EOS;
967                 cl_and(data->start_class, &and_with);
968             }
969             flags &= ~SCF_DO_STCLASS;
970         }
971         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972             I32 l = STR_LEN(scan);
973             UV uc = *((U8*)STRING(scan));
974
975             /* Search for fixed substrings supports EXACT only. */
976             if (flags & SCF_DO_SUBSTR)
977                 scan_commit(pRExC_state, data);
978             if (UTF) {
979                 U8 *s = (U8 *)STRING(scan);
980                 l = utf8_length(s, s + l);
981                 uc = utf8_to_uvchr(s, NULL);
982             }
983             min += l;
984             if (data && (flags & SCF_DO_SUBSTR))
985                 data->pos_min += l;
986             if (flags & SCF_DO_STCLASS_AND) {
987                 /* Check whether it is compatible with what we know already! */
988                 int compat = 1;
989
990                 if (uc >= 0x100 ||
991                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
993                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
994                     compat = 0;
995                 ANYOF_CLASS_ZERO(data->start_class);
996                 ANYOF_BITMAP_ZERO(data->start_class);
997                 if (compat) {
998                     ANYOF_BITMAP_SET(data->start_class, uc);
999                     data->start_class->flags &= ~ANYOF_EOS;
1000                     data->start_class->flags |= ANYOF_FOLD;
1001                     if (OP(scan) == EXACTFL)
1002                         data->start_class->flags |= ANYOF_LOCALE;
1003                 }
1004             }
1005             else if (flags & SCF_DO_STCLASS_OR) {
1006                 if (data->start_class->flags & ANYOF_FOLD) {
1007                     /* false positive possible if the class is case-folded.
1008                        Assume that the locale settings are the same... */
1009                     if (uc < 0x100)
1010                         ANYOF_BITMAP_SET(data->start_class, uc);
1011                     data->start_class->flags &= ~ANYOF_EOS;
1012                 }
1013                 cl_and(data->start_class, &and_with);
1014             }
1015             flags &= ~SCF_DO_STCLASS;
1016         }
1017         else if (strchr((char*)PL_varies,OP(scan))) {
1018             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019             I32 f = flags, pos_before = 0;
1020             regnode *oscan = scan;
1021             struct regnode_charclass_class this_class;
1022             struct regnode_charclass_class *oclass = NULL;
1023             I32 next_is_eval = 0;
1024
1025             switch (PL_regkind[(U8)OP(scan)]) {
1026             case WHILEM:                /* End of (?:...)* . */
1027                 scan = NEXTOPER(scan);
1028                 goto finish;
1029             case PLUS:
1030                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031                     next = NEXTOPER(scan);
1032                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1033                         mincount = 1;
1034                         maxcount = REG_INFTY;
1035                         next = regnext(scan);
1036                         scan = NEXTOPER(scan);
1037                         goto do_curly;
1038                     }
1039                 }
1040                 if (flags & SCF_DO_SUBSTR)
1041                     data->pos_min++;
1042                 min++;
1043                 /* Fall through. */
1044             case STAR:
1045                 if (flags & SCF_DO_STCLASS) {
1046                     mincount = 0;
1047                     maxcount = REG_INFTY;
1048                     next = regnext(scan);
1049                     scan = NEXTOPER(scan);
1050                     goto do_curly;
1051                 }
1052                 is_inf = is_inf_internal = 1;
1053                 scan = regnext(scan);
1054                 if (flags & SCF_DO_SUBSTR) {
1055                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056                     data->longest = &(data->longest_float);
1057                 }
1058                 goto optimize_curly_tail;
1059             case CURLY:
1060                 mincount = ARG1(scan);
1061                 maxcount = ARG2(scan);
1062                 next = regnext(scan);
1063                 if (OP(scan) == CURLYX) {
1064                     I32 lp = (data ? *(data->last_closep) : 0);
1065
1066                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067                 }
1068                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069                 next_is_eval = (OP(scan) == EVAL);
1070               do_curly:
1071                 if (flags & SCF_DO_SUBSTR) {
1072                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073                     pos_before = data->pos_min;
1074                 }
1075                 if (data) {
1076                     fl = data->flags;
1077                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078                     if (is_inf)
1079                         data->flags |= SF_IS_INF;
1080                 }
1081                 if (flags & SCF_DO_STCLASS) {
1082                     cl_init(pRExC_state, &this_class);
1083                     oclass = data->start_class;
1084                     data->start_class = &this_class;
1085                     f |= SCF_DO_STCLASS_AND;
1086                     f &= ~SCF_DO_STCLASS_OR;
1087                 }
1088                 /* These are the cases when once a subexpression
1089                    fails at a particular position, it cannot succeed
1090                    even after backtracking at the enclosing scope.
1091                 
1092                    XXXX what if minimal match and we are at the
1093                         initial run of {n,m}? */
1094                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095                     f &= ~SCF_WHILEM_VISITED_POS;
1096
1097                 /* This will finish on WHILEM, setting scan, or on NULL: */
1098                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099                                       mincount == 0
1100                                         ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102                 if (flags & SCF_DO_STCLASS)
1103                     data->start_class = oclass;
1104                 if (mincount == 0 || minnext == 0) {
1105                     if (flags & SCF_DO_STCLASS_OR) {
1106                         cl_or(pRExC_state, data->start_class, &this_class);
1107                     }
1108                     else if (flags & SCF_DO_STCLASS_AND) {
1109                         /* Switch to OR mode: cache the old value of
1110                          * data->start_class */
1111                         StructCopy(data->start_class, &and_with,
1112                                    struct regnode_charclass_class);
1113                         flags &= ~SCF_DO_STCLASS_AND;
1114                         StructCopy(&this_class, data->start_class,
1115                                    struct regnode_charclass_class);
1116                         flags |= SCF_DO_STCLASS_OR;
1117                         data->start_class->flags |= ANYOF_EOS;
1118                     }
1119                 } else {                /* Non-zero len */
1120                     if (flags & SCF_DO_STCLASS_OR) {
1121                         cl_or(pRExC_state, data->start_class, &this_class);
1122                         cl_and(data->start_class, &and_with);
1123                     }
1124                     else if (flags & SCF_DO_STCLASS_AND)
1125                         cl_and(data->start_class, &this_class);
1126                     flags &= ~SCF_DO_STCLASS;
1127                 }
1128                 if (!scan)              /* It was not CURLYX, but CURLY. */
1129                     scan = next;
1130                 if (ckWARN(WARN_REGEXP)
1131                        /* ? quantifier ok, except for (?{ ... }) */
1132                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1133                     && (minnext == 0) && (deltanext == 0)
1134                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1135                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1136                 {
1137                     vWARN(RExC_parse,
1138                           "Quantifier unexpected on zero-length expression");
1139                 }
1140
1141                 min += minnext * mincount;
1142                 is_inf_internal |= ((maxcount == REG_INFTY
1143                                      && (minnext + deltanext) > 0)
1144                                     || deltanext == I32_MAX);
1145                 is_inf |= is_inf_internal;
1146                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148                 /* Try powerful optimization CURLYX => CURLYN. */
1149                 if (  OP(oscan) == CURLYX && data
1150                       && data->flags & SF_IN_PAR
1151                       && !(data->flags & SF_HAS_EVAL)
1152                       && !deltanext && minnext == 1 ) {
1153                     /* Try to optimize to CURLYN.  */
1154                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155                     regnode *nxt1 = nxt;
1156 #ifdef DEBUGGING
1157                     regnode *nxt2;
1158 #endif
1159
1160                     /* Skip open. */
1161                     nxt = regnext(nxt);
1162                     if (!strchr((char*)PL_simple,OP(nxt))
1163                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164                              && STR_LEN(nxt) == 1))
1165                         goto nogo;
1166 #ifdef DEBUGGING
1167                     nxt2 = nxt;
1168 #endif
1169                     nxt = regnext(nxt);
1170                     if (OP(nxt) != CLOSE)
1171                         goto nogo;
1172                     /* Now we know that nxt2 is the only contents: */
1173                     oscan->flags = (U8)ARG(nxt);
1174                     OP(oscan) = CURLYN;
1175                     OP(nxt1) = NOTHING; /* was OPEN. */
1176 #ifdef DEBUGGING
1177                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1181                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1182                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183 #endif
1184                 }
1185               nogo:
1186
1187                 /* Try optimization CURLYX => CURLYM. */
1188                 if (  OP(oscan) == CURLYX && data
1189                       && !(data->flags & SF_HAS_PAR)
1190                       && !(data->flags & SF_HAS_EVAL)
1191                       && !deltanext  ) {
1192                     /* XXXX How to optimize if data == 0? */
1193                     /* Optimize to a simpler form.  */
1194                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1195                     regnode *nxt2;
1196
1197                     OP(oscan) = CURLYM;
1198                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1199                             && (OP(nxt2) != WHILEM))
1200                         nxt = nxt2;
1201                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1202                     /* Need to optimize away parenths. */
1203                     if (data->flags & SF_IN_PAR) {
1204                         /* Set the parenth number.  */
1205                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1206
1207                         if (OP(nxt) != CLOSE)
1208                             FAIL("Panic opt close");
1209                         oscan->flags = (U8)ARG(nxt);
1210                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1211                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1212 #ifdef DEBUGGING
1213                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1214                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1215                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1216                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1217 #endif
1218 #if 0
1219                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1220                             regnode *nnxt = regnext(nxt1);
1221                         
1222                             if (nnxt == nxt) {
1223                                 if (reg_off_by_arg[OP(nxt1)])
1224                                     ARG_SET(nxt1, nxt2 - nxt1);
1225                                 else if (nxt2 - nxt1 < U16_MAX)
1226                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1227                                 else
1228                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1229                             }
1230                             nxt1 = nnxt;
1231                         }
1232 #endif
1233                         /* Optimize again: */
1234                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1235                                     NULL, 0);
1236                     }
1237                     else
1238                         oscan->flags = 0;
1239                 }
1240                 else if ((OP(oscan) == CURLYX)
1241                          && (flags & SCF_WHILEM_VISITED_POS)
1242                          /* See the comment on a similar expression above.
1243                             However, this time it not a subexpression
1244                             we care about, but the expression itself. */
1245                          && (maxcount == REG_INFTY)
1246                          && data && ++data->whilem_c < 16) {
1247                     /* This stays as CURLYX, we can put the count/of pair. */
1248                     /* Find WHILEM (as in regexec.c) */
1249                     regnode *nxt = oscan + NEXT_OFF(oscan);
1250
1251                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1252                         nxt += ARG(nxt);
1253                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1254                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1255                 }
1256                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1257                     pars++;
1258                 if (flags & SCF_DO_SUBSTR) {
1259                     SV *last_str = Nullsv;
1260                     int counted = mincount != 0;
1261
1262                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1263 #if defined(SPARC64_GCC_WORKAROUND)
1264                         I32 b = 0;
1265                         STRLEN l = 0;
1266                         char *s = NULL;
1267                         I32 old = 0;
1268
1269                         if (pos_before >= data->last_start_min)
1270                             b = pos_before;
1271                         else
1272                             b = data->last_start_min;
1273
1274                         l = 0;
1275                         s = SvPV(data->last_found, l);
1276                         old = b - data->last_start_min;
1277
1278 #else
1279                         I32 b = pos_before >= data->last_start_min
1280                             ? pos_before : data->last_start_min;
1281                         STRLEN l;
1282                         char *s = SvPV(data->last_found, l);
1283                         I32 old = b - data->last_start_min;
1284 #endif
1285
1286                         if (UTF)
1287                             old = utf8_hop((U8*)s, old) - (U8*)s;
1288                         
1289                         l -= old;
1290                         /* Get the added string: */
1291                         last_str = newSVpvn(s  + old, l);
1292                         if (UTF)
1293                             SvUTF8_on(last_str);
1294                         if (deltanext == 0 && pos_before == b) {
1295                             /* What was added is a constant string */
1296                             if (mincount > 1) {
1297                                 SvGROW(last_str, (mincount * l) + 1);
1298                                 repeatcpy(SvPVX(last_str) + l,
1299                                           SvPVX(last_str), l, mincount - 1);
1300                                 SvCUR(last_str) *= mincount;
1301                                 /* Add additional parts. */
1302                                 SvCUR_set(data->last_found,
1303                                           SvCUR(data->last_found) - l);
1304                                 sv_catsv(data->last_found, last_str);
1305                                 {
1306                                     SV * sv = data->last_found;
1307                                     MAGIC *mg =
1308                                         SvUTF8(sv) && SvMAGICAL(sv) ?
1309                                         mg_find(sv, PERL_MAGIC_utf8) : NULL;
1310                                     if (mg && mg->mg_len >= 0)
1311                                         mg->mg_len += CHR_SVLEN(last_str);
1312                                 }
1313                                 data->last_end += l * (mincount - 1);
1314                             }
1315                         } else {
1316                             /* start offset must point into the last copy */
1317                             data->last_start_min += minnext * (mincount - 1);
1318                             data->last_start_max += is_inf ? I32_MAX
1319                                 : (maxcount - 1) * (minnext + data->pos_delta);
1320                         }
1321                     }
1322                     /* It is counted once already... */
1323                     data->pos_min += minnext * (mincount - counted);
1324                     data->pos_delta += - counted * deltanext +
1325                         (minnext + deltanext) * maxcount - minnext * mincount;
1326                     if (mincount != maxcount) {
1327                          /* Cannot extend fixed substrings found inside
1328                             the group.  */
1329                         scan_commit(pRExC_state,data);
1330                         if (mincount && last_str) {
1331                             sv_setsv(data->last_found, last_str);
1332                             data->last_end = data->pos_min;
1333                             data->last_start_min =
1334                                 data->pos_min - CHR_SVLEN(last_str);
1335                             data->last_start_max = is_inf
1336                                 ? I32_MAX
1337                                 : data->pos_min + data->pos_delta
1338                                 - CHR_SVLEN(last_str);
1339                         }
1340                         data->longest = &(data->longest_float);
1341                     }
1342                     SvREFCNT_dec(last_str);
1343                 }
1344                 if (data && (fl & SF_HAS_EVAL))
1345                     data->flags |= SF_HAS_EVAL;
1346               optimize_curly_tail:
1347                 if (OP(oscan) != CURLYX) {
1348                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1349                            && NEXT_OFF(next))
1350                         NEXT_OFF(oscan) += NEXT_OFF(next);
1351                 }
1352                 continue;
1353             default:                    /* REF and CLUMP only? */
1354                 if (flags & SCF_DO_SUBSTR) {
1355                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1356                     data->longest = &(data->longest_float);
1357                 }
1358                 is_inf = is_inf_internal = 1;
1359                 if (flags & SCF_DO_STCLASS_OR)
1360                     cl_anything(pRExC_state, data->start_class);
1361                 flags &= ~SCF_DO_STCLASS;
1362                 break;
1363             }
1364         }
1365         else if (strchr((char*)PL_simple,OP(scan))) {
1366             int value = 0;
1367
1368             if (flags & SCF_DO_SUBSTR) {
1369                 scan_commit(pRExC_state,data);
1370                 data->pos_min++;
1371             }
1372             min++;
1373             if (flags & SCF_DO_STCLASS) {
1374                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1375
1376                 /* Some of the logic below assumes that switching
1377                    locale on will only add false positives. */
1378                 switch (PL_regkind[(U8)OP(scan)]) {
1379                 case SANY:
1380                 default:
1381                   do_default:
1382                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1383                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1384                         cl_anything(pRExC_state, data->start_class);
1385                     break;
1386                 case REG_ANY:
1387                     if (OP(scan) == SANY)
1388                         goto do_default;
1389                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1390                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1391                                  || (data->start_class->flags & ANYOF_CLASS));
1392                         cl_anything(pRExC_state, data->start_class);
1393                     }
1394                     if (flags & SCF_DO_STCLASS_AND || !value)
1395                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1396                     break;
1397                 case ANYOF:
1398                     if (flags & SCF_DO_STCLASS_AND)
1399                         cl_and(data->start_class,
1400                                (struct regnode_charclass_class*)scan);
1401                     else
1402                         cl_or(pRExC_state, data->start_class,
1403                               (struct regnode_charclass_class*)scan);
1404                     break;
1405                 case ALNUM:
1406                     if (flags & SCF_DO_STCLASS_AND) {
1407                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1409                             for (value = 0; value < 256; value++)
1410                                 if (!isALNUM(value))
1411                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1412                         }
1413                     }
1414                     else {
1415                         if (data->start_class->flags & ANYOF_LOCALE)
1416                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1417                         else {
1418                             for (value = 0; value < 256; value++)
1419                                 if (isALNUM(value))
1420                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1421                         }
1422                     }
1423                     break;
1424                 case ALNUML:
1425                     if (flags & SCF_DO_STCLASS_AND) {
1426                         if (data->start_class->flags & ANYOF_LOCALE)
1427                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1428                     }
1429                     else {
1430                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1431                         data->start_class->flags |= ANYOF_LOCALE;
1432                     }
1433                     break;
1434                 case NALNUM:
1435                     if (flags & SCF_DO_STCLASS_AND) {
1436                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1438                             for (value = 0; value < 256; value++)
1439                                 if (isALNUM(value))
1440                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1441                         }
1442                     }
1443                     else {
1444                         if (data->start_class->flags & ANYOF_LOCALE)
1445                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1446                         else {
1447                             for (value = 0; value < 256; value++)
1448                                 if (!isALNUM(value))
1449                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1450                         }
1451                     }
1452                     break;
1453                 case NALNUML:
1454                     if (flags & SCF_DO_STCLASS_AND) {
1455                         if (data->start_class->flags & ANYOF_LOCALE)
1456                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1457                     }
1458                     else {
1459                         data->start_class->flags |= ANYOF_LOCALE;
1460                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1461                     }
1462                     break;
1463                 case SPACE:
1464                     if (flags & SCF_DO_STCLASS_AND) {
1465                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1467                             for (value = 0; value < 256; value++)
1468                                 if (!isSPACE(value))
1469                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1470                         }
1471                     }
1472                     else {
1473                         if (data->start_class->flags & ANYOF_LOCALE)
1474                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1475                         else {
1476                             for (value = 0; value < 256; value++)
1477                                 if (isSPACE(value))
1478                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1479                         }
1480                     }
1481                     break;
1482                 case SPACEL:
1483                     if (flags & SCF_DO_STCLASS_AND) {
1484                         if (data->start_class->flags & ANYOF_LOCALE)
1485                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1486                     }
1487                     else {
1488                         data->start_class->flags |= ANYOF_LOCALE;
1489                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1490                     }
1491                     break;
1492                 case NSPACE:
1493                     if (flags & SCF_DO_STCLASS_AND) {
1494                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1495                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1496                             for (value = 0; value < 256; value++)
1497                                 if (isSPACE(value))
1498                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1499                         }
1500                     }
1501                     else {
1502                         if (data->start_class->flags & ANYOF_LOCALE)
1503                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1504                         else {
1505                             for (value = 0; value < 256; value++)
1506                                 if (!isSPACE(value))
1507                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1508                         }
1509                     }
1510                     break;
1511                 case NSPACEL:
1512                     if (flags & SCF_DO_STCLASS_AND) {
1513                         if (data->start_class->flags & ANYOF_LOCALE) {
1514                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1515                             for (value = 0; value < 256; value++)
1516                                 if (!isSPACE(value))
1517                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1518                         }
1519                     }
1520                     else {
1521                         data->start_class->flags |= ANYOF_LOCALE;
1522                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1523                     }
1524                     break;
1525                 case DIGIT:
1526                     if (flags & SCF_DO_STCLASS_AND) {
1527                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1528                         for (value = 0; value < 256; value++)
1529                             if (!isDIGIT(value))
1530                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1531                     }
1532                     else {
1533                         if (data->start_class->flags & ANYOF_LOCALE)
1534                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1535                         else {
1536                             for (value = 0; value < 256; value++)
1537                                 if (isDIGIT(value))
1538                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1539                         }
1540                     }
1541                     break;
1542                 case NDIGIT:
1543                     if (flags & SCF_DO_STCLASS_AND) {
1544                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1545                         for (value = 0; value < 256; value++)
1546                             if (isDIGIT(value))
1547                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1548                     }
1549                     else {
1550                         if (data->start_class->flags & ANYOF_LOCALE)
1551                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1552                         else {
1553                             for (value = 0; value < 256; value++)
1554                                 if (!isDIGIT(value))
1555                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1556                         }
1557                     }
1558                     break;
1559                 }
1560                 if (flags & SCF_DO_STCLASS_OR)
1561                     cl_and(data->start_class, &and_with);
1562                 flags &= ~SCF_DO_STCLASS;
1563             }
1564         }
1565         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1566             data->flags |= (OP(scan) == MEOL
1567                             ? SF_BEFORE_MEOL
1568                             : SF_BEFORE_SEOL);
1569         }
1570         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1571                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1572                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1573                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1574             /* Lookahead/lookbehind */
1575             I32 deltanext, minnext, fake = 0;
1576             regnode *nscan;
1577             struct regnode_charclass_class intrnl;
1578             int f = 0;
1579
1580             data_fake.flags = 0;
1581             if (data) {         
1582                 data_fake.whilem_c = data->whilem_c;
1583                 data_fake.last_closep = data->last_closep;
1584             }
1585             else
1586                 data_fake.last_closep = &fake;
1587             if ( flags & SCF_DO_STCLASS && !scan->flags
1588                  && OP(scan) == IFMATCH ) { /* Lookahead */
1589                 cl_init(pRExC_state, &intrnl);
1590                 data_fake.start_class = &intrnl;
1591                 f |= SCF_DO_STCLASS_AND;
1592             }
1593             if (flags & SCF_WHILEM_VISITED_POS)
1594                 f |= SCF_WHILEM_VISITED_POS;
1595             next = regnext(scan);
1596             nscan = NEXTOPER(NEXTOPER(scan));
1597             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1598             if (scan->flags) {
1599                 if (deltanext) {
1600                     vFAIL("Variable length lookbehind not implemented");
1601                 }
1602                 else if (minnext > U8_MAX) {
1603                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1604                 }
1605                 scan->flags = (U8)minnext;
1606             }
1607             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1608                 pars++;
1609             if (data && (data_fake.flags & SF_HAS_EVAL))
1610                 data->flags |= SF_HAS_EVAL;
1611             if (data)
1612                 data->whilem_c = data_fake.whilem_c;
1613             if (f & SCF_DO_STCLASS_AND) {
1614                 int was = (data->start_class->flags & ANYOF_EOS);
1615
1616                 cl_and(data->start_class, &intrnl);
1617                 if (was)
1618                     data->start_class->flags |= ANYOF_EOS;
1619             }
1620         }
1621         else if (OP(scan) == OPEN) {
1622             pars++;
1623         }
1624         else if (OP(scan) == CLOSE) {
1625             if ((I32)ARG(scan) == is_par) {
1626                 next = regnext(scan);
1627
1628                 if ( next && (OP(next) != WHILEM) && next < last)
1629                     is_par = 0;         /* Disable optimization */
1630             }
1631             if (data)
1632                 *(data->last_closep) = ARG(scan);
1633         }
1634         else if (OP(scan) == EVAL) {
1635                 if (data)
1636                     data->flags |= SF_HAS_EVAL;
1637         }
1638         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1639                 if (flags & SCF_DO_SUBSTR) {
1640                     scan_commit(pRExC_state,data);
1641                     data->longest = &(data->longest_float);
1642                 }
1643                 is_inf = is_inf_internal = 1;
1644                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1645                     cl_anything(pRExC_state, data->start_class);
1646                 flags &= ~SCF_DO_STCLASS;
1647         }
1648         /* Else: zero-length, ignore. */
1649         scan = regnext(scan);
1650     }
1651
1652   finish:
1653     *scanp = scan;
1654     *deltap = is_inf_internal ? I32_MAX : delta;
1655     if (flags & SCF_DO_SUBSTR && is_inf)
1656         data->pos_delta = I32_MAX - data->pos_min;
1657     if (is_par > U8_MAX)
1658         is_par = 0;
1659     if (is_par && pars==1 && data) {
1660         data->flags |= SF_IN_PAR;
1661         data->flags &= ~SF_HAS_PAR;
1662     }
1663     else if (pars && data) {
1664         data->flags |= SF_HAS_PAR;
1665         data->flags &= ~SF_IN_PAR;
1666     }
1667     if (flags & SCF_DO_STCLASS_OR)
1668         cl_and(data->start_class, &and_with);
1669     return min;
1670 }
1671
1672 STATIC I32
1673 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1674 {
1675     if (RExC_rx->data) {
1676         Renewc(RExC_rx->data,
1677                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1678                char, struct reg_data);
1679         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1680         RExC_rx->data->count += n;
1681     }
1682     else {
1683         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1684              char, struct reg_data);
1685         New(1208, RExC_rx->data->what, n, U8);
1686         RExC_rx->data->count = n;
1687     }
1688     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1689     return RExC_rx->data->count - n;
1690 }
1691
1692 void
1693 Perl_reginitcolors(pTHX)
1694 {
1695     int i = 0;
1696     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1697         
1698     if (s) {
1699         PL_colors[0] = s = savepv(s);
1700         while (++i < 6) {
1701             s = strchr(s, '\t');
1702             if (s) {
1703                 *s = '\0';
1704                 PL_colors[i] = ++s;
1705             }
1706             else
1707                 PL_colors[i] = s = "";
1708         }
1709     } else {
1710         while (i < 6)
1711             PL_colors[i++] = "";
1712     }
1713     PL_colorset = 1;
1714 }
1715
1716
1717 /*
1718  - pregcomp - compile a regular expression into internal code
1719  *
1720  * We can't allocate space until we know how big the compiled form will be,
1721  * but we can't compile it (and thus know how big it is) until we've got a
1722  * place to put the code.  So we cheat:  we compile it twice, once with code
1723  * generation turned off and size counting turned on, and once "for real".
1724  * This also means that we don't allocate space until we are sure that the
1725  * thing really will compile successfully, and we never have to move the
1726  * code and thus invalidate pointers into it.  (Note that it has to be in
1727  * one piece because free() must be able to free it all.) [NB: not true in perl]
1728  *
1729  * Beware that the optimization-preparation code in here knows about some
1730  * of the structure of the compiled regexp.  [I'll say.]
1731  */
1732 regexp *
1733 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1734 {
1735     register regexp *r;
1736     regnode *scan;
1737     regnode *first;
1738     I32 flags;
1739     I32 minlen = 0;
1740     I32 sawplus = 0;
1741     I32 sawopen = 0;
1742     scan_data_t data;
1743     RExC_state_t RExC_state;
1744     RExC_state_t *pRExC_state = &RExC_state;
1745
1746     if (exp == NULL)
1747         FAIL("NULL regexp argument");
1748
1749     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1750
1751     RExC_precomp = exp;
1752     DEBUG_r({
1753          if (!PL_colorset) reginitcolors();
1754          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1755                        PL_colors[4],PL_colors[5],PL_colors[0],
1756                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1757     });
1758     RExC_flags = pm->op_pmflags;
1759     RExC_sawback = 0;
1760
1761     RExC_seen = 0;
1762     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1763     RExC_seen_evals = 0;
1764     RExC_extralen = 0;
1765
1766     /* First pass: determine size, legality. */
1767     RExC_parse = exp;
1768     RExC_start = exp;
1769     RExC_end = xend;
1770     RExC_naughty = 0;
1771     RExC_npar = 1;
1772     RExC_size = 0L;
1773     RExC_emit = &PL_regdummy;
1774     RExC_whilem_seen = 0;
1775 #if 0 /* REGC() is (currently) a NOP at the first pass.
1776        * Clever compilers notice this and complain. --jhi */
1777     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1778 #endif
1779     if (reg(pRExC_state, 0, &flags) == NULL) {
1780         RExC_precomp = Nullch;
1781         return(NULL);
1782     }
1783     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1784
1785     /* Small enough for pointer-storage convention?
1786        If extralen==0, this means that we will not need long jumps. */
1787     if (RExC_size >= 0x10000L && RExC_extralen)
1788         RExC_size += RExC_extralen;
1789     else
1790         RExC_extralen = 0;
1791     if (RExC_whilem_seen > 15)
1792         RExC_whilem_seen = 15;
1793
1794     /* Allocate space and initialize. */
1795     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1796          char, regexp);
1797     if (r == NULL)
1798         FAIL("Regexp out of space");
1799
1800 #ifdef DEBUGGING
1801     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1802     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1803 #endif
1804     r->refcnt = 1;
1805     r->prelen = xend - exp;
1806     r->precomp = savepvn(RExC_precomp, r->prelen);
1807     r->subbeg = NULL;
1808     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1809     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1810
1811     r->substrs = 0;                     /* Useful during FAIL. */
1812     r->startp = 0;                      /* Useful during FAIL. */
1813     r->endp = 0;                        /* Useful during FAIL. */
1814
1815     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1816     if (r->offsets) {
1817       r->offsets[0] = RExC_size; 
1818     }
1819     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1820                           "%s %"UVuf" bytes for offset annotations.\n", 
1821                           r->offsets ? "Got" : "Couldn't get", 
1822                           (UV)((2*RExC_size+1) * sizeof(U32))));
1823
1824     RExC_rx = r;
1825
1826     /* Second pass: emit code. */
1827     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1828     RExC_parse = exp;
1829     RExC_end = xend;
1830     RExC_naughty = 0;
1831     RExC_npar = 1;
1832     RExC_emit_start = r->program;
1833     RExC_emit = r->program;
1834     /* Store the count of eval-groups for security checks: */
1835     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1836     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1837     r->data = 0;
1838     if (reg(pRExC_state, 0, &flags) == NULL)
1839         return(NULL);
1840
1841     /* Dig out information for optimizations. */
1842     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1843     pm->op_pmflags = RExC_flags;
1844     if (UTF)
1845         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1846     r->regstclass = NULL;
1847     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1848         r->reganch |= ROPT_NAUGHTY;
1849     scan = r->program + 1;              /* First BRANCH. */
1850
1851     /* XXXX To minimize changes to RE engine we always allocate
1852        3-units-long substrs field. */
1853     Newz(1004, r->substrs, 1, struct reg_substr_data);
1854
1855     StructCopy(&zero_scan_data, &data, scan_data_t);
1856     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1857     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1858         I32 fake;
1859         STRLEN longest_float_length, longest_fixed_length;
1860         struct regnode_charclass_class ch_class;
1861         int stclass_flag;
1862         I32 last_close = 0;
1863
1864         first = scan;
1865         /* Skip introductions and multiplicators >= 1. */
1866         while ((OP(first) == OPEN && (sawopen = 1)) ||
1867                /* An OR of *one* alternative - should not happen now. */
1868             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1869             (OP(first) == PLUS) ||
1870             (OP(first) == MINMOD) ||
1871                /* An {n,m} with n>0 */
1872             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1873                 if (OP(first) == PLUS)
1874                     sawplus = 1;
1875                 else
1876                     first += regarglen[(U8)OP(first)];
1877                 first = NEXTOPER(first);
1878         }
1879
1880         /* Starting-point info. */
1881       again:
1882         if (PL_regkind[(U8)OP(first)] == EXACT) {
1883             if (OP(first) == EXACT)
1884                 ;       /* Empty, get anchored substr later. */
1885             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1886                 r->regstclass = first;
1887         }
1888         else if (strchr((char*)PL_simple,OP(first)))
1889             r->regstclass = first;
1890         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1891                  PL_regkind[(U8)OP(first)] == NBOUND)
1892             r->regstclass = first;
1893         else if (PL_regkind[(U8)OP(first)] == BOL) {
1894             r->reganch |= (OP(first) == MBOL
1895                            ? ROPT_ANCH_MBOL
1896                            : (OP(first) == SBOL
1897                               ? ROPT_ANCH_SBOL
1898                               : ROPT_ANCH_BOL));
1899             first = NEXTOPER(first);
1900             goto again;
1901         }
1902         else if (OP(first) == GPOS) {
1903             r->reganch |= ROPT_ANCH_GPOS;
1904             first = NEXTOPER(first);
1905             goto again;
1906         }
1907         else if (!sawopen && (OP(first) == STAR &&
1908             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1909             !(r->reganch & ROPT_ANCH) )
1910         {
1911             /* turn .* into ^.* with an implied $*=1 */
1912             int type = OP(NEXTOPER(first));
1913
1914             if (type == REG_ANY)
1915                 type = ROPT_ANCH_MBOL;
1916             else
1917                 type = ROPT_ANCH_SBOL;
1918
1919             r->reganch |= type | ROPT_IMPLICIT;
1920             first = NEXTOPER(first);
1921             goto again;
1922         }
1923         if (sawplus && (!sawopen || !RExC_sawback)
1924             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1925             /* x+ must match at the 1st pos of run of x's */
1926             r->reganch |= ROPT_SKIP;
1927
1928         /* Scan is after the zeroth branch, first is atomic matcher. */
1929         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1930                               (IV)(first - scan + 1)));
1931         /*
1932         * If there's something expensive in the r.e., find the
1933         * longest literal string that must appear and make it the
1934         * regmust.  Resolve ties in favor of later strings, since
1935         * the regstart check works with the beginning of the r.e.
1936         * and avoiding duplication strengthens checking.  Not a
1937         * strong reason, but sufficient in the absence of others.
1938         * [Now we resolve ties in favor of the earlier string if
1939         * it happens that c_offset_min has been invalidated, since the
1940         * earlier string may buy us something the later one won't.]
1941         */
1942         minlen = 0;
1943
1944         data.longest_fixed = newSVpvn("",0);
1945         data.longest_float = newSVpvn("",0);
1946         data.last_found = newSVpvn("",0);
1947         data.longest = &(data.longest_fixed);
1948         first = scan;
1949         if (!r->regstclass) {
1950             cl_init(pRExC_state, &ch_class);
1951             data.start_class = &ch_class;
1952             stclass_flag = SCF_DO_STCLASS_AND;
1953         } else                          /* XXXX Check for BOUND? */
1954             stclass_flag = 0;
1955         data.last_closep = &last_close;
1956
1957         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1958                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1959         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1960              && data.last_start_min == 0 && data.last_end > 0
1961              && !RExC_seen_zerolen
1962              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1963             r->reganch |= ROPT_CHECK_ALL;
1964         scan_commit(pRExC_state, &data);
1965         SvREFCNT_dec(data.last_found);
1966
1967         longest_float_length = CHR_SVLEN(data.longest_float);
1968         if (longest_float_length
1969             || (data.flags & SF_FL_BEFORE_EOL
1970                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971                     || (RExC_flags & PMf_MULTILINE)))) {
1972             int t;
1973
1974             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1975                 && data.offset_fixed == data.offset_float_min
1976                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977                     goto remove_float;          /* As in (a)+. */
1978
1979             if (SvUTF8(data.longest_float)) {
1980                 r->float_utf8 = data.longest_float;
1981                 r->float_substr = Nullsv;
1982             } else {
1983                 r->float_substr = data.longest_float;
1984                 r->float_utf8 = Nullsv;
1985             }
1986             r->float_min_offset = data.offset_float_min;
1987             r->float_max_offset = data.offset_float_max;
1988             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1990                            || (RExC_flags & PMf_MULTILINE)));
1991             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1992         }
1993         else {
1994           remove_float:
1995             r->float_substr = r->float_utf8 = Nullsv;
1996             SvREFCNT_dec(data.longest_float);
1997             longest_float_length = 0;
1998         }
1999
2000         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2001         if (longest_fixed_length
2002             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2004                     || (RExC_flags & PMf_MULTILINE)))) {
2005             int t;
2006
2007             if (SvUTF8(data.longest_fixed)) {
2008                 r->anchored_utf8 = data.longest_fixed;
2009                 r->anchored_substr = Nullsv;
2010             } else {
2011                 r->anchored_substr = data.longest_fixed;
2012                 r->anchored_utf8 = Nullsv;
2013             }
2014             r->anchored_offset = data.offset_fixed;
2015             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2017                      || (RExC_flags & PMf_MULTILINE)));
2018             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2019         }
2020         else {
2021             r->anchored_substr = r->anchored_utf8 = Nullsv;
2022             SvREFCNT_dec(data.longest_fixed);
2023             longest_fixed_length = 0;
2024         }
2025         if (r->regstclass
2026             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2027             r->regstclass = NULL;
2028         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2029             && stclass_flag
2030             && !(data.start_class->flags & ANYOF_EOS)
2031             && !cl_is_anything(data.start_class))
2032         {
2033             I32 n = add_data(pRExC_state, 1, "f");
2034
2035             New(1006, RExC_rx->data->data[n], 1,
2036                 struct regnode_charclass_class);
2037             StructCopy(data.start_class,
2038                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2039                        struct regnode_charclass_class);
2040             r->regstclass = (regnode*)RExC_rx->data->data[n];
2041             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2042             PL_regdata = r->data; /* for regprop() */
2043             DEBUG_r({ SV *sv = sv_newmortal();
2044                       regprop(sv, (regnode*)data.start_class);
2045                       PerlIO_printf(Perl_debug_log,
2046                                     "synthetic stclass `%s'.\n",
2047                                     SvPVX(sv));});
2048         }
2049
2050         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2051         if (longest_fixed_length > longest_float_length) {
2052             r->check_substr = r->anchored_substr;
2053             r->check_utf8 = r->anchored_utf8;
2054             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055             if (r->reganch & ROPT_ANCH_SINGLE)
2056                 r->reganch |= ROPT_NOSCAN;
2057         }
2058         else {
2059             r->check_substr = r->float_substr;
2060             r->check_utf8 = r->float_utf8;
2061             r->check_offset_min = data.offset_float_min;
2062             r->check_offset_max = data.offset_float_max;
2063         }
2064         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065            This should be changed ASAP!  */
2066         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2067             r->reganch |= RE_USE_INTUIT;
2068             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2069                 r->reganch |= RE_INTUIT_TAIL;
2070         }
2071     }
2072     else {
2073         /* Several toplevels. Best we can is to set minlen. */
2074         I32 fake;
2075         struct regnode_charclass_class ch_class;
2076         I32 last_close = 0;
2077         
2078         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079         scan = r->program + 1;
2080         cl_init(pRExC_state, &ch_class);
2081         data.start_class = &ch_class;
2082         data.last_closep = &last_close;
2083         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2084         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085                 = r->float_substr = r->float_utf8 = Nullsv;
2086         if (!(data.start_class->flags & ANYOF_EOS)
2087             && !cl_is_anything(data.start_class))
2088         {
2089             I32 n = add_data(pRExC_state, 1, "f");
2090
2091             New(1006, RExC_rx->data->data[n], 1,
2092                 struct regnode_charclass_class);
2093             StructCopy(data.start_class,
2094                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2095                        struct regnode_charclass_class);
2096             r->regstclass = (regnode*)RExC_rx->data->data[n];
2097             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2098             DEBUG_r({ SV* sv = sv_newmortal();
2099                       regprop(sv, (regnode*)data.start_class);
2100                       PerlIO_printf(Perl_debug_log,
2101                                     "synthetic stclass `%s'.\n",
2102                                     SvPVX(sv));});
2103         }
2104     }
2105
2106     r->minlen = minlen;
2107     if (RExC_seen & REG_SEEN_GPOS)
2108         r->reganch |= ROPT_GPOS_SEEN;
2109     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2110         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2111     if (RExC_seen & REG_SEEN_EVAL)
2112         r->reganch |= ROPT_EVAL_SEEN;
2113     if (RExC_seen & REG_SEEN_CANY)
2114         r->reganch |= ROPT_CANY_SEEN;
2115     Newz(1002, r->startp, RExC_npar, I32);
2116     Newz(1002, r->endp, RExC_npar, I32);
2117     PL_regdata = r->data; /* for regprop() */
2118     DEBUG_r(regdump(r));
2119     return(r);
2120 }
2121
2122 /*
2123  - reg - regular expression, i.e. main body or parenthesized thing
2124  *
2125  * Caller must absorb opening parenthesis.
2126  *
2127  * Combining parenthesis handling with the base level of regular expression
2128  * is a trifle forced, but the need to tie the tails of the branches to what
2129  * follows makes it hard to avoid.
2130  */
2131 STATIC regnode *
2132 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2133     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2134 {
2135     register regnode *ret;              /* Will be the head of the group. */
2136     register regnode *br;
2137     register regnode *lastbr;
2138     register regnode *ender = 0;
2139     register I32 parno = 0;
2140     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2141
2142     /* for (?g), (?gc), and (?o) warnings; warning
2143        about (?c) will warn about (?g) -- japhy    */
2144
2145     I32 wastedflags = 0x00,
2146         wasted_o    = 0x01,
2147         wasted_g    = 0x02,
2148         wasted_gc   = 0x02 | 0x04,
2149         wasted_c    = 0x04;
2150
2151     char * parse_start = RExC_parse; /* MJD */
2152     char *oregcomp_parse = RExC_parse;
2153     char c;
2154
2155     *flagp = 0;                         /* Tentatively. */
2156
2157
2158     /* Make an OPEN node, if parenthesized. */
2159     if (paren) {
2160         if (*RExC_parse == '?') { /* (?...) */
2161             U32 posflags = 0, negflags = 0;
2162             U32 *flagsp = &posflags;
2163             int logical = 0;
2164             char *seqstart = RExC_parse;
2165
2166             RExC_parse++;
2167             paren = *RExC_parse++;
2168             ret = NULL;                 /* For look-ahead/behind. */
2169             switch (paren) {
2170             case '<':           /* (?<...) */
2171                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2172                 if (*RExC_parse == '!')
2173                     paren = ',';
2174                 if (*RExC_parse != '=' && *RExC_parse != '!')
2175                     goto unknown;
2176                 RExC_parse++;
2177             case '=':           /* (?=...) */
2178             case '!':           /* (?!...) */
2179                 RExC_seen_zerolen++;
2180             case ':':           /* (?:...) */
2181             case '>':           /* (?>...) */
2182                 break;
2183             case '$':           /* (?$...) */
2184             case '@':           /* (?@...) */
2185                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2186                 break;
2187             case '#':           /* (?#...) */
2188                 while (*RExC_parse && *RExC_parse != ')')
2189                     RExC_parse++;
2190                 if (*RExC_parse != ')')
2191                     FAIL("Sequence (?#... not terminated");
2192                 nextchar(pRExC_state);
2193                 *flagp = TRYAGAIN;
2194                 return NULL;
2195             case 'p':           /* (?p...) */
2196                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2197                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2198                 /* FALL THROUGH*/
2199             case '?':           /* (??...) */
2200                 logical = 1;
2201                 if (*RExC_parse != '{')
2202                     goto unknown;
2203                 paren = *RExC_parse++;
2204                 /* FALL THROUGH */
2205             case '{':           /* (?{...}) */
2206             {
2207                 I32 count = 1, n = 0;
2208                 char c;
2209                 char *s = RExC_parse;
2210                 SV *sv;
2211                 OP_4tree *sop, *rop;
2212
2213                 RExC_seen_zerolen++;
2214                 RExC_seen |= REG_SEEN_EVAL;
2215                 while (count && (c = *RExC_parse)) {
2216                     if (c == '\\' && RExC_parse[1])
2217                         RExC_parse++;
2218                     else if (c == '{')
2219                         count++;
2220                     else if (c == '}')
2221                         count--;
2222                     RExC_parse++;
2223                 }
2224                 if (*RExC_parse != ')')
2225                 {
2226                     RExC_parse = s;             
2227                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228                 }
2229                 if (!SIZE_ONLY) {
2230                     PAD *pad;
2231                 
2232                     if (RExC_parse - 1 - s)
2233                         sv = newSVpvn(s, RExC_parse - 1 - s);
2234                     else
2235                         sv = newSVpvn("", 0);
2236
2237                     ENTER;
2238                     Perl_save_re_context(aTHX);
2239                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2240                     sop->op_private |= OPpREFCOUNTED;
2241                     /* re_dup will OpREFCNT_inc */
2242                     OpREFCNT_set(sop, 1);
2243                     LEAVE;
2244
2245                     n = add_data(pRExC_state, 3, "nop");
2246                     RExC_rx->data->data[n] = (void*)rop;
2247                     RExC_rx->data->data[n+1] = (void*)sop;
2248                     RExC_rx->data->data[n+2] = (void*)pad;
2249                     SvREFCNT_dec(sv);
2250                 }
2251                 else {                                          /* First pass */
2252                     if (PL_reginterp_cnt < ++RExC_seen_evals
2253                         && PL_curcop != &PL_compiling)
2254                         /* No compiled RE interpolated, has runtime
2255                            components ===> unsafe.  */
2256                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2257                     if (PL_tainting && PL_tainted)
2258                         FAIL("Eval-group in insecure regular expression");
2259                 }
2260                 
2261                 nextchar(pRExC_state);
2262                 if (logical) {
2263                     ret = reg_node(pRExC_state, LOGICAL);
2264                     if (!SIZE_ONLY)
2265                         ret->flags = 2;
2266                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2267                     /* deal with the length of this later - MJD */
2268                     return ret;
2269                 }
2270                 ret = reganode(pRExC_state, EVAL, n);
2271                 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272                 Set_Node_Offset(ret, parse_start);
2273                 return ret;
2274             }
2275             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2276             {
2277                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2278                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279                         || RExC_parse[1] == '<'
2280                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2281                         I32 flag;
2282                         
2283                         ret = reg_node(pRExC_state, LOGICAL);
2284                         if (!SIZE_ONLY)
2285                             ret->flags = 1;
2286                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2287                         goto insert_if;
2288                     }
2289                 }
2290                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2291                     /* (?(1)...) */
2292                     parno = atoi(RExC_parse++);
2293
2294                     while (isDIGIT(*RExC_parse))
2295                         RExC_parse++;
2296                     ret = reganode(pRExC_state, GROUPP, parno);
2297                     
2298                     if ((c = *nextchar(pRExC_state)) != ')')
2299                         vFAIL("Switch condition not recognized");
2300                   insert_if:
2301                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302                     br = regbranch(pRExC_state, &flags, 1);
2303                     if (br == NULL)
2304                         br = reganode(pRExC_state, LONGJMP, 0);
2305                     else
2306                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307                     c = *nextchar(pRExC_state);
2308                     if (flags&HASWIDTH)
2309                         *flagp |= HASWIDTH;
2310                     if (c == '|') {
2311                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312                         regbranch(pRExC_state, &flags, 1);
2313                         regtail(pRExC_state, ret, lastbr);
2314                         if (flags&HASWIDTH)
2315                             *flagp |= HASWIDTH;
2316                         c = *nextchar(pRExC_state);
2317                     }
2318                     else
2319                         lastbr = NULL;
2320                     if (c != ')')
2321                         vFAIL("Switch (?(condition)... contains too many branches");
2322                     ender = reg_node(pRExC_state, TAIL);
2323                     regtail(pRExC_state, br, ender);
2324                     if (lastbr) {
2325                         regtail(pRExC_state, lastbr, ender);
2326                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2327                     }
2328                     else
2329                         regtail(pRExC_state, ret, ender);
2330                     return ret;
2331                 }
2332                 else {
2333                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2334                 }
2335             }
2336             case 0:
2337                 RExC_parse--; /* for vFAIL to print correctly */
2338                 vFAIL("Sequence (? incomplete");
2339                 break;
2340             default:
2341                 --RExC_parse;
2342               parse_flags:      /* (?i) */
2343                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2344                     /* (?g), (?gc) and (?o) are useless here
2345                        and must be globally applied -- japhy */
2346
2347                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350                             if (! (wastedflags & wflagbit) ) {
2351                                 wastedflags |= wflagbit;
2352                                 vWARN5(
2353                                     RExC_parse + 1,
2354                                     "Useless (%s%c) - %suse /%c modifier",
2355                                     flagsp == &negflags ? "?-" : "?",
2356                                     *RExC_parse,
2357                                     flagsp == &negflags ? "don't " : "",
2358                                     *RExC_parse
2359                                 );
2360                             }
2361                         }
2362                     }
2363                     else if (*RExC_parse == 'c') {
2364                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365                             if (! (wastedflags & wasted_c) ) {
2366                                 wastedflags |= wasted_gc;
2367                                 vWARN3(
2368                                     RExC_parse + 1,
2369                                     "Useless (%sc) - %suse /gc modifier",
2370                                     flagsp == &negflags ? "?-" : "?",
2371                                     flagsp == &negflags ? "don't " : ""
2372                                 );
2373                             }
2374                         }
2375                     }
2376                     else { pmflag(flagsp, *RExC_parse); }
2377
2378                     ++RExC_parse;
2379                 }
2380                 if (*RExC_parse == '-') {
2381                     flagsp = &negflags;
2382                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2383                     ++RExC_parse;
2384                     goto parse_flags;
2385                 }
2386                 RExC_flags |= posflags;
2387                 RExC_flags &= ~negflags;
2388                 if (*RExC_parse == ':') {
2389                     RExC_parse++;
2390                     paren = ':';
2391                     break;
2392                 }               
2393               unknown:
2394                 if (*RExC_parse != ')') {
2395                     RExC_parse++;
2396                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2397                 }
2398                 nextchar(pRExC_state);
2399                 *flagp = TRYAGAIN;
2400                 return NULL;
2401             }
2402         }
2403         else {                  /* (...) */
2404             parno = RExC_npar;
2405             RExC_npar++;
2406             ret = reganode(pRExC_state, OPEN, parno);
2407             Set_Node_Length(ret, 1); /* MJD */
2408             Set_Node_Offset(ret, RExC_parse); /* MJD */
2409             open = 1;
2410         }
2411     }
2412     else                        /* ! paren */
2413         ret = NULL;
2414
2415     /* Pick up the branches, linking them together. */
2416     parse_start = RExC_parse;   /* MJD */
2417     br = regbranch(pRExC_state, &flags, 1);
2418     /*     branch_len = (paren != 0); */
2419     
2420     if (br == NULL)
2421         return(NULL);
2422     if (*RExC_parse == '|') {
2423         if (!SIZE_ONLY && RExC_extralen) {
2424             reginsert(pRExC_state, BRANCHJ, br);
2425         }
2426         else {                  /* MJD */
2427             reginsert(pRExC_state, BRANCH, br);
2428             Set_Node_Length(br, paren != 0);
2429             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2430         }
2431         have_branch = 1;
2432         if (SIZE_ONLY)
2433             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2434     }
2435     else if (paren == ':') {
2436         *flagp |= flags&SIMPLE;
2437     }
2438     if (open) {                         /* Starts with OPEN. */
2439         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2440     }
2441     else if (paren != '?')              /* Not Conditional */
2442         ret = br;
2443     *flagp |= flags & (SPSTART | HASWIDTH);
2444     lastbr = br;
2445     while (*RExC_parse == '|') {
2446         if (!SIZE_ONLY && RExC_extralen) {
2447             ender = reganode(pRExC_state, LONGJMP,0);
2448             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2449         }
2450         if (SIZE_ONLY)
2451             RExC_extralen += 2;         /* Account for LONGJMP. */
2452         nextchar(pRExC_state);
2453         br = regbranch(pRExC_state, &flags, 0);
2454         
2455         if (br == NULL)
2456             return(NULL);
2457         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2458         lastbr = br;
2459         if (flags&HASWIDTH)
2460             *flagp |= HASWIDTH;
2461         *flagp |= flags&SPSTART;
2462     }
2463
2464     if (have_branch || paren != ':') {
2465         /* Make a closing node, and hook it on the end. */
2466         switch (paren) {
2467         case ':':
2468             ender = reg_node(pRExC_state, TAIL);
2469             break;
2470         case 1:
2471             ender = reganode(pRExC_state, CLOSE, parno);
2472             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473             Set_Node_Length(ender,1); /* MJD */
2474             break;
2475         case '<':
2476         case ',':
2477         case '=':
2478         case '!':
2479             *flagp &= ~HASWIDTH;
2480             /* FALL THROUGH */
2481         case '>':
2482             ender = reg_node(pRExC_state, SUCCEED);
2483             break;
2484         case 0:
2485             ender = reg_node(pRExC_state, END);
2486             break;
2487         }
2488         regtail(pRExC_state, lastbr, ender);
2489
2490         if (have_branch) {
2491             /* Hook the tails of the branches to the closing node. */
2492             for (br = ret; br != NULL; br = regnext(br)) {
2493                 regoptail(pRExC_state, br, ender);
2494             }
2495         }
2496     }
2497
2498     {
2499         char *p;
2500         static char parens[] = "=!<,>";
2501
2502         if (paren && (p = strchr(parens, paren))) {
2503             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2504             int flag = (p - parens) > 1;
2505
2506             if (paren == '>')
2507                 node = SUSPEND, flag = 0;
2508             reginsert(pRExC_state, node,ret);
2509             Set_Node_Offset(ret, oregcomp_parse);
2510             Set_Node_Length(ret,  RExC_parse - oregcomp_parse + 2);
2511             ret->flags = flag;
2512             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2513         }
2514     }
2515
2516     /* Check for proper termination. */
2517     if (paren) {
2518         RExC_flags = oregflags;
2519         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520             RExC_parse = oregcomp_parse;
2521             vFAIL("Unmatched (");
2522         }
2523     }
2524     else if (!paren && RExC_parse < RExC_end) {
2525         if (*RExC_parse == ')') {
2526             RExC_parse++;
2527             vFAIL("Unmatched )");
2528         }
2529         else
2530             FAIL("Junk on end of regexp");      /* "Can't happen". */
2531         /* NOTREACHED */
2532     }
2533
2534     return(ret);
2535 }
2536
2537 /*
2538  - regbranch - one alternative of an | operator
2539  *
2540  * Implements the concatenation operator.
2541  */
2542 STATIC regnode *
2543 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2544 {
2545     register regnode *ret;
2546     register regnode *chain = NULL;
2547     register regnode *latest;
2548     I32 flags = 0, c = 0;
2549
2550     if (first)
2551         ret = NULL;
2552     else {
2553         if (!SIZE_ONLY && RExC_extralen)
2554             ret = reganode(pRExC_state, BRANCHJ,0);
2555         else {
2556             ret = reg_node(pRExC_state, BRANCH);
2557             Set_Node_Length(ret, 1);
2558         }
2559     }
2560         
2561     if (!first && SIZE_ONLY)
2562         RExC_extralen += 1;                     /* BRANCHJ */
2563
2564     *flagp = WORST;                     /* Tentatively. */
2565
2566     RExC_parse--;
2567     nextchar(pRExC_state);
2568     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2569         flags &= ~TRYAGAIN;
2570         latest = regpiece(pRExC_state, &flags);
2571         if (latest == NULL) {
2572             if (flags & TRYAGAIN)
2573                 continue;
2574             return(NULL);
2575         }
2576         else if (ret == NULL)
2577             ret = latest;
2578         *flagp |= flags&HASWIDTH;
2579         if (chain == NULL)      /* First piece. */
2580             *flagp |= flags&SPSTART;
2581         else {
2582             RExC_naughty++;
2583             regtail(pRExC_state, chain, latest);
2584         }
2585         chain = latest;
2586         c++;
2587     }
2588     if (chain == NULL) {        /* Loop ran zero times. */
2589         chain = reg_node(pRExC_state, NOTHING);
2590         if (ret == NULL)
2591             ret = chain;
2592     }
2593     if (c == 1) {
2594         *flagp |= flags&SIMPLE;
2595     }
2596
2597     return(ret);
2598 }
2599
2600 /*
2601  - regpiece - something followed by possible [*+?]
2602  *
2603  * Note that the branching code sequences used for ? and the general cases
2604  * of * and + are somewhat optimized:  they use the same NOTHING node as
2605  * both the endmarker for their branch list and the body of the last branch.
2606  * It might seem that this node could be dispensed with entirely, but the
2607  * endmarker role is not redundant.
2608  */
2609 STATIC regnode *
2610 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2611 {
2612     register regnode *ret;
2613     register char op;
2614     register char *next;
2615     I32 flags;
2616     char *origparse = RExC_parse;
2617     char *maxpos;
2618     I32 min;
2619     I32 max = REG_INFTY;
2620     char *parse_start;
2621
2622     ret = regatom(pRExC_state, &flags);
2623     if (ret == NULL) {
2624         if (flags & TRYAGAIN)
2625             *flagp |= TRYAGAIN;
2626         return(NULL);
2627     }
2628
2629     op = *RExC_parse;
2630
2631     if (op == '{' && regcurly(RExC_parse)) {
2632         parse_start = RExC_parse; /* MJD */
2633         next = RExC_parse + 1;
2634         maxpos = Nullch;
2635         while (isDIGIT(*next) || *next == ',') {
2636             if (*next == ',') {
2637                 if (maxpos)
2638                     break;
2639                 else
2640                     maxpos = next;
2641             }
2642             next++;
2643         }
2644         if (*next == '}') {             /* got one */
2645             if (!maxpos)
2646                 maxpos = next;
2647             RExC_parse++;
2648             min = atoi(RExC_parse);
2649             if (*maxpos == ',')
2650                 maxpos++;
2651             else
2652                 maxpos = RExC_parse;
2653             max = atoi(maxpos);
2654             if (!max && *maxpos != '0')
2655                 max = REG_INFTY;                /* meaning "infinity" */
2656             else if (max >= REG_INFTY)
2657                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2658             RExC_parse = next;
2659             nextchar(pRExC_state);
2660
2661         do_curly:
2662             if ((flags&SIMPLE)) {
2663                 RExC_naughty += 2 + RExC_naughty / 2;
2664                 reginsert(pRExC_state, CURLY, ret);
2665                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666                 Set_Node_Cur_Length(ret);
2667             }
2668             else {
2669                 regnode *w = reg_node(pRExC_state, WHILEM);
2670
2671                 w->flags = 0;
2672                 regtail(pRExC_state, ret, w);
2673                 if (!SIZE_ONLY && RExC_extralen) {
2674                     reginsert(pRExC_state, LONGJMP,ret);
2675                     reginsert(pRExC_state, NOTHING,ret);
2676                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2677                 }
2678                 reginsert(pRExC_state, CURLYX,ret);
2679                                 /* MJD hk */
2680                 Set_Node_Offset(ret, parse_start+1);
2681                 Set_Node_Length(ret, 
2682                                 op == '{' ? (RExC_parse - parse_start) : 1);
2683                 
2684                 if (!SIZE_ONLY && RExC_extralen)
2685                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2686                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2687                 if (SIZE_ONLY)
2688                     RExC_whilem_seen++, RExC_extralen += 3;
2689                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2690             }
2691             ret->flags = 0;
2692
2693             if (min > 0)
2694                 *flagp = WORST;
2695             if (max > 0)
2696                 *flagp |= HASWIDTH;
2697             if (max && max < min)
2698                 vFAIL("Can't do {n,m} with n > m");
2699             if (!SIZE_ONLY) {
2700                 ARG1_SET(ret, (U16)min);
2701                 ARG2_SET(ret, (U16)max);
2702             }
2703
2704             goto nest_check;
2705         }
2706     }
2707
2708     if (!ISMULT1(op)) {
2709         *flagp = flags;
2710         return(ret);
2711     }
2712
2713 #if 0                           /* Now runtime fix should be reliable. */
2714
2715     /* if this is reinstated, don't forget to put this back into perldiag:
2716
2717             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2718
2719            (F) The part of the regexp subject to either the * or + quantifier
2720            could match an empty string. The {#} shows in the regular
2721            expression about where the problem was discovered.
2722
2723     */
2724
2725     if (!(flags&HASWIDTH) && op != '?')
2726       vFAIL("Regexp *+ operand could be empty");
2727 #endif
2728
2729     parse_start = RExC_parse;
2730     nextchar(pRExC_state);
2731
2732     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2733
2734     if (op == '*' && (flags&SIMPLE)) {
2735         reginsert(pRExC_state, STAR, ret);
2736         ret->flags = 0;
2737         RExC_naughty += 4;
2738     }
2739     else if (op == '*') {
2740         min = 0;
2741         goto do_curly;
2742     }
2743     else if (op == '+' && (flags&SIMPLE)) {
2744         reginsert(pRExC_state, PLUS, ret);
2745         ret->flags = 0;
2746         RExC_naughty += 3;
2747     }
2748     else if (op == '+') {
2749         min = 1;
2750         goto do_curly;
2751     }
2752     else if (op == '?') {
2753         min = 0; max = 1;
2754         goto do_curly;
2755     }
2756   nest_check:
2757     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2758         vWARN3(RExC_parse,
2759                "%.*s matches null string many times",
2760                RExC_parse - origparse,
2761                origparse);
2762     }
2763
2764     if (*RExC_parse == '?') {
2765         nextchar(pRExC_state);
2766         reginsert(pRExC_state, MINMOD, ret);
2767         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2768     }
2769     if (ISMULT2(RExC_parse)) {
2770         RExC_parse++;
2771         vFAIL("Nested quantifiers");
2772     }
2773
2774     return(ret);
2775 }
2776
2777 /*
2778  - regatom - the lowest level
2779  *
2780  * Optimization:  gobbles an entire sequence of ordinary characters so that
2781  * it can turn them into a single node, which is smaller to store and
2782  * faster to run.  Backslashed characters are exceptions, each becoming a
2783  * separate node; the code is simpler that way and it's not worth fixing.
2784  *
2785  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2786 STATIC regnode *
2787 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2788 {
2789     register regnode *ret = 0;
2790     I32 flags;
2791     char *parse_start = 0;
2792
2793     *flagp = WORST;             /* Tentatively. */
2794
2795 tryagain:
2796     switch (*RExC_parse) {
2797     case '^':
2798         RExC_seen_zerolen++;
2799         nextchar(pRExC_state);
2800         if (RExC_flags & PMf_MULTILINE)
2801             ret = reg_node(pRExC_state, MBOL);
2802         else if (RExC_flags & PMf_SINGLELINE)
2803             ret = reg_node(pRExC_state, SBOL);
2804         else
2805             ret = reg_node(pRExC_state, BOL);
2806         Set_Node_Length(ret, 1); /* MJD */
2807         break;
2808     case '$':
2809         nextchar(pRExC_state);
2810         if (*RExC_parse)
2811             RExC_seen_zerolen++;
2812         if (RExC_flags & PMf_MULTILINE)
2813             ret = reg_node(pRExC_state, MEOL);
2814         else if (RExC_flags & PMf_SINGLELINE)
2815             ret = reg_node(pRExC_state, SEOL);
2816         else
2817             ret = reg_node(pRExC_state, EOL);
2818         Set_Node_Length(ret, 1); /* MJD */
2819         break;
2820     case '.':
2821         nextchar(pRExC_state);
2822         if (RExC_flags & PMf_SINGLELINE)
2823             ret = reg_node(pRExC_state, SANY);
2824         else
2825             ret = reg_node(pRExC_state, REG_ANY);
2826         *flagp |= HASWIDTH|SIMPLE;
2827         RExC_naughty++;
2828         Set_Node_Length(ret, 1); /* MJD */
2829         break;
2830     case '[':
2831     {
2832         char *oregcomp_parse = ++RExC_parse;
2833         ret = regclass(pRExC_state);
2834         if (*RExC_parse != ']') {
2835             RExC_parse = oregcomp_parse;
2836             vFAIL("Unmatched [");
2837         }
2838         nextchar(pRExC_state);
2839         *flagp |= HASWIDTH|SIMPLE;
2840         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2841         break;
2842     }
2843     case '(':
2844         nextchar(pRExC_state);
2845         ret = reg(pRExC_state, 1, &flags);
2846         if (ret == NULL) {
2847                 if (flags & TRYAGAIN) {
2848                     if (RExC_parse == RExC_end) {
2849                          /* Make parent create an empty node if needed. */
2850                         *flagp |= TRYAGAIN;
2851                         return(NULL);
2852                     }
2853                     goto tryagain;
2854                 }
2855                 return(NULL);
2856         }
2857         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2858         break;
2859     case '|':
2860     case ')':
2861         if (flags & TRYAGAIN) {
2862             *flagp |= TRYAGAIN;
2863             return NULL;
2864         }
2865         vFAIL("Internal urp");
2866                                 /* Supposed to be caught earlier. */
2867         break;
2868     case '{':
2869         if (!regcurly(RExC_parse)) {
2870             RExC_parse++;
2871             goto defchar;
2872         }
2873         /* FALL THROUGH */
2874     case '?':
2875     case '+':
2876     case '*':
2877         RExC_parse++;
2878         vFAIL("Quantifier follows nothing");
2879         break;
2880     case '\\':
2881         switch (*++RExC_parse) {
2882         case 'A':
2883             RExC_seen_zerolen++;
2884             ret = reg_node(pRExC_state, SBOL);
2885             *flagp |= SIMPLE;
2886             nextchar(pRExC_state);
2887             Set_Node_Length(ret, 2); /* MJD */
2888             break;
2889         case 'G':
2890             ret = reg_node(pRExC_state, GPOS);
2891             RExC_seen |= REG_SEEN_GPOS;
2892             *flagp |= SIMPLE;
2893             nextchar(pRExC_state);
2894             Set_Node_Length(ret, 2); /* MJD */
2895             break;
2896         case 'Z':
2897             ret = reg_node(pRExC_state, SEOL);
2898             *flagp |= SIMPLE;
2899             RExC_seen_zerolen++;                /* Do not optimize RE away */
2900             nextchar(pRExC_state);
2901             break;
2902         case 'z':
2903             ret = reg_node(pRExC_state, EOS);
2904             *flagp |= SIMPLE;
2905             RExC_seen_zerolen++;                /* Do not optimize RE away */
2906             nextchar(pRExC_state);
2907             Set_Node_Length(ret, 2); /* MJD */
2908             break;
2909         case 'C':
2910             ret = reg_node(pRExC_state, CANY);
2911             RExC_seen |= REG_SEEN_CANY;
2912             *flagp |= HASWIDTH|SIMPLE;
2913             nextchar(pRExC_state);
2914             Set_Node_Length(ret, 2); /* MJD */
2915             break;
2916         case 'X':
2917             ret = reg_node(pRExC_state, CLUMP);
2918             *flagp |= HASWIDTH;
2919             nextchar(pRExC_state);
2920             Set_Node_Length(ret, 2); /* MJD */
2921             break;
2922         case 'w':
2923             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2924             *flagp |= HASWIDTH|SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'W':
2929             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2930             *flagp |= HASWIDTH|SIMPLE;
2931             nextchar(pRExC_state);
2932             Set_Node_Length(ret, 2); /* MJD */
2933             break;
2934         case 'b':
2935             RExC_seen_zerolen++;
2936             RExC_seen |= REG_SEEN_LOOKBEHIND;
2937             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2938             *flagp |= SIMPLE;
2939             nextchar(pRExC_state);
2940             Set_Node_Length(ret, 2); /* MJD */
2941             break;
2942         case 'B':
2943             RExC_seen_zerolen++;
2944             RExC_seen |= REG_SEEN_LOOKBEHIND;
2945             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2946             *flagp |= SIMPLE;
2947             nextchar(pRExC_state);
2948             Set_Node_Length(ret, 2); /* MJD */
2949             break;
2950         case 's':
2951             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2952             *flagp |= HASWIDTH|SIMPLE;
2953             nextchar(pRExC_state);
2954             Set_Node_Length(ret, 2); /* MJD */
2955             break;
2956         case 'S':
2957             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2958             *flagp |= HASWIDTH|SIMPLE;
2959             nextchar(pRExC_state);
2960             Set_Node_Length(ret, 2); /* MJD */
2961             break;
2962         case 'd':
2963             ret = reg_node(pRExC_state, DIGIT);
2964             *flagp |= HASWIDTH|SIMPLE;
2965             nextchar(pRExC_state);
2966             Set_Node_Length(ret, 2); /* MJD */
2967             break;
2968         case 'D':
2969             ret = reg_node(pRExC_state, NDIGIT);
2970             *flagp |= HASWIDTH|SIMPLE;
2971             nextchar(pRExC_state);
2972             Set_Node_Length(ret, 2); /* MJD */
2973             break;
2974         case 'p':
2975         case 'P':
2976             {   
2977                 char* oldregxend = RExC_end;
2978                 char* parse_start = RExC_parse - 2;
2979
2980                 if (RExC_parse[1] == '{') {
2981                   /* a lovely hack--pretend we saw [\pX] instead */
2982                     RExC_end = strchr(RExC_parse, '}');
2983                     if (!RExC_end) {
2984                         U8 c = (U8)*RExC_parse;
2985                         RExC_parse += 2;
2986                         RExC_end = oldregxend;
2987                         vFAIL2("Missing right brace on \\%c{}", c);
2988                     }
2989                     RExC_end++;
2990                 }
2991                 else {
2992                     RExC_end = RExC_parse + 2;
2993                     if (RExC_end > oldregxend)
2994                         RExC_end = oldregxend;
2995                 }
2996                 RExC_parse--;
2997
2998                 ret = regclass(pRExC_state);
2999
3000                 RExC_end = oldregxend;
3001                 RExC_parse--;
3002
3003                 Set_Node_Offset(ret, parse_start + 2);
3004                 Set_Node_Cur_Length(ret);
3005                 nextchar(pRExC_state);
3006                 *flagp |= HASWIDTH|SIMPLE;
3007             }
3008             break;
3009         case 'n':
3010         case 'r':
3011         case 't':
3012         case 'f':
3013         case 'e':
3014         case 'a':
3015         case 'x':
3016         case 'c':
3017         case '0':
3018             goto defchar;
3019         case '1': case '2': case '3': case '4':
3020         case '5': case '6': case '7': case '8': case '9':
3021             {
3022                 I32 num = atoi(RExC_parse);
3023
3024                 if (num > 9 && num >= RExC_npar)
3025                     goto defchar;
3026                 else {
3027                     char * parse_start = RExC_parse - 1; /* MJD */
3028                     while (isDIGIT(*RExC_parse))
3029                         RExC_parse++;
3030
3031                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3032                         vFAIL("Reference to nonexistent group");
3033                     RExC_sawback = 1;
3034                     ret = reganode(pRExC_state,
3035                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3036                                    num);
3037                     *flagp |= HASWIDTH;
3038                     
3039                     /* override incorrect value set in reganode MJD */
3040                     Set_Node_Offset(ret, parse_start+1); 
3041                     Set_Node_Cur_Length(ret); /* MJD */
3042                     RExC_parse--;
3043                     nextchar(pRExC_state);
3044                 }
3045             }
3046             break;
3047         case '\0':
3048             if (RExC_parse >= RExC_end)
3049                 FAIL("Trailing \\");
3050             /* FALL THROUGH */
3051         default:
3052             /* Do not generate `unrecognized' warnings here, we fall
3053                back into the quick-grab loop below */
3054             goto defchar;
3055         }
3056         break;
3057
3058     case '#':
3059         if (RExC_flags & PMf_EXTENDED) {
3060             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3061             if (RExC_parse < RExC_end)
3062                 goto tryagain;
3063         }
3064         /* FALL THROUGH */
3065
3066     default: {
3067             register STRLEN len;
3068             register UV ender;
3069             register char *p;
3070             char *oldp, *s;
3071             STRLEN numlen;
3072             STRLEN foldlen;
3073             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3074
3075             parse_start = RExC_parse - 1;
3076
3077             RExC_parse++;
3078
3079         defchar:
3080             ender = 0;
3081             ret = reg_node(pRExC_state,
3082                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3083             s = STRING(ret);
3084             for (len = 0, p = RExC_parse - 1;
3085               len < 127 && p < RExC_end;
3086               len++)
3087             {
3088                 oldp = p;
3089
3090                 if (RExC_flags & PMf_EXTENDED)
3091                     p = regwhite(p, RExC_end);
3092                 switch (*p) {
3093                 case '^':
3094                 case '$':
3095                 case '.':
3096                 case '[':
3097                 case '(':
3098                 case ')':
3099                 case '|':
3100                     goto loopdone;
3101                 case '\\':
3102                     switch (*++p) {
3103                     case 'A':
3104                     case 'C':
3105                     case 'X':
3106                     case 'G':
3107                     case 'Z':
3108                     case 'z':
3109                     case 'w':
3110                     case 'W':
3111                     case 'b':
3112                     case 'B':
3113                     case 's':
3114                     case 'S':
3115                     case 'd':
3116                     case 'D':
3117                     case 'p':
3118                     case 'P':
3119                         --p;
3120                         goto loopdone;
3121                     case 'n':
3122                         ender = '\n';
3123                         p++;
3124                         break;
3125                     case 'r':
3126                         ender = '\r';
3127                         p++;
3128                         break;
3129                     case 't':
3130                         ender = '\t';
3131                         p++;
3132                         break;
3133                     case 'f':
3134                         ender = '\f';
3135                         p++;
3136                         break;
3137                     case 'e':
3138                           ender = ASCII_TO_NATIVE('\033');
3139                         p++;
3140                         break;
3141                     case 'a':
3142                           ender = ASCII_TO_NATIVE('\007');
3143                         p++;
3144                         break;
3145                     case 'x':
3146                         if (*++p == '{') {
3147                             char* e = strchr(p, '}');
3148         
3149                             if (!e) {
3150                                 RExC_parse = p + 1;
3151                                 vFAIL("Missing right brace on \\x{}");
3152                             }
3153                             else {
3154                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155                                     | PERL_SCAN_DISALLOW_PREFIX;
3156                                 numlen = e - p - 1;
3157                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3158                                 if (ender > 0xff)
3159                                     RExC_utf8 = 1;
3160                                 p = e + 1;
3161                             }
3162                         }
3163                         else {
3164                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3165                             numlen = 2;
3166                             ender = grok_hex(p, &numlen, &flags, NULL);
3167                             p += numlen;
3168                         }
3169                         break;
3170                     case 'c':
3171                         p++;
3172                         ender = UCHARAT(p++);
3173                         ender = toCTRL(ender);
3174                         break;
3175                     case '0': case '1': case '2': case '3':case '4':
3176                     case '5': case '6': case '7': case '8':case '9':
3177                         if (*p == '0' ||
3178                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3179                             I32 flags = 0;
3180                             numlen = 3;
3181                             ender = grok_oct(p, &numlen, &flags, NULL);
3182                             p += numlen;
3183                         }
3184                         else {
3185                             --p;
3186                             goto loopdone;
3187                         }
3188                         break;
3189                     case '\0':
3190                         if (p >= RExC_end)
3191                             FAIL("Trailing \\");
3192                         /* FALL THROUGH */
3193                     default:
3194                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3195                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3196                         goto normal_default;
3197                     }
3198                     break;
3199                 default:
3200                   normal_default:
3201                     if (UTF8_IS_START(*p) && UTF) {
3202                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3203                                                &numlen, 0);
3204                         p += numlen;
3205                     }
3206                     else
3207                         ender = *p++;
3208                     break;
3209                 }
3210                 if (RExC_flags & PMf_EXTENDED)
3211                     p = regwhite(p, RExC_end);
3212                 if (UTF && FOLD) {
3213                     /* Prime the casefolded buffer. */
3214                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3215                 }
3216                 if (ISMULT2(p)) { /* Back off on ?+*. */
3217                     if (len)
3218                         p = oldp;
3219                     else if (UTF) {
3220                          STRLEN unilen;
3221
3222                          if (FOLD) {
3223                               /* Emit all the Unicode characters. */
3224                               for (foldbuf = tmpbuf;
3225                                    foldlen;
3226                                    foldlen -= numlen) {
3227                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3228                                    if (numlen > 0) {
3229                                         reguni(pRExC_state, ender, s, &unilen);
3230                                         s       += unilen;
3231                                         len     += unilen;
3232                                         /* In EBCDIC the numlen
3233                                          * and unilen can differ. */
3234                                         foldbuf += numlen;
3235                                         if (numlen >= foldlen)
3236                                              break;
3237                                    }
3238                                    else
3239                                         break; /* "Can't happen." */
3240                               }
3241                          }
3242                          else {
3243                               reguni(pRExC_state, ender, s, &unilen);
3244                               if (unilen > 0) {
3245                                    s   += unilen;
3246                                    len += unilen;
3247                               }
3248                          }
3249                     }
3250                     else {
3251                         len++;
3252                         REGC((char)ender, s++);
3253                     }
3254                     break;
3255                 }
3256                 if (UTF) {
3257                      STRLEN unilen;
3258
3259                      if (FOLD) {
3260                           /* Emit all the Unicode characters. */
3261                           for (foldbuf = tmpbuf;
3262                                foldlen;
3263                                foldlen -= numlen) {
3264                                ender = utf8_to_uvchr(foldbuf, &numlen);
3265                                if (numlen > 0) {
3266                                     reguni(pRExC_state, ender, s, &unilen);
3267                                     len     += unilen;
3268                                     s       += unilen;
3269                                     /* In EBCDIC the numlen
3270                                      * and unilen can differ. */
3271                                     foldbuf += numlen;
3272                                     if (numlen >= foldlen)
3273                                          break;
3274                                }
3275                                else
3276                                     break;
3277                           }
3278                      }
3279                      else {
3280                           reguni(pRExC_state, ender, s, &unilen);
3281                           if (unilen > 0) {
3282                                s   += unilen;
3283                                len += unilen;
3284                           }
3285                      }
3286                      len--;
3287                 }
3288                 else
3289                     REGC((char)ender, s++);
3290             }
3291         loopdone:
3292             RExC_parse = p - 1;
3293             Set_Node_Cur_Length(ret); /* MJD */
3294             nextchar(pRExC_state);
3295             {
3296                 /* len is STRLEN which is unsigned, need to copy to signed */
3297                 IV iv = len;
3298                 if (iv < 0)
3299                     vFAIL("Internal disaster");
3300             }
3301             if (len > 0)
3302                 *flagp |= HASWIDTH;
3303             if (len == 1 && UNI_IS_INVARIANT(ender))
3304                 *flagp |= SIMPLE;
3305             if (!SIZE_ONLY)
3306                 STR_LEN(ret) = len;
3307             if (SIZE_ONLY)
3308                 RExC_size += STR_SZ(len);
3309             else
3310                 RExC_emit += STR_SZ(len);
3311         }
3312         break;
3313     }
3314
3315     /* If the encoding pragma is in effect recode the text of
3316      * any EXACT-kind nodes. */
3317     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3318         STRLEN oldlen = STR_LEN(ret);
3319         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3320
3321         if (RExC_utf8)
3322             SvUTF8_on(sv);
3323         if (sv_utf8_downgrade(sv, TRUE)) {
3324             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3325             STRLEN newlen = SvCUR(sv);
3326
3327             if (SvUTF8(sv))
3328                 RExC_utf8 = 1;
3329             if (!SIZE_ONLY) {
3330                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3331                                       (int)oldlen, STRING(ret),
3332                                       (int)newlen, s));
3333                 Copy(s, STRING(ret), newlen, char);
3334                 STR_LEN(ret) += newlen - oldlen;
3335                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3336             } else
3337                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3338         }
3339     }
3340
3341     return(ret);
3342 }
3343
3344 STATIC char *
3345 S_regwhite(pTHX_ char *p, char *e)
3346 {
3347     while (p < e) {
3348         if (isSPACE(*p))
3349             ++p;
3350         else if (*p == '#') {
3351             do {
3352                 p++;
3353             } while (p < e && *p != '\n');
3354         }
3355         else
3356             break;
3357     }
3358     return p;
3359 }
3360
3361 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3362    Character classes ([:foo:]) can also be negated ([:^foo:]).
3363    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3364    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3365    but trigger failures because they are currently unimplemented. */
3366
3367 #define POSIXCC_DONE(c)   ((c) == ':')
3368 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3369 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3370
3371 STATIC I32
3372 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3373 {
3374     char *posixcc = 0;
3375     I32 namedclass = OOB_NAMEDCLASS;
3376
3377     if (value == '[' && RExC_parse + 1 < RExC_end &&
3378         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3379         POSIXCC(UCHARAT(RExC_parse))) {
3380         char  c = UCHARAT(RExC_parse);
3381         char* s = RExC_parse++;
3382         
3383         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3384             RExC_parse++;
3385         if (RExC_parse == RExC_end)
3386             /* Grandfather lone [:, [=, [. */
3387             RExC_parse = s;
3388         else {
3389             char* t = RExC_parse++; /* skip over the c */
3390
3391             if (UCHARAT(RExC_parse) == ']') {
3392                 RExC_parse++; /* skip over the ending ] */
3393                 posixcc = s + 1;
3394                 if (*s == ':') {
3395                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3396                     I32 skip = 5; /* the most common skip */
3397
3398                     switch (*posixcc) {
3399                     case 'a':
3400                         if (strnEQ(posixcc, "alnum", 5))
3401                             namedclass =
3402                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3403                         else if (strnEQ(posixcc, "alpha", 5))
3404                             namedclass =
3405                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3406                         else if (strnEQ(posixcc, "ascii", 5))
3407                             namedclass =
3408                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3409                         break;
3410                     case 'b':
3411                         if (strnEQ(posixcc, "blank", 5))
3412                             namedclass =
3413                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3414                         break;
3415                     case 'c':
3416                         if (strnEQ(posixcc, "cntrl", 5))
3417                             namedclass =
3418                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3419                         break;
3420                     case 'd':
3421                         if (strnEQ(posixcc, "digit", 5))
3422                             namedclass =
3423                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3424                         break;
3425                     case 'g':
3426                         if (strnEQ(posixcc, "graph", 5))
3427                             namedclass =
3428                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3429                         break;
3430                     case 'l':
3431                         if (strnEQ(posixcc, "lower", 5))
3432                             namedclass =
3433                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3434                         break;
3435                     case 'p':
3436                         if (strnEQ(posixcc, "print", 5))
3437                             namedclass =
3438                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3439                         else if (strnEQ(posixcc, "punct", 5))
3440                             namedclass =
3441                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3442                         break;
3443                     case 's':
3444                         if (strnEQ(posixcc, "space", 5))
3445                             namedclass =
3446                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3447                         break;
3448                     case 'u':
3449                         if (strnEQ(posixcc, "upper", 5))
3450                             namedclass =
3451                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3452                         break;
3453                     case 'w': /* this is not POSIX, this is the Perl \w */
3454                         if (strnEQ(posixcc, "word", 4)) {
3455                             namedclass =
3456                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3457                             skip = 4;
3458                         }
3459                         break;
3460                     case 'x':
3461                         if (strnEQ(posixcc, "xdigit", 6)) {
3462                             namedclass =
3463                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3464                             skip = 6;
3465                         }
3466                         break;
3467                     }
3468                     if (namedclass == OOB_NAMEDCLASS ||
3469                         posixcc[skip] != ':' ||
3470                         posixcc[skip+1] != ']')
3471                     {
3472                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3473                                       t - s - 1, s + 1);
3474                     }
3475                 } else if (!SIZE_ONLY) {
3476                     /* [[=foo=]] and [[.foo.]] are still future. */
3477
3478                     /* adjust RExC_parse so the warning shows after
3479                        the class closes */
3480                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3481                         RExC_parse++;
3482                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3483                 }
3484             } else {
3485                 /* Maternal grandfather:
3486                  * "[:" ending in ":" but not in ":]" */
3487                 RExC_parse = s;
3488             }
3489         }
3490     }
3491
3492     return namedclass;
3493 }
3494
3495 STATIC void
3496 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3497 {
3498     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3499         char *s = RExC_parse;
3500         char  c = *s++;
3501
3502         while(*s && isALNUM(*s))
3503             s++;
3504         if (*s && c == *s && s[1] == ']') {
3505             if (ckWARN(WARN_REGEXP))
3506                 vWARN3(s+2,
3507                         "POSIX syntax [%c %c] belongs inside character classes",
3508                         c, c);
3509
3510             /* [[=foo=]] and [[.foo.]] are still future. */
3511             if (POSIXCC_NOTYET(c)) {
3512                 /* adjust RExC_parse so the error shows after
3513                    the class closes */
3514                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3515                     ;
3516                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3517             }
3518         }
3519     }
3520 }
3521
3522 STATIC regnode *
3523 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3524 {
3525     register UV value;
3526     register UV nextvalue;
3527     register IV prevvalue = OOB_UNICODE;
3528     register IV range = 0;
3529     register regnode *ret;
3530     STRLEN numlen;
3531     IV namedclass;
3532     char *rangebegin = 0;
3533     bool need_class = 0;
3534     SV *listsv = Nullsv;
3535     register char *e;
3536     UV n;
3537     bool optimize_invert   = TRUE;
3538     AV* unicode_alternate  = 0;
3539 #ifdef EBCDIC
3540     UV literal_endpoint = 0;
3541 #endif
3542
3543     ret = reganode(pRExC_state, ANYOF, 0);
3544
3545     if (!SIZE_ONLY)
3546         ANYOF_FLAGS(ret) = 0;
3547
3548     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3549         RExC_naughty++;
3550         RExC_parse++;
3551         if (!SIZE_ONLY)
3552             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3553     }
3554
3555     if (SIZE_ONLY)
3556         RExC_size += ANYOF_SKIP;
3557     else {
3558         RExC_emit += ANYOF_SKIP;
3559         if (FOLD)
3560             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3561         if (LOC)
3562             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3563         ANYOF_BITMAP_ZERO(ret);
3564         listsv = newSVpvn("# comment\n", 10);
3565     }
3566
3567     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3568
3569     if (!SIZE_ONLY && POSIXCC(nextvalue))
3570         checkposixcc(pRExC_state);
3571
3572     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3573     if (UCHARAT(RExC_parse) == ']')
3574         goto charclassloop;
3575
3576     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3577
3578     charclassloop:
3579
3580         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3581
3582         if (!range)
3583             rangebegin = RExC_parse;
3584         if (UTF) {
3585             value = utf8n_to_uvchr((U8*)RExC_parse,
3586                                    RExC_end - RExC_parse,
3587                                    &numlen, 0);
3588             RExC_parse += numlen;
3589         }
3590         else
3591             value = UCHARAT(RExC_parse++);
3592         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3593         if (value == '[' && POSIXCC(nextvalue))
3594             namedclass = regpposixcc(pRExC_state, value);
3595         else if (value == '\\') {
3596             if (UTF) {
3597                 value = utf8n_to_uvchr((U8*)RExC_parse,
3598                                    RExC_end - RExC_parse,
3599                                    &numlen, 0);
3600                 RExC_parse += numlen;
3601             }
3602             else
3603                 value = UCHARAT(RExC_parse++);
3604             /* Some compilers cannot handle switching on 64-bit integer
3605              * values, therefore value cannot be an UV.  Yes, this will
3606              * be a problem later if we want switch on Unicode.
3607              * A similar issue a little bit later when switching on
3608              * namedclass. --jhi */
3609             switch ((I32)value) {
3610             case 'w':   namedclass = ANYOF_ALNUM;       break;
3611             case 'W':   namedclass = ANYOF_NALNUM;      break;
3612             case 's':   namedclass = ANYOF_SPACE;       break;
3613             case 'S':   namedclass = ANYOF_NSPACE;      break;
3614             case 'd':   namedclass = ANYOF_DIGIT;       break;
3615             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3616             case 'p':
3617             case 'P':
3618                 if (RExC_parse >= RExC_end)
3619                     vFAIL2("Empty \\%c{}", (U8)value);
3620                 if (*RExC_parse == '{') {
3621                     U8 c = (U8)value;
3622                     e = strchr(RExC_parse++, '}');
3623                     if (!e)
3624                         vFAIL2("Missing right brace on \\%c{}", c);
3625                     while (isSPACE(UCHARAT(RExC_parse)))
3626                         RExC_parse++;
3627                     if (e == RExC_parse)
3628                         vFAIL2("Empty \\%c{}", c);
3629                     n = e - RExC_parse;
3630                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3631                         n--;
3632                 }
3633                 else {
3634                     e = RExC_parse;
3635                     n = 1;
3636                 }
3637                 if (!SIZE_ONLY) {
3638                     if (UCHARAT(RExC_parse) == '^') {
3639                          RExC_parse++;
3640                          n--;
3641                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3642                          while (isSPACE(UCHARAT(RExC_parse))) {
3643                               RExC_parse++;
3644                               n--;
3645                          }
3646                     }
3647                     if (value == 'p')
3648                          Perl_sv_catpvf(aTHX_ listsv,
3649                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3650                     else
3651                          Perl_sv_catpvf(aTHX_ listsv,
3652                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3653                 }
3654                 RExC_parse = e + 1;
3655                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3656                 continue;
3657             case 'n':   value = '\n';                   break;
3658             case 'r':   value = '\r';                   break;
3659             case 't':   value = '\t';                   break;
3660             case 'f':   value = '\f';                   break;
3661             case 'b':   value = '\b';                   break;
3662             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3663             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3664             case 'x':
3665                 if (*RExC_parse == '{') {
3666                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3667                         | PERL_SCAN_DISALLOW_PREFIX;
3668                     e = strchr(RExC_parse++, '}');
3669                     if (!e)
3670                         vFAIL("Missing right brace on \\x{}");
3671
3672                     numlen = e - RExC_parse;
3673                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3674                     RExC_parse = e + 1;
3675                 }
3676                 else {
3677                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3678                     numlen = 2;
3679                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3680                     RExC_parse += numlen;
3681                 }
3682                 break;
3683             case 'c':
3684                 value = UCHARAT(RExC_parse++);
3685                 value = toCTRL(value);
3686                 break;
3687             case '0': case '1': case '2': case '3': case '4':
3688             case '5': case '6': case '7': case '8': case '9':
3689             {
3690                 I32 flags = 0;
3691                 numlen = 3;
3692                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3693                 RExC_parse += numlen;
3694                 break;
3695             }
3696             default:
3697                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3698                     vWARN2(RExC_parse,
3699                            "Unrecognized escape \\%c in character class passed through",
3700                            (int)value);
3701                 break;
3702             }
3703         } /* end of \blah */
3704 #ifdef EBCDIC
3705         else
3706             literal_endpoint++;
3707 #endif
3708
3709         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3710
3711             if (!SIZE_ONLY && !need_class)
3712                 ANYOF_CLASS_ZERO(ret);
3713
3714             need_class = 1;
3715
3716             /* a bad range like a-\d, a-[:digit:] ? */
3717             if (range) {
3718                 if (!SIZE_ONLY) {
3719                     if (ckWARN(WARN_REGEXP))
3720                         vWARN4(RExC_parse,
3721                                "False [] range \"%*.*s\"",
3722                                RExC_parse - rangebegin,
3723                                RExC_parse - rangebegin,
3724                                rangebegin);
3725                     if (prevvalue < 256) {
3726                         ANYOF_BITMAP_SET(ret, prevvalue);
3727                         ANYOF_BITMAP_SET(ret, '-');
3728                     }
3729                     else {
3730                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3731                         Perl_sv_catpvf(aTHX_ listsv,
3732                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3733                     }
3734                 }
3735
3736                 range = 0; /* this was not a true range */
3737             }
3738
3739             if (!SIZE_ONLY) {
3740                 if (namedclass > OOB_NAMEDCLASS)
3741                     optimize_invert = FALSE;
3742                 /* Possible truncation here but in some 64-bit environments
3743                  * the compiler gets heartburn about switch on 64-bit values.
3744                  * A similar issue a little earlier when switching on value.
3745                  * --jhi */
3746                 switch ((I32)namedclass) {
3747                 case ANYOF_ALNUM:
3748                     if (LOC)
3749                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3750                     else {
3751                         for (value = 0; value < 256; value++)
3752                             if (isALNUM(value))
3753                                 ANYOF_BITMAP_SET(ret, value);
3754                     }
3755                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3756                     break;
3757                 case ANYOF_NALNUM:
3758                     if (LOC)
3759                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3760                     else {
3761                         for (value = 0; value < 256; value++)
3762                             if (!isALNUM(value))
3763                                 ANYOF_BITMAP_SET(ret, value);
3764                     }
3765                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3766                     break;
3767                 case ANYOF_ALNUMC:
3768                     if (LOC)
3769                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3770                     else {
3771                         for (value = 0; value < 256; value++)
3772                             if (isALNUMC(value))
3773                                 ANYOF_BITMAP_SET(ret, value);
3774                     }
3775                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3776                     break;
3777                 case ANYOF_NALNUMC:
3778                     if (LOC)
3779                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3780                     else {
3781                         for (value = 0; value < 256; value++)
3782                             if (!isALNUMC(value))
3783                                 ANYOF_BITMAP_SET(ret, value);
3784                     }
3785                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3786                     break;
3787                 case ANYOF_ALPHA:
3788                     if (LOC)
3789                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3790                     else {
3791                         for (value = 0; value < 256; value++)
3792                             if (isALPHA(value))
3793                                 ANYOF_BITMAP_SET(ret, value);
3794                     }
3795                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3796                     break;
3797                 case ANYOF_NALPHA:
3798                     if (LOC)
3799                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3800                     else {
3801                         for (value = 0; value < 256; value++)
3802                             if (!isALPHA(value))
3803                                 ANYOF_BITMAP_SET(ret, value);
3804                     }
3805                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3806                     break;
3807                 case ANYOF_ASCII:
3808                     if (LOC)
3809                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3810                     else {
3811 #ifndef EBCDIC
3812                         for (value = 0; value < 128; value++)
3813                             ANYOF_BITMAP_SET(ret, value);
3814 #else  /* EBCDIC */
3815                         for (value = 0; value < 256; value++) {
3816                             if (isASCII(value))
3817                                 ANYOF_BITMAP_SET(ret, value);
3818                         }
3819 #endif /* EBCDIC */
3820                     }
3821                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3822                     break;
3823                 case ANYOF_NASCII:
3824                     if (LOC)
3825                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3826                     else {
3827 #ifndef EBCDIC
3828                         for (value = 128; value < 256; value++)
3829                             ANYOF_BITMAP_SET(ret, value);
3830 #else  /* EBCDIC */
3831                         for (value = 0; value < 256; value++) {
3832                             if (!isASCII(value))
3833                                 ANYOF_BITMAP_SET(ret, value);
3834                         }
3835 #endif /* EBCDIC */
3836                     }
3837                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3838                     break;
3839                 case ANYOF_BLANK:
3840                     if (LOC)
3841                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3842                     else {
3843                         for (value = 0; value < 256; value++)
3844                             if (isBLANK(value))
3845                                 ANYOF_BITMAP_SET(ret, value);
3846                     }
3847                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3848                     break;
3849                 case ANYOF_NBLANK:
3850                     if (LOC)
3851                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3852                     else {
3853                         for (value = 0; value < 256; value++)
3854                             if (!isBLANK(value))
3855                                 ANYOF_BITMAP_SET(ret, value);
3856                     }
3857                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3858                     break;
3859                 case ANYOF_CNTRL:
3860                     if (LOC)
3861                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3862                     else {
3863                         for (value = 0; value < 256; value++)
3864                             if (isCNTRL(value))
3865                                 ANYOF_BITMAP_SET(ret, value);
3866                     }
3867                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3868                     break;
3869                 case ANYOF_NCNTRL:
3870                     if (LOC)
3871                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3872                     else {
3873                         for (value = 0; value < 256; value++)
3874                             if (!isCNTRL(value))
3875                                 ANYOF_BITMAP_SET(ret, value);
3876                     }
3877                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3878                     break;
3879                 case ANYOF_DIGIT:
3880                     if (LOC)
3881                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3882                     else {
3883                         /* consecutive digits assumed */
3884                         for (value = '0'; value <= '9'; value++)
3885                             ANYOF_BITMAP_SET(ret, value);
3886                     }
3887                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3888                     break;
3889                 case ANYOF_NDIGIT:
3890                     if (LOC)
3891                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3892                     else {
3893                         /* consecutive digits assumed */
3894                         for (value = 0; value < '0'; value++)
3895                             ANYOF_BITMAP_SET(ret, value);
3896                         for (value = '9' + 1; value < 256; value++)
3897                             ANYOF_BITMAP_SET(ret, value);
3898                     }
3899                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3900                     break;
3901                 case ANYOF_GRAPH:
3902                     if (LOC)
3903                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3904                     else {
3905                         for (value = 0; value < 256; value++)
3906                             if (isGRAPH(value))
3907                                 ANYOF_BITMAP_SET(ret, value);
3908                     }
3909                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3910                     break;
3911                 case ANYOF_NGRAPH:
3912                     if (LOC)
3913                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3914                     else {
3915                         for (value = 0; value < 256; value++)
3916                             if (!isGRAPH(value))
3917                                 ANYOF_BITMAP_SET(ret, value);
3918                     }
3919                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3920                     break;
3921                 case ANYOF_LOWER:
3922                     if (LOC)
3923                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3924                     else {
3925                         for (value = 0; value < 256; value++)
3926                             if (isLOWER(value))
3927                                 ANYOF_BITMAP_SET(ret, value);
3928                     }
3929                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3930                     break;
3931                 case ANYOF_NLOWER:
3932                     if (LOC)
3933                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3934                     else {
3935                         for (value = 0; value < 256; value++)
3936                             if (!isLOWER(value))
3937                                 ANYOF_BITMAP_SET(ret, value);
3938                     }
3939                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3940                     break;
3941                 case ANYOF_PRINT:
3942                     if (LOC)
3943                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3944                     else {
3945                         for (value = 0; value < 256; value++)
3946                             if (isPRINT(value))
3947                                 ANYOF_BITMAP_SET(ret, value);
3948                     }
3949                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3950                     break;
3951                 case ANYOF_NPRINT:
3952                     if (LOC)
3953                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3954                     else {
3955                         for (value = 0; value < 256; value++)
3956                             if (!isPRINT(value))
3957                                 ANYOF_BITMAP_SET(ret, value);
3958                     }
3959                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3960                     break;
3961                 case ANYOF_PSXSPC:
3962                     if (LOC)
3963                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3964                     else {
3965                         for (value = 0; value < 256; value++)
3966                             if (isPSXSPC(value))
3967                                 ANYOF_BITMAP_SET(ret, value);
3968                     }
3969                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3970                     break;
3971                 case ANYOF_NPSXSPC:
3972                     if (LOC)
3973                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3974                     else {
3975                         for (value = 0; value < 256; value++)
3976                             if (!isPSXSPC(value))
3977                                 ANYOF_BITMAP_SET(ret, value);
3978                     }
3979                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3980                     break;
3981                 case ANYOF_PUNCT:
3982                     if (LOC)
3983                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3984                     else {
3985                         for (value = 0; value < 256; value++)
3986                             if (isPUNCT(value))
3987                                 ANYOF_BITMAP_SET(ret, value);
3988                     }
3989                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3990                     break;
3991                 case ANYOF_NPUNCT:
3992                     if (LOC)
3993                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3994                     else {
3995                         for (value = 0; value < 256; value++)
3996                             if (!isPUNCT(value))
3997                                 ANYOF_BITMAP_SET(ret, value);
3998                     }
3999                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4000                     break;
4001                 case ANYOF_SPACE:
4002                     if (LOC)
4003                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4004                     else {
4005                         for (value = 0; value < 256; value++)
4006                             if (isSPACE(value))
4007                                 ANYOF_BITMAP_SET(ret, value);
4008                     }
4009                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4010                     break;
4011                 case ANYOF_NSPACE:
4012                     if (LOC)
4013                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4014                     else {
4015                         for (value = 0; value < 256; value++)
4016                             if (!isSPACE(value))
4017                                 ANYOF_BITMAP_SET(ret, value);
4018                     }
4019                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4020                     break;
4021                 case ANYOF_UPPER:
4022                     if (LOC)
4023                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4024                     else {
4025                         for (value = 0; value < 256; value++)
4026                             if (isUPPER(value))
4027                                 ANYOF_BITMAP_SET(ret, value);
4028                     }
4029                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4030                     break;
4031                 case ANYOF_NUPPER:
4032                     if (LOC)
4033                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4034                     else {
4035                         for (value = 0; value < 256; value++)
4036                             if (!isUPPER(value))
4037                                 ANYOF_BITMAP_SET(ret, value);
4038                     }
4039                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4040                     break;
4041                 case ANYOF_XDIGIT:
4042                     if (LOC)
4043                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4044                     else {
4045                         for (value = 0; value < 256; value++)
4046                             if (isXDIGIT(value))
4047                                 ANYOF_BITMAP_SET(ret, value);
4048                     }
4049                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4050                     break;
4051                 case ANYOF_NXDIGIT:
4052                     if (LOC)
4053                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4054                     else {
4055                         for (value = 0; value < 256; value++)
4056                             if (!isXDIGIT(value))
4057                                 ANYOF_BITMAP_SET(ret, value);
4058                     }
4059                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4060                     break;
4061                 default:
4062                     vFAIL("Invalid [::] class");
4063                     break;
4064                 }
4065                 if (LOC)
4066                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4067                 continue;
4068             }
4069         } /* end of namedclass \blah */
4070
4071         if (range) {
4072             if (prevvalue > (IV)value) /* b-a */ {
4073                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4074                               RExC_parse - rangebegin,
4075                               RExC_parse - rangebegin,
4076                               rangebegin);
4077                 range = 0; /* not a valid range */
4078             }
4079         }
4080         else {
4081             prevvalue = value; /* save the beginning of the range */
4082             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4083                 RExC_parse[1] != ']') {
4084                 RExC_parse++;
4085
4086                 /* a bad range like \w-, [:word:]- ? */
4087                 if (namedclass > OOB_NAMEDCLASS) {
4088                     if (ckWARN(WARN_REGEXP))
4089                         vWARN4(RExC_parse,
4090                                "False [] range \"%*.*s\"",
4091                                RExC_parse - rangebegin,
4092                                RExC_parse - rangebegin,
4093                                rangebegin);
4094                     if (!SIZE_ONLY)
4095                         ANYOF_BITMAP_SET(ret, '-');
4096                 } else
4097                     range = 1;  /* yeah, it's a range! */
4098                 continue;       /* but do it the next time */
4099             }
4100         }
4101
4102         /* now is the next time */
4103         if (!SIZE_ONLY) {
4104             IV i;
4105
4106             if (prevvalue < 256) {
4107                 IV ceilvalue = value < 256 ? value : 255;
4108
4109 #ifdef EBCDIC
4110                 /* In EBCDIC [\x89-\x91] should include
4111                  * the \x8e but [i-j] should not. */
4112                 if (literal_endpoint == 2 &&
4113                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4114                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4115                 {
4116                     if (isLOWER(prevvalue)) {
4117                         for (i = prevvalue; i <= ceilvalue; i++)
4118                             if (isLOWER(i))
4119                                 ANYOF_BITMAP_SET(ret, i);
4120                     } else {
4121                         for (i = prevvalue; i <= ceilvalue; i++)
4122                             if (isUPPER(i))
4123                                 ANYOF_BITMAP_SET(ret, i);
4124                     }
4125                 }
4126                 else
4127 #endif
4128                       for (i = prevvalue; i <= ceilvalue; i++)
4129                           ANYOF_BITMAP_SET(ret, i);
4130           }
4131           if (value > 255 || UTF) {
4132                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4133                 UV natvalue      = NATIVE_TO_UNI(value);
4134
4135                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4136                 if (prevnatvalue < natvalue) { /* what about > ? */
4137                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4138                                    prevnatvalue, natvalue);
4139                 }
4140                 else if (prevnatvalue == natvalue) {
4141                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4142                     if (FOLD) {
4143                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4144                          STRLEN foldlen;
4145                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4146
4147                          /* If folding and foldable and a single
4148                           * character, insert also the folded version
4149                           * to the charclass. */
4150                          if (f != value) {
4151                               if (foldlen == (STRLEN)UNISKIP(f))
4152                                   Perl_sv_catpvf(aTHX_ listsv,
4153                                                  "%04"UVxf"\n", f);
4154                               else {
4155                                   /* Any multicharacter foldings
4156                                    * require the following transform:
4157                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4158                                    * where E folds into "pq" and F folds
4159                                    * into "rst", all other characters
4160                                    * fold to single characters.  We save
4161                                    * away these multicharacter foldings,
4162                                    * to be later saved as part of the
4163                                    * additional "s" data. */
4164                                   SV *sv;
4165
4166                                   if (!unicode_alternate)
4167                                       unicode_alternate = newAV();
4168                                   sv = newSVpvn((char*)foldbuf, foldlen);
4169                                   SvUTF8_on(sv);
4170                                   av_push(unicode_alternate, sv);
4171                               }
4172                          }
4173
4174                          /* If folding and the value is one of the Greek
4175                           * sigmas insert a few more sigmas to make the
4176                           * folding rules of the sigmas to work right.
4177                           * Note that not all the possible combinations
4178                           * are handled here: some of them are handled
4179                           * by the standard folding rules, and some of
4180                           * them (literal or EXACTF cases) are handled
4181                           * during runtime in regexec.c:S_find_byclass(). */
4182                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4183                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4184                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4185                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4186                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4187                          }
4188                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4189                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4190                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4191                     }
4192                 }
4193             }
4194 #ifdef EBCDIC
4195             literal_endpoint = 0;
4196 #endif
4197         }
4198
4199         range = 0; /* this range (if it was one) is done now */
4200     }
4201
4202     if (need_class) {
4203         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4204         if (SIZE_ONLY)
4205             RExC_size += ANYOF_CLASS_ADD_SKIP;
4206         else
4207             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4208     }
4209
4210     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4211     if (!SIZE_ONLY &&
4212          /* If the only flag is folding (plus possibly inversion). */
4213         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4214        ) {
4215         for (value = 0; value < 256; ++value) {
4216             if (ANYOF_BITMAP_TEST(ret, value)) {
4217                 UV fold = PL_fold[value];
4218
4219                 if (fold != value)
4220                     ANYOF_BITMAP_SET(ret, fold);
4221             }
4222         }
4223         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4224     }
4225
4226     /* optimize inverted simple patterns (e.g. [^a-z]) */
4227     if (!SIZE_ONLY && optimize_invert &&
4228         /* If the only flag is inversion. */
4229         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4230         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4231             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4232         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4233     }
4234
4235     if (!SIZE_ONLY) {
4236         AV *av = newAV();
4237         SV *rv;
4238
4239         /* The 0th element stores the character class description
4240          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4241          * to initialize the appropriate swash (which gets stored in
4242          * the 1st element), and also useful for dumping the regnode.
4243          * The 2nd element stores the multicharacter foldings,
4244          * used later (regexec.c:S_reginclass()). */
4245         av_store(av, 0, listsv);
4246         av_store(av, 1, NULL);
4247         av_store(av, 2, (SV*)unicode_alternate);
4248         rv = newRV_noinc((SV*)av);
4249         n = add_data(pRExC_state, 1, "s");
4250         RExC_rx->data->data[n] = (void*)rv;
4251         ARG_SET(ret, n);
4252     }
4253
4254     return ret;
4255 }
4256
4257 STATIC char*
4258 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4259 {
4260     char* retval = RExC_parse++;
4261
4262     for (;;) {
4263         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4264                 RExC_parse[2] == '#') {
4265             while (*RExC_parse != ')') {
4266                 if (RExC_parse == RExC_end)
4267                     FAIL("Sequence (?#... not terminated");
4268                 RExC_parse++;
4269             }
4270             RExC_parse++;
4271             continue;
4272         }
4273         if (RExC_flags & PMf_EXTENDED) {
4274             if (isSPACE(*RExC_parse)) {
4275                 RExC_parse++;
4276                 continue;
4277             }
4278             else if (*RExC_parse == '#') {
4279                 while (RExC_parse < RExC_end)
4280                     if (*RExC_parse++ == '\n') break;
4281                 continue;
4282             }
4283         }
4284         return retval;
4285     }
4286 }
4287
4288 /*
4289 - reg_node - emit a node
4290 */
4291 STATIC regnode *                        /* Location. */
4292 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4293 {
4294     register regnode *ret;
4295     register regnode *ptr;
4296
4297     ret = RExC_emit;
4298     if (SIZE_ONLY) {
4299         SIZE_ALIGN(RExC_size);
4300         RExC_size += 1;
4301         return(ret);
4302     }
4303
4304     NODE_ALIGN_FILL(ret);
4305     ptr = ret;
4306     FILL_ADVANCE_NODE(ptr, op);
4307     if (RExC_offsets) {         /* MJD */
4308         MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4309               "reg_node", __LINE__, 
4310               reg_name[op],
4311               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4312               ? "Overwriting end of array!\n" : "OK",
4313               RExC_emit - RExC_emit_start,
4314               RExC_parse - RExC_start,
4315               RExC_offsets[0])); 
4316         Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4317     }
4318             
4319     RExC_emit = ptr;
4320
4321     return(ret);
4322 }
4323
4324 /*
4325 - reganode - emit a node with an argument
4326 */
4327 STATIC regnode *                        /* Location. */
4328 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4329 {
4330     register regnode *ret;
4331     register regnode *ptr;
4332
4333     ret = RExC_emit;
4334     if (SIZE_ONLY) {
4335         SIZE_ALIGN(RExC_size);
4336         RExC_size += 2;
4337         return(ret);
4338     }
4339
4340     NODE_ALIGN_FILL(ret);
4341     ptr = ret;
4342     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4343     if (RExC_offsets) {         /* MJD */
4344         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4345               "reganode",
4346               __LINE__,
4347               reg_name[op],
4348               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4349               "Overwriting end of array!\n" : "OK",
4350               RExC_emit - RExC_emit_start,
4351               RExC_parse - RExC_start,
4352               RExC_offsets[0])); 
4353         Set_Cur_Node_Offset;
4354     }
4355             
4356     RExC_emit = ptr;
4357
4358     return(ret);
4359 }
4360
4361 /*
4362 - reguni - emit (if appropriate) a Unicode character
4363 */
4364 STATIC void
4365 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4366 {
4367     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4368 }
4369
4370 /*
4371 - reginsert - insert an operator in front of already-emitted operand
4372 *
4373 * Means relocating the operand.
4374 */
4375 STATIC void
4376 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4377 {
4378     register regnode *src;
4379     register regnode *dst;
4380     register regnode *place;
4381     register int offset = regarglen[(U8)op];
4382
4383 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4384
4385     if (SIZE_ONLY) {
4386         RExC_size += NODE_STEP_REGNODE + offset;
4387         return;
4388     }
4389
4390     src = RExC_emit;
4391     RExC_emit += NODE_STEP_REGNODE + offset;
4392     dst = RExC_emit;
4393     while (src > opnd) {
4394         StructCopy(--src, --dst, regnode);
4395         if (RExC_offsets) {     /* MJD 20010112 */
4396             MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4397                   "reg_insert",
4398                   __LINE__,
4399                   reg_name[op],
4400                   dst - RExC_emit_start > RExC_offsets[0] 
4401                   ? "Overwriting end of array!\n" : "OK",
4402                   src - RExC_emit_start,
4403                   dst - RExC_emit_start,
4404                   RExC_offsets[0])); 
4405             Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4406             Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4407         }
4408     }
4409     
4410
4411     place = opnd;               /* Op node, where operand used to be. */
4412     if (RExC_offsets) {         /* MJD */
4413         MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n", 
4414               "reginsert",
4415               __LINE__,
4416               reg_name[op],
4417               place - RExC_emit_start > RExC_offsets[0] 
4418               ? "Overwriting end of array!\n" : "OK",
4419               place - RExC_emit_start,
4420               RExC_parse - RExC_start,
4421               RExC_offsets[0])); 
4422         Set_Node_Offset(place, RExC_parse);
4423     }
4424     src = NEXTOPER(place);
4425     FILL_ADVANCE_NODE(place, op);
4426     Zero(src, offset, regnode);
4427 }
4428
4429 /*
4430 - regtail - set the next-pointer at the end of a node chain of p to val.
4431 */
4432 STATIC void
4433 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4434 {
4435     register regnode *scan;
4436     register regnode *temp;
4437
4438     if (SIZE_ONLY)
4439         return;
4440
4441     /* Find last node. */
4442     scan = p;
4443     for (;;) {
4444         temp = regnext(scan);
4445         if (temp == NULL)
4446             break;
4447         scan = temp;
4448     }
4449
4450     if (reg_off_by_arg[OP(scan)]) {
4451         ARG_SET(scan, val - scan);
4452     }
4453     else {
4454         NEXT_OFF(scan) = val - scan;
4455     }
4456 }
4457
4458 /*
4459 - regoptail - regtail on operand of first argument; nop if operandless
4460 */
4461 STATIC void
4462 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4463 {
4464     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4465     if (p == NULL || SIZE_ONLY)
4466         return;
4467     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4468         regtail(pRExC_state, NEXTOPER(p), val);
4469     }
4470     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4471         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4472     }
4473     else
4474         return;
4475 }
4476
4477 /*
4478  - regcurly - a little FSA that accepts {\d+,?\d*}
4479  */
4480 STATIC I32
4481 S_regcurly(pTHX_ register char *s)
4482 {
4483     if (*s++ != '{')
4484         return FALSE;
4485     if (!isDIGIT(*s))
4486         return FALSE;
4487     while (isDIGIT(*s))
4488         s++;
4489     if (*s == ',')
4490         s++;
4491     while (isDIGIT(*s))
4492         s++;
4493     if (*s != '}')
4494         return FALSE;
4495     return TRUE;
4496 }
4497
4498
4499 #ifdef DEBUGGING
4500
4501 STATIC regnode *
4502 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4503 {
4504     register U8 op = EXACT;     /* Arbitrary non-END op. */
4505     register regnode *next;
4506
4507     while (op != END && (!last || node < last)) {
4508         /* While that wasn't END last time... */
4509
4510         NODE_ALIGN(node);
4511         op = OP(node);
4512         if (op == CLOSE)
4513             l--;        
4514         next = regnext(node);
4515         /* Where, what. */
4516         if (OP(node) == OPTIMIZED)
4517             goto after_print;
4518         regprop(sv, node);
4519         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4520                       (int)(2*l + 1), "", SvPVX(sv));
4521         if (next == NULL)               /* Next ptr. */
4522             PerlIO_printf(Perl_debug_log, "(0)");
4523         else
4524             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4525         (void)PerlIO_putc(Perl_debug_log, '\n');
4526       after_print:
4527         if (PL_regkind[(U8)op] == BRANCHJ) {
4528             register regnode *nnode = (OP(next) == LONGJMP
4529                                        ? regnext(next)
4530                                        : next);
4531             if (last && nnode > last)
4532                 nnode = last;
4533             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4534         }
4535         else if (PL_regkind[(U8)op] == BRANCH) {
4536             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4537         }
4538         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4539             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4540                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4541         }
4542         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4543             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4544                              next, sv, l + 1);
4545         }
4546         else if ( op == PLUS || op == STAR) {
4547             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4548         }
4549         else if (op == ANYOF) {
4550             /* arglen 1 + class block */
4551             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4552                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4553             node = NEXTOPER(node);
4554         }
4555         else if (PL_regkind[(U8)op] == EXACT) {
4556             /* Literal string, where present. */
4557             node += NODE_SZ_STR(node) - 1;
4558             node = NEXTOPER(node);
4559         }
4560         else {
4561             node = NEXTOPER(node);
4562             node += regarglen[(U8)op];
4563         }
4564         if (op == CURLYX || op == OPEN)
4565             l++;
4566         else if (op == WHILEM)
4567             l--;
4568     }
4569     return node;
4570 }
4571
4572 #endif  /* DEBUGGING */
4573
4574 /*
4575  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4576  */
4577 void
4578 Perl_regdump(pTHX_ regexp *r)
4579 {
4580 #ifdef DEBUGGING
4581     SV *sv = sv_newmortal();
4582
4583     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4584
4585     /* Header fields of interest. */
4586     if (r->anchored_substr)
4587         PerlIO_printf(Perl_debug_log,
4588                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4589                       PL_colors[0],
4590                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4591                       SvPVX(r->anchored_substr),
4592                       PL_colors[1],
4593                       SvTAIL(r->anchored_substr) ? "$" : "",
4594                       (IV)r->anchored_offset);
4595     else if (r->anchored_utf8)
4596         PerlIO_printf(Perl_debug_log,
4597                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4598                       PL_colors[0],
4599                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4600                       SvPVX(r->anchored_utf8),
4601                       PL_colors[1],
4602                       SvTAIL(r->anchored_utf8) ? "$" : "",
4603                       (IV)r->anchored_offset);
4604     if (r->float_substr)
4605         PerlIO_printf(Perl_debug_log,
4606                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4607                       PL_colors[0],
4608                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4609                       SvPVX(r->float_substr),
4610                       PL_colors[1],
4611                       SvTAIL(r->float_substr) ? "$" : "",
4612                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4613     else if (r->float_utf8)
4614         PerlIO_printf(Perl_debug_log,
4615                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4616                       PL_colors[0],
4617                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4618                       SvPVX(r->float_utf8),
4619                       PL_colors[1],
4620                       SvTAIL(r->float_utf8) ? "$" : "",
4621                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4622     if (r->check_substr || r->check_utf8)
4623         PerlIO_printf(Perl_debug_log,
4624                       r->check_substr == r->float_substr
4625                       && r->check_utf8 == r->float_utf8
4626                       ? "(checking floating" : "(checking anchored");
4627     if (r->reganch & ROPT_NOSCAN)
4628         PerlIO_printf(Perl_debug_log, " noscan");
4629     if (r->reganch & ROPT_CHECK_ALL)
4630         PerlIO_printf(Perl_debug_log, " isall");
4631     if (r->check_substr || r->check_utf8)
4632         PerlIO_printf(Perl_debug_log, ") ");
4633
4634     if (r->regstclass) {
4635         regprop(sv, r->regstclass);
4636         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4637     }
4638     if (r->reganch & ROPT_ANCH) {
4639         PerlIO_printf(Perl_debug_log, "anchored");
4640         if (r->reganch & ROPT_ANCH_BOL)
4641             PerlIO_printf(Perl_debug_log, "(BOL)");
4642         if (r->reganch & ROPT_ANCH_MBOL)
4643             PerlIO_printf(Perl_debug_log, "(MBOL)");
4644         if (r->reganch & ROPT_ANCH_SBOL)
4645             PerlIO_printf(Perl_debug_log, "(SBOL)");
4646         if (r->reganch & ROPT_ANCH_GPOS)
4647             PerlIO_printf(Perl_debug_log, "(GPOS)");
4648         PerlIO_putc(Perl_debug_log, ' ');
4649     }
4650     if (r->reganch & ROPT_GPOS_SEEN)
4651         PerlIO_printf(Perl_debug_log, "GPOS ");
4652     if (r->reganch & ROPT_SKIP)
4653         PerlIO_printf(Perl_debug_log, "plus ");
4654     if (r->reganch & ROPT_IMPLICIT)
4655         PerlIO_printf(Perl_debug_log, "implicit ");
4656     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4657     if (r->reganch & ROPT_EVAL_SEEN)
4658         PerlIO_printf(Perl_debug_log, "with eval ");
4659     PerlIO_printf(Perl_debug_log, "\n");
4660     if (r->offsets) {
4661       U32 i;
4662       U32 len = r->offsets[0];
4663       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4664       for (i = 1; i <= len; i++)
4665         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4666                       (UV)r->offsets[i*2-1], 
4667                       (UV)r->offsets[i*2]);
4668       PerlIO_printf(Perl_debug_log, "\n");
4669     }
4670 #endif  /* DEBUGGING */
4671 }
4672
4673 #ifdef DEBUGGING
4674
4675 STATIC void
4676 S_put_byte(pTHX_ SV *sv, int c)
4677 {
4678     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4679         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4680     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4681         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4682     else
4683         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4684 }
4685
4686 #endif  /* DEBUGGING */
4687
4688 /*
4689 - regprop - printable representation of opcode
4690 */
4691 void
4692 Perl_regprop(pTHX_ SV *sv, regnode *o)
4693 {
4694 #ifdef DEBUGGING
4695     register int k;
4696
4697     sv_setpvn(sv, "", 0);
4698     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4699         /* It would be nice to FAIL() here, but this may be called from
4700            regexec.c, and it would be hard to supply pRExC_state. */
4701         Perl_croak(aTHX_ "Corrupted regexp opcode");
4702     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4703
4704     k = PL_regkind[(U8)OP(o)];
4705
4706     if (k == EXACT) {
4707         SV *dsv = sv_2mortal(newSVpvn("", 0));
4708         /* Using is_utf8_string() is a crude hack but it may
4709          * be the best for now since we have no flag "this EXACTish
4710          * node was UTF-8" --jhi */
4711         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4712         char *s    = do_utf8 ?
4713           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4714                          UNI_DISPLAY_REGEX) :
4715           STRING(o);
4716         int len = do_utf8 ?
4717           strlen(s) :
4718           STR_LEN(o);
4719         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4720                        PL_colors[0],
4721                        len, s,
4722                        PL_colors[1]);
4723     }
4724     else if (k == CURLY) {
4725         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4726             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4727         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4728     }
4729     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4730         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4731     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4732         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4733     else if (k == LOGICAL)
4734         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4735     else if (k == ANYOF) {
4736         int i, rangestart = -1;
4737         U8 flags = ANYOF_FLAGS(o);
4738         const char * const anyofs[] = { /* Should be synchronized with
4739                                          * ANYOF_ #xdefines in regcomp.h */
4740             "\\w",
4741             "\\W",
4742             "\\s",
4743             "\\S",
4744             "\\d",
4745             "\\D",
4746             "[:alnum:]",
4747             "[:^alnum:]",
4748             "[:alpha:]",
4749             "[:^alpha:]",
4750             "[:ascii:]",
4751             "[:^ascii:]",
4752             "[:ctrl:]",
4753             "[:^ctrl:]",
4754             "[:graph:]",
4755             "[:^graph:]",
4756             "[:lower:]",
4757             "[:^lower:]",
4758             "[:print:]",
4759             "[:^print:]",
4760             "[:punct:]",
4761             "[:^punct:]",
4762             "[:upper:]",
4763             "[:^upper:]",
4764             "[:xdigit:]",
4765             "[:^xdigit:]",
4766             "[:space:]",
4767             "[:^space:]",
4768             "[:blank:]",
4769             "[:^blank:]"
4770         };
4771
4772         if (flags & ANYOF_LOCALE)
4773             sv_catpv(sv, "{loc}");
4774         if (flags & ANYOF_FOLD)
4775             sv_catpv(sv, "{i}");
4776         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4777         if (flags & ANYOF_INVERT)
4778             sv_catpv(sv, "^");
4779         for (i = 0; i <= 256; i++) {
4780             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4781                 if (rangestart == -1)
4782                     rangestart = i;
4783             } else if (rangestart != -1) {
4784                 if (i <= rangestart + 3)
4785                     for (; rangestart < i; rangestart++)
4786                         put_byte(sv, rangestart);
4787                 else {
4788                     put_byte(sv, rangestart);
4789                     sv_catpv(sv, "-");
4790                     put_byte(sv, i - 1);
4791                 }
4792                 rangestart = -1;
4793             }
4794         }
4795
4796         if (o->flags & ANYOF_CLASS)
4797             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4798                 if (ANYOF_CLASS_TEST(o,i))
4799                     sv_catpv(sv, anyofs[i]);
4800
4801         if (flags & ANYOF_UNICODE)
4802             sv_catpv(sv, "{unicode}");
4803         else if (flags & ANYOF_UNICODE_ALL)
4804             sv_catpv(sv, "{unicode_all}");
4805
4806         {
4807             SV *lv;
4808             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4809         
4810             if (lv) {
4811                 if (sw) {
4812                     U8 s[UTF8_MAXLEN+1];
4813                 
4814                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4815                         U8 *e = uvchr_to_utf8(s, i);
4816                         
4817                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4818                             if (rangestart == -1)
4819                                 rangestart = i;
4820                         } else if (rangestart != -1) {
4821                             U8 *p;
4822                         
4823                             if (i <= rangestart + 3)
4824                                 for (; rangestart < i; rangestart++) {
4825                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4826                                         put_byte(sv, *p);
4827                                 }
4828                             else {
4829                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4830                                     put_byte(sv, *p);
4831                                 sv_catpv(sv, "-");
4832                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4833                                         put_byte(sv, *p);
4834                                 }
4835                                 rangestart = -1;
4836                             }
4837                         }
4838                         
4839                     sv_catpv(sv, "..."); /* et cetera */
4840                 }
4841
4842                 {
4843                     char *s = savepv(SvPVX(lv));
4844                     char *origs = s;
4845                 
4846                     while(*s && *s != '\n') s++;
4847                 
4848                     if (*s == '\n') {
4849                         char *t = ++s;
4850                         
4851                         while (*s) {
4852                             if (*s == '\n')
4853                                 *s = ' ';
4854                             s++;
4855                         }
4856                         if (s[-1] == ' ')
4857                             s[-1] = 0;
4858                         
4859                         sv_catpv(sv, t);
4860                     }
4861                 
4862                     Safefree(origs);
4863                 }
4864             }
4865         }
4866
4867         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4868     }
4869     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4870         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4871 #endif  /* DEBUGGING */
4872 }
4873
4874 SV *
4875 Perl_re_intuit_string(pTHX_ regexp *prog)
4876 {                               /* Assume that RE_INTUIT is set */
4877     DEBUG_r(
4878         {   STRLEN n_a;
4879             char *s = SvPV(prog->check_substr
4880                       ? prog->check_substr : prog->check_utf8, n_a);
4881
4882             if (!PL_colorset) reginitcolors();
4883             PerlIO_printf(Perl_debug_log,
4884                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4885                       PL_colors[4],
4886                       prog->check_substr ? "" : "utf8 ",
4887                       PL_colors[5],PL_colors[0],
4888                       s,
4889                       PL_colors[1],
4890                       (strlen(s) > 60 ? "..." : ""));
4891         } );
4892
4893     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4894 }
4895
4896 void
4897 Perl_pregfree(pTHX_ struct regexp *r)
4898 {
4899 #ifdef DEBUGGING
4900     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4901 #endif
4902
4903     if (!r || (--r->refcnt > 0))
4904         return;
4905     DEBUG_r({
4906          int len;
4907          char *s;
4908
4909          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4910                 r->prelen, 60, UNI_DISPLAY_REGEX)
4911             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4912          len = SvCUR(dsv);
4913          if (!PL_colorset)
4914               reginitcolors();
4915          PerlIO_printf(Perl_debug_log,
4916                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4917                        PL_colors[4],PL_colors[5],PL_colors[0],
4918                        len, len, s,
4919                        PL_colors[1],
4920                        len > 60 ? "..." : "");
4921     });
4922
4923     if (r->precomp)
4924         Safefree(r->precomp);
4925     if (r->offsets)             /* 20010421 MJD */
4926         Safefree(r->offsets);
4927     if (RX_MATCH_COPIED(r))
4928         Safefree(r->subbeg);
4929     if (r->substrs) {
4930         if (r->anchored_substr)
4931             SvREFCNT_dec(r->anchored_substr);
4932         if (r->anchored_utf8)
4933             SvREFCNT_dec(r->anchored_utf8);
4934         if (r->float_substr)
4935             SvREFCNT_dec(r->float_substr);
4936         if (r->float_utf8)
4937             SvREFCNT_dec(r->float_utf8);
4938         Safefree(r->substrs);
4939     }
4940     if (r->data) {
4941         int n = r->data->count;
4942         PAD* new_comppad = NULL;
4943         PAD* old_comppad;
4944
4945         while (--n >= 0) {
4946           /* If you add a ->what type here, update the comment in regcomp.h */
4947             switch (r->data->what[n]) {
4948             case 's':
4949                 SvREFCNT_dec((SV*)r->data->data[n]);
4950                 break;
4951             case 'f':
4952                 Safefree(r->data->data[n]);
4953                 break;
4954             case 'p':
4955                 new_comppad = (AV*)r->data->data[n];
4956                 break;
4957             case 'o':
4958                 if (new_comppad == NULL)
4959                     Perl_croak(aTHX_ "panic: pregfree comppad");
4960                 PAD_SAVE_LOCAL(old_comppad,
4961                     /* Watch out for global destruction's random ordering. */
4962                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4963                                 new_comppad : Null(PAD *)
4964                 );
4965                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4966                     op_free((OP_4tree*)r->data->data[n]);
4967                 }
4968
4969                 PAD_RESTORE_LOCAL(old_comppad);
4970                 SvREFCNT_dec((SV*)new_comppad);
4971                 new_comppad = NULL;
4972                 break;
4973             case 'n':
4974                 break;
4975             default:
4976                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4977             }
4978         }
4979         Safefree(r->data->what);
4980         Safefree(r->data);
4981     }
4982     Safefree(r->startp);
4983     Safefree(r->endp);
4984     Safefree(r);
4985 }
4986
4987 /*
4988  - regnext - dig the "next" pointer out of a node
4989  *
4990  * [Note, when REGALIGN is defined there are two places in regmatch()
4991  * that bypass this code for speed.]
4992  */
4993 regnode *
4994 Perl_regnext(pTHX_ register regnode *p)
4995 {
4996     register I32 offset;
4997
4998     if (p == &PL_regdummy)
4999         return(NULL);
5000
5001     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5002     if (offset == 0)
5003         return(NULL);
5004
5005     return(p+offset);
5006 }
5007
5008 STATIC void     
5009 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5010 {
5011     va_list args;
5012     STRLEN l1 = strlen(pat1);
5013     STRLEN l2 = strlen(pat2);
5014     char buf[512];
5015     SV *msv;
5016     char *message;
5017
5018     if (l1 > 510)
5019         l1 = 510;
5020     if (l1 + l2 > 510)
5021         l2 = 510 - l1;
5022     Copy(pat1, buf, l1 , char);
5023     Copy(pat2, buf + l1, l2 , char);
5024     buf[l1 + l2] = '\n';
5025     buf[l1 + l2 + 1] = '\0';
5026 #ifdef I_STDARG
5027     /* ANSI variant takes additional second argument */
5028     va_start(args, pat2);
5029 #else
5030     va_start(args);
5031 #endif
5032     msv = vmess(buf, &args);
5033     va_end(args);
5034     message = SvPV(msv,l1);
5035     if (l1 > 512)
5036         l1 = 512;
5037     Copy(message, buf, l1 , char);
5038     buf[l1] = '\0';                     /* Overwrite \n */
5039     Perl_croak(aTHX_ "%s", buf);
5040 }
5041
5042 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5043
5044 void
5045 Perl_save_re_context(pTHX)
5046 {
5047     SAVEI32(PL_reg_flags);              /* from regexec.c */
5048     SAVEPPTR(PL_bostr);
5049     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5050     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5051     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5052     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5053     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5054     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5055     SAVEVPTR(PL_reglastcloseparen);     /* Similarly for lastcloseparen. */
5056     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5057     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5058     PL_reg_start_tmp = 0;
5059     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5060     PL_reg_start_tmpl = 0;
5061     SAVEVPTR(PL_regdata);
5062     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5063     SAVEI32(PL_regnarrate);             /* from regexec.c */
5064     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5065     SAVEINT(PL_regindent);              /* from regexec.c */
5066     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5067     SAVEVPTR(PL_curcop);
5068     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5069     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5070     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5071     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5072     SAVEBOOL(PL_reg_match_utf8);        /* from regexec.c */
5073     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5074     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5075     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5076     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5077     SAVEPPTR(PL_reg_oldsaved);          /* old saved substr during match */
5078     PL_reg_oldsaved = Nullch;
5079     SAVEI32(PL_reg_oldsavedlen);        /* old length of saved substr during match */
5080     PL_reg_oldsavedlen = 0;
5081     SAVEI32(PL_reg_maxiter);            /* max wait until caching pos */
5082     PL_reg_maxiter = 0;
5083     SAVEI32(PL_reg_leftiter);           /* wait until caching pos */
5084     PL_reg_leftiter = 0;
5085     SAVEGENERICPV(PL_reg_poscache);     /* cache of pos of WHILEM */
5086     PL_reg_poscache = Nullch;
5087     SAVEI32(PL_reg_poscache_size);      /* size of pos cache of WHILEM */
5088     PL_reg_poscache_size = 0;
5089     SAVEPPTR(PL_regprecomp);            /* uncompiled string. */
5090     SAVEI32(PL_regnpar);                /* () count. */
5091     SAVEI32(PL_regsize);                /* from regexec.c */
5092
5093     {
5094         /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5095         U32 i;
5096         GV *mgv;
5097         REGEXP *rx;
5098         char digits[16];
5099
5100         if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5101             for (i = 1; i <= rx->nparens; i++) {
5102                 sprintf(digits, "%lu", (long)i);
5103                 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5104                     save_scalar(mgv);
5105             }
5106         }
5107     }
5108
5109 #ifdef DEBUGGING
5110     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5111 #endif
5112 }
5113
5114 static void
5115 clear_re(pTHX_ void *r)
5116 {
5117     ReREFCNT_dec((regexp *)r);
5118 }
5119