This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Documentation added.
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if ((U32)data->offset_float_max > (U32)I32_MAX)
509                 data->offset_float_max = I32_MAX;
510             if (data->flags & SF_BEFORE_EOL)
511                 data->flags
512                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513             else
514                 data->flags &= ~SF_FL_BEFORE_EOL;
515         }
516     }
517     SvCUR_set(data->last_found, 0);
518     data->last_end = -1;
519     data->flags &= ~SF_BEFORE_EOL;
520 }
521
522 /* Can match anything (initialization) */
523 STATIC void
524 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
525 {
526     ANYOF_CLASS_ZERO(cl);
527     ANYOF_BITMAP_SETALL(cl);
528     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
529     if (LOC)
530         cl->flags |= ANYOF_LOCALE;
531 }
532
533 /* Can match anything (initialization) */
534 STATIC int
535 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
536 {
537     int value;
538
539     for (value = 0; value <= ANYOF_MAX; value += 2)
540         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541             return 1;
542     if (!(cl->flags & ANYOF_UNICODE_ALL))
543         return 0;
544     if (!ANYOF_BITMAP_TESTALLSET(cl))
545         return 0;
546     return 1;
547 }
548
549 /* Can match anything (initialization) */
550 STATIC void
551 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 {
553     Zero(cl, 1, struct regnode_charclass_class);
554     cl->type = ANYOF;
555     cl_anything(pRExC_state, cl);
556 }
557
558 STATIC void
559 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
560 {
561     Zero(cl, 1, struct regnode_charclass_class);
562     cl->type = ANYOF;
563     cl_anything(pRExC_state, cl);
564     if (LOC)
565         cl->flags |= ANYOF_LOCALE;
566 }
567
568 /* 'And' a given class with another one.  Can create false positives */
569 /* We assume that cl is not inverted */
570 STATIC void
571 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572          struct regnode_charclass_class *and_with)
573 {
574     if (!(and_with->flags & ANYOF_CLASS)
575         && !(cl->flags & ANYOF_CLASS)
576         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577         && !(and_with->flags & ANYOF_FOLD)
578         && !(cl->flags & ANYOF_FOLD)) {
579         int i;
580
581         if (and_with->flags & ANYOF_INVERT)
582             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583                 cl->bitmap[i] &= ~and_with->bitmap[i];
584         else
585             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586                 cl->bitmap[i] &= and_with->bitmap[i];
587     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588     if (!(and_with->flags & ANYOF_EOS))
589         cl->flags &= ~ANYOF_EOS;
590
591     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592         cl->flags &= ~ANYOF_UNICODE_ALL;
593         cl->flags |= ANYOF_UNICODE;
594         ARG_SET(cl, ARG(and_with));
595     }
596     if (!(and_with->flags & ANYOF_UNICODE_ALL))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599         cl->flags &= ~ANYOF_UNICODE;
600 }
601
602 /* 'OR' a given class with another one.  Can create false positives */
603 /* We assume that cl is not inverted */
604 STATIC void
605 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
606 {
607     if (or_with->flags & ANYOF_INVERT) {
608         /* We do not use
609          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610          *   <= (B1 | !B2) | (CL1 | !CL2)
611          * which is wasteful if CL2 is small, but we ignore CL2:
612          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613          * XXXX Can we handle case-fold?  Unclear:
614          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616          */
617         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618              && !(or_with->flags & ANYOF_FOLD)
619              && !(cl->flags & ANYOF_FOLD) ) {
620             int i;
621
622             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623                 cl->bitmap[i] |= ~or_with->bitmap[i];
624         } /* XXXX: logic is complicated otherwise */
625         else {
626             cl_anything(pRExC_state, cl);
627         }
628     } else {
629         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
631              && (!(or_with->flags & ANYOF_FOLD)
632                  || (cl->flags & ANYOF_FOLD)) ) {
633             int i;
634
635             /* OR char bitmap and class bitmap separately */
636             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637                 cl->bitmap[i] |= or_with->bitmap[i];
638             if (or_with->flags & ANYOF_CLASS) {
639                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640                     cl->classflags[i] |= or_with->classflags[i];
641                 cl->flags |= ANYOF_CLASS;
642             }
643         }
644         else { /* XXXX: logic is complicated, leave it along for a moment. */
645             cl_anything(pRExC_state, cl);
646         }
647     }
648     if (or_with->flags & ANYOF_EOS)
649         cl->flags |= ANYOF_EOS;
650
651     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652         ARG(cl) != ARG(or_with)) {
653         cl->flags |= ANYOF_UNICODE_ALL;
654         cl->flags &= ~ANYOF_UNICODE;
655     }
656     if (or_with->flags & ANYOF_UNICODE_ALL) {
657         cl->flags |= ANYOF_UNICODE_ALL;
658         cl->flags &= ~ANYOF_UNICODE;
659     }
660 }
661
662 /*
663  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664  * These need to be revisited when a newer toolchain becomes available.
665  */
666 #if defined(__sparc64__) && defined(__GNUC__)
667 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668 #       undef  SPARC64_GCC_WORKAROUND
669 #       define SPARC64_GCC_WORKAROUND 1
670 #   endif
671 #endif
672
673 /* REx optimizer.  Converts nodes into quickier variants "in place".
674    Finds fixed substrings.  */
675
676 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677    to the position after last scanned or to NULL. */
678
679 STATIC I32
680 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
681                         /* scanp: Start here (read-write). */
682                         /* deltap: Write maxlen-minlen here. */
683                         /* last: Stop before this one. */
684 {
685     I32 min = 0, pars = 0, code;
686     regnode *scan = *scanp, *next;
687     I32 delta = 0;
688     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
689     int is_inf_internal = 0;            /* The studied chunk is infinite */
690     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691     scan_data_t data_fake;
692     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
693
694     while (scan && OP(scan) != END && scan < last) {
695         /* Peephole optimizer: */
696
697         if (PL_regkind[(U8)OP(scan)] == EXACT) {
698             /* Merge several consecutive EXACTish nodes into one. */
699             regnode *n = regnext(scan);
700             U32 stringok = 1;
701 #ifdef DEBUGGING
702             regnode *stop = scan;
703 #endif
704
705             next = scan + NODE_SZ_STR(scan);
706             /* Skip NOTHING, merge EXACT*. */
707             while (n &&
708                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
709                      (stringok && (OP(n) == OP(scan))))
710                    && NEXT_OFF(n)
711                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712                 if (OP(n) == TAIL || n > next)
713                     stringok = 0;
714                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
715                     NEXT_OFF(scan) += NEXT_OFF(n);
716                     next = n + NODE_STEP_REGNODE;
717 #ifdef DEBUGGING
718                     if (stringok)
719                         stop = n;
720 #endif
721                     n = regnext(n);
722                 }
723                 else if (stringok) {
724                     int oldl = STR_LEN(scan);
725                     regnode *nnext = regnext(n);
726
727                     if (oldl + STR_LEN(n) > U8_MAX)
728                         break;
729                     NEXT_OFF(scan) += NEXT_OFF(n);
730                     STR_LEN(scan) += STR_LEN(n);
731                     next = n + NODE_SZ_STR(n);
732                     /* Now we can overwrite *n : */
733                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
734 #ifdef DEBUGGING
735                     stop = next - 1;
736 #endif
737                     n = nnext;
738                 }
739             }
740
741             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
742 /*
743   Two problematic code points in Unicode casefolding of EXACT nodes:
744
745    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
747
748    which casefold to
749
750    Unicode                      UTF-8
751
752    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
753    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
754
755    This means that in case-insensitive matching (or "loose matching",
756    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757    length of the above casefolded versions) can match a target string
758    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759    This would rather mess up the minimum length computation.
760
761    What we'll do is to look for the tail four bytes, and then peek
762    at the preceding two bytes to see whether we need to decrease
763    the minimum length by four (six minus two).
764
765    Thanks to the design of UTF-8, there cannot be false matches:
766    A sequence of valid UTF-8 bytes cannot be a subsequence of
767    another valid sequence of UTF-8 bytes.
768
769 */
770                  char *s0 = STRING(scan), *s, *t;
771                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772                  char *t0 = "\xcc\x88\xcc\x81";
773                  char *t1 = t0 + 3;
774                  
775                  for (s = s0 + 2;
776                       s < s2 && (t = ninstr(s, s1, t0, t1));
777                       s = t + 4) {
778                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
780                            min -= 4;
781                  }
782             }
783
784 #ifdef DEBUGGING
785             /* Allow dumping */
786             n = scan + NODE_SZ_STR(scan);
787             while (n <= stop) {
788                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
789                     OP(n) = OPTIMIZED;
790                     NEXT_OFF(n) = 0;
791                 }
792                 n++;
793             }
794 #endif
795         }
796         /* Follow the next-chain of the current node and optimize
797            away all the NOTHINGs from it.  */
798         if (OP(scan) != CURLYX) {
799             int max = (reg_off_by_arg[OP(scan)]
800                        ? I32_MAX
801                        /* I32 may be smaller than U16 on CRAYs! */
802                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
803             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
804             int noff;
805             regnode *n = scan;
806         
807             /* Skip NOTHING and LONGJMP. */
808             while ((n = regnext(n))
809                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
810                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811                    && off + noff < max)
812                 off += noff;
813             if (reg_off_by_arg[OP(scan)])
814                 ARG(scan) = off;
815             else
816                 NEXT_OFF(scan) = off;
817         }
818         /* The principal pseudo-switch.  Cannot be a switch, since we
819            look into several different things.  */
820         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
821                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822             next = regnext(scan);
823             code = OP(scan);
824         
825             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
826                 I32 max1 = 0, min1 = I32_MAX, num = 0;
827                 struct regnode_charclass_class accum;
828                 
829                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
831                 if (flags & SCF_DO_STCLASS)
832                     cl_init_zero(pRExC_state, &accum);
833                 while (OP(scan) == code) {
834                     I32 deltanext, minnext, f = 0, fake;
835                     struct regnode_charclass_class this_class;
836
837                     num++;
838                     data_fake.flags = 0;
839                     if (data) {         
840                         data_fake.whilem_c = data->whilem_c;
841                         data_fake.last_closep = data->last_closep;
842                     }
843                     else
844                         data_fake.last_closep = &fake;
845                     next = regnext(scan);
846                     scan = NEXTOPER(scan);
847                     if (code != BRANCH)
848                         scan = NEXTOPER(scan);
849                     if (flags & SCF_DO_STCLASS) {
850                         cl_init(pRExC_state, &this_class);
851                         data_fake.start_class = &this_class;
852                         f = SCF_DO_STCLASS_AND;
853                     }           
854                     if (flags & SCF_WHILEM_VISITED_POS)
855                         f |= SCF_WHILEM_VISITED_POS;
856                     /* we suppose the run is continuous, last=next...*/
857                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
858                                           next, &data_fake, f);
859                     if (min1 > minnext)
860                         min1 = minnext;
861                     if (max1 < minnext + deltanext)
862                         max1 = minnext + deltanext;
863                     if (deltanext == I32_MAX)
864                         is_inf = is_inf_internal = 1;
865                     scan = next;
866                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867                         pars++;
868                     if (data && (data_fake.flags & SF_HAS_EVAL))
869                         data->flags |= SF_HAS_EVAL;
870                     if (data)
871                         data->whilem_c = data_fake.whilem_c;
872                     if (flags & SCF_DO_STCLASS)
873                         cl_or(pRExC_state, &accum, &this_class);
874                     if (code == SUSPEND)
875                         break;
876                 }
877                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878                     min1 = 0;
879                 if (flags & SCF_DO_SUBSTR) {
880                     data->pos_min += min1;
881                     data->pos_delta += max1 - min1;
882                     if (max1 != min1 || is_inf)
883                         data->longest = &(data->longest_float);
884                 }
885                 min += min1;
886                 delta += max1 - min1;
887                 if (flags & SCF_DO_STCLASS_OR) {
888                     cl_or(pRExC_state, data->start_class, &accum);
889                     if (min1) {
890                         cl_and(data->start_class, &and_with);
891                         flags &= ~SCF_DO_STCLASS;
892                     }
893                 }
894                 else if (flags & SCF_DO_STCLASS_AND) {
895                     if (min1) {
896                         cl_and(data->start_class, &accum);
897                         flags &= ~SCF_DO_STCLASS;
898                     }
899                     else {
900                         /* Switch to OR mode: cache the old value of
901                          * data->start_class */
902                         StructCopy(data->start_class, &and_with,
903                                    struct regnode_charclass_class);
904                         flags &= ~SCF_DO_STCLASS_AND;
905                         StructCopy(&accum, data->start_class,
906                                    struct regnode_charclass_class);
907                         flags |= SCF_DO_STCLASS_OR;
908                         data->start_class->flags |= ANYOF_EOS;
909                     }
910                 }
911             }
912             else if (code == BRANCHJ)   /* single branch is optimized. */
913                 scan = NEXTOPER(NEXTOPER(scan));
914             else                        /* single branch is optimized. */
915                 scan = NEXTOPER(scan);
916             continue;
917         }
918         else if (OP(scan) == EXACT) {
919             I32 l = STR_LEN(scan);
920             UV uc = *((U8*)STRING(scan));
921             if (UTF) {
922                 U8 *s = (U8*)STRING(scan);
923                 l = utf8_length(s, s + l);
924                 uc = utf8_to_uvchr(s, NULL);
925             }
926             min += l;
927             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
928                 /* The code below prefers earlier match for fixed
929                    offset, later match for variable offset.  */
930                 if (data->last_end == -1) { /* Update the start info. */
931                     data->last_start_min = data->pos_min;
932                     data->last_start_max = is_inf
933                         ? I32_MAX : data->pos_min + data->pos_delta;
934                 }
935                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
936                 if (UTF)
937                     SvUTF8_on(data->last_found);
938                 data->last_end = data->pos_min + l;
939                 data->pos_min += l; /* As in the first entry. */
940                 data->flags &= ~SF_BEFORE_EOL;
941             }
942             if (flags & SCF_DO_STCLASS_AND) {
943                 /* Check whether it is compatible with what we know already! */
944                 int compat = 1;
945
946                 if (uc >= 0x100 ||
947                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
948                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
949                     && (!(data->start_class->flags & ANYOF_FOLD)
950                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951                     )
952                     compat = 0;
953                 ANYOF_CLASS_ZERO(data->start_class);
954                 ANYOF_BITMAP_ZERO(data->start_class);
955                 if (compat)
956                     ANYOF_BITMAP_SET(data->start_class, uc);
957                 data->start_class->flags &= ~ANYOF_EOS;
958                 if (uc < 0x100)
959                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960             }
961             else if (flags & SCF_DO_STCLASS_OR) {
962                 /* false positive possible if the class is case-folded */
963                 if (uc < 0x100)
964                     ANYOF_BITMAP_SET(data->start_class, uc);
965                 else
966                     data->start_class->flags |= ANYOF_UNICODE_ALL;
967                 data->start_class->flags &= ~ANYOF_EOS;
968                 cl_and(data->start_class, &and_with);
969             }
970             flags &= ~SCF_DO_STCLASS;
971         }
972         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
973             I32 l = STR_LEN(scan);
974             UV uc = *((U8*)STRING(scan));
975
976             /* Search for fixed substrings supports EXACT only. */
977             if (flags & SCF_DO_SUBSTR)
978                 scan_commit(pRExC_state, data);
979             if (UTF) {
980                 U8 *s = (U8 *)STRING(scan);
981                 l = utf8_length(s, s + l);
982                 uc = utf8_to_uvchr(s, NULL);
983             }
984             min += l;
985             if (data && (flags & SCF_DO_SUBSTR))
986                 data->pos_min += l;
987             if (flags & SCF_DO_STCLASS_AND) {
988                 /* Check whether it is compatible with what we know already! */
989                 int compat = 1;
990
991                 if (uc >= 0x100 ||
992                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
993                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
994                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995                     compat = 0;
996                 ANYOF_CLASS_ZERO(data->start_class);
997                 ANYOF_BITMAP_ZERO(data->start_class);
998                 if (compat) {
999                     ANYOF_BITMAP_SET(data->start_class, uc);
1000                     data->start_class->flags &= ~ANYOF_EOS;
1001                     data->start_class->flags |= ANYOF_FOLD;
1002                     if (OP(scan) == EXACTFL)
1003                         data->start_class->flags |= ANYOF_LOCALE;
1004                 }
1005             }
1006             else if (flags & SCF_DO_STCLASS_OR) {
1007                 if (data->start_class->flags & ANYOF_FOLD) {
1008                     /* false positive possible if the class is case-folded.
1009                        Assume that the locale settings are the same... */
1010                     if (uc < 0x100)
1011                         ANYOF_BITMAP_SET(data->start_class, uc);
1012                     data->start_class->flags &= ~ANYOF_EOS;
1013                 }
1014                 cl_and(data->start_class, &and_with);
1015             }
1016             flags &= ~SCF_DO_STCLASS;
1017         }
1018         else if (strchr((char*)PL_varies,OP(scan))) {
1019             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1020             I32 f = flags, pos_before = 0;
1021             regnode *oscan = scan;
1022             struct regnode_charclass_class this_class;
1023             struct regnode_charclass_class *oclass = NULL;
1024             I32 next_is_eval = 0;
1025
1026             switch (PL_regkind[(U8)OP(scan)]) {
1027             case WHILEM:                /* End of (?:...)* . */
1028                 scan = NEXTOPER(scan);
1029                 goto finish;
1030             case PLUS:
1031                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1032                     next = NEXTOPER(scan);
1033                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034                         mincount = 1;
1035                         maxcount = REG_INFTY;
1036                         next = regnext(scan);
1037                         scan = NEXTOPER(scan);
1038                         goto do_curly;
1039                     }
1040                 }
1041                 if (flags & SCF_DO_SUBSTR)
1042                     data->pos_min++;
1043                 min++;
1044                 /* Fall through. */
1045             case STAR:
1046                 if (flags & SCF_DO_STCLASS) {
1047                     mincount = 0;
1048                     maxcount = REG_INFTY;
1049                     next = regnext(scan);
1050                     scan = NEXTOPER(scan);
1051                     goto do_curly;
1052                 }
1053                 is_inf = is_inf_internal = 1;
1054                 scan = regnext(scan);
1055                 if (flags & SCF_DO_SUBSTR) {
1056                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1057                     data->longest = &(data->longest_float);
1058                 }
1059                 goto optimize_curly_tail;
1060             case CURLY:
1061                 mincount = ARG1(scan);
1062                 maxcount = ARG2(scan);
1063                 next = regnext(scan);
1064                 if (OP(scan) == CURLYX) {
1065                     I32 lp = (data ? *(data->last_closep) : 0);
1066
1067                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068                 }
1069                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1070                 next_is_eval = (OP(scan) == EVAL);
1071               do_curly:
1072                 if (flags & SCF_DO_SUBSTR) {
1073                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1074                     pos_before = data->pos_min;
1075                 }
1076                 if (data) {
1077                     fl = data->flags;
1078                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079                     if (is_inf)
1080                         data->flags |= SF_IS_INF;
1081                 }
1082                 if (flags & SCF_DO_STCLASS) {
1083                     cl_init(pRExC_state, &this_class);
1084                     oclass = data->start_class;
1085                     data->start_class = &this_class;
1086                     f |= SCF_DO_STCLASS_AND;
1087                     f &= ~SCF_DO_STCLASS_OR;
1088                 }
1089                 /* These are the cases when once a subexpression
1090                    fails at a particular position, it cannot succeed
1091                    even after backtracking at the enclosing scope.
1092                 
1093                    XXXX what if minimal match and we are at the
1094                         initial run of {n,m}? */
1095                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096                     f &= ~SCF_WHILEM_VISITED_POS;
1097
1098                 /* This will finish on WHILEM, setting scan, or on NULL: */
1099                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100                                       mincount == 0
1101                                         ? (f & ~SCF_DO_SUBSTR) : f);
1102
1103                 if (flags & SCF_DO_STCLASS)
1104                     data->start_class = oclass;
1105                 if (mincount == 0 || minnext == 0) {
1106                     if (flags & SCF_DO_STCLASS_OR) {
1107                         cl_or(pRExC_state, data->start_class, &this_class);
1108                     }
1109                     else if (flags & SCF_DO_STCLASS_AND) {
1110                         /* Switch to OR mode: cache the old value of
1111                          * data->start_class */
1112                         StructCopy(data->start_class, &and_with,
1113                                    struct regnode_charclass_class);
1114                         flags &= ~SCF_DO_STCLASS_AND;
1115                         StructCopy(&this_class, data->start_class,
1116                                    struct regnode_charclass_class);
1117                         flags |= SCF_DO_STCLASS_OR;
1118                         data->start_class->flags |= ANYOF_EOS;
1119                     }
1120                 } else {                /* Non-zero len */
1121                     if (flags & SCF_DO_STCLASS_OR) {
1122                         cl_or(pRExC_state, data->start_class, &this_class);
1123                         cl_and(data->start_class, &and_with);
1124                     }
1125                     else if (flags & SCF_DO_STCLASS_AND)
1126                         cl_and(data->start_class, &this_class);
1127                     flags &= ~SCF_DO_STCLASS;
1128                 }
1129                 if (!scan)              /* It was not CURLYX, but CURLY. */
1130                     scan = next;
1131                 if (ckWARN(WARN_REGEXP)
1132                        /* ? quantifier ok, except for (?{ ... }) */
1133                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1134                     && (minnext == 0) && (deltanext == 0)
1135                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1136                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137                 {
1138                     vWARN(RExC_parse,
1139                           "Quantifier unexpected on zero-length expression");
1140                 }
1141
1142                 min += minnext * mincount;
1143                 is_inf_internal |= ((maxcount == REG_INFTY
1144                                      && (minnext + deltanext) > 0)
1145                                     || deltanext == I32_MAX);
1146                 is_inf |= is_inf_internal;
1147                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148
1149                 /* Try powerful optimization CURLYX => CURLYN. */
1150                 if (  OP(oscan) == CURLYX && data
1151                       && data->flags & SF_IN_PAR
1152                       && !(data->flags & SF_HAS_EVAL)
1153                       && !deltanext && minnext == 1 ) {
1154                     /* Try to optimize to CURLYN.  */
1155                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1156                     regnode *nxt1 = nxt;
1157 #ifdef DEBUGGING
1158                     regnode *nxt2;
1159 #endif
1160
1161                     /* Skip open. */
1162                     nxt = regnext(nxt);
1163                     if (!strchr((char*)PL_simple,OP(nxt))
1164                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1165                              && STR_LEN(nxt) == 1))
1166                         goto nogo;
1167 #ifdef DEBUGGING
1168                     nxt2 = nxt;
1169 #endif
1170                     nxt = regnext(nxt);
1171                     if (OP(nxt) != CLOSE)
1172                         goto nogo;
1173                     /* Now we know that nxt2 is the only contents: */
1174                     oscan->flags = (U8)ARG(nxt);
1175                     OP(oscan) = CURLYN;
1176                     OP(nxt1) = NOTHING; /* was OPEN. */
1177 #ifdef DEBUGGING
1178                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1182                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1183                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 #endif
1185                 }
1186               nogo:
1187
1188                 /* Try optimization CURLYX => CURLYM. */
1189                 if (  OP(oscan) == CURLYX && data
1190                       && !(data->flags & SF_HAS_PAR)
1191                       && !(data->flags & SF_HAS_EVAL)
1192                       && !deltanext  ) {
1193                     /* XXXX How to optimize if data == 0? */
1194                     /* Optimize to a simpler form.  */
1195                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196                     regnode *nxt2;
1197
1198                     OP(oscan) = CURLYM;
1199                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1200                             && (OP(nxt2) != WHILEM))
1201                         nxt = nxt2;
1202                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1203                     /* Need to optimize away parenths. */
1204                     if (data->flags & SF_IN_PAR) {
1205                         /* Set the parenth number.  */
1206                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207
1208                         if (OP(nxt) != CLOSE)
1209                             FAIL("Panic opt close");
1210                         oscan->flags = (U8)ARG(nxt);
1211                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1212                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1213 #ifdef DEBUGGING
1214                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1216                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 #endif
1219 #if 0
1220                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221                             regnode *nnxt = regnext(nxt1);
1222                         
1223                             if (nnxt == nxt) {
1224                                 if (reg_off_by_arg[OP(nxt1)])
1225                                     ARG_SET(nxt1, nxt2 - nxt1);
1226                                 else if (nxt2 - nxt1 < U16_MAX)
1227                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1228                                 else
1229                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1230                             }
1231                             nxt1 = nnxt;
1232                         }
1233 #endif
1234                         /* Optimize again: */
1235                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1236                                     NULL, 0);
1237                     }
1238                     else
1239                         oscan->flags = 0;
1240                 }
1241                 else if ((OP(oscan) == CURLYX)
1242                          && (flags & SCF_WHILEM_VISITED_POS)
1243                          /* See the comment on a similar expression above.
1244                             However, this time it not a subexpression
1245                             we care about, but the expression itself. */
1246                          && (maxcount == REG_INFTY)
1247                          && data && ++data->whilem_c < 16) {
1248                     /* This stays as CURLYX, we can put the count/of pair. */
1249                     /* Find WHILEM (as in regexec.c) */
1250                     regnode *nxt = oscan + NEXT_OFF(oscan);
1251
1252                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253                         nxt += ARG(nxt);
1254                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1255                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1256                 }
1257                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1258                     pars++;
1259                 if (flags & SCF_DO_SUBSTR) {
1260                     SV *last_str = Nullsv;
1261                     int counted = mincount != 0;
1262
1263                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1264 #if defined(SPARC64_GCC_WORKAROUND)
1265                         I32 b = 0;
1266                         STRLEN l = 0;
1267                         char *s = NULL;
1268                         I32 old = 0;
1269
1270                         if (pos_before >= data->last_start_min)
1271                             b = pos_before;
1272                         else
1273                             b = data->last_start_min;
1274
1275                         l = 0;
1276                         s = SvPV(data->last_found, l);
1277                         old = b - data->last_start_min;
1278
1279 #else
1280                         I32 b = pos_before >= data->last_start_min
1281                             ? pos_before : data->last_start_min;
1282                         STRLEN l;
1283                         char *s = SvPV(data->last_found, l);
1284                         I32 old = b - data->last_start_min;
1285 #endif
1286
1287                         if (UTF)
1288                             old = utf8_hop((U8*)s, old) - (U8*)s;
1289                         
1290                         l -= old;
1291                         /* Get the added string: */
1292                         last_str = newSVpvn(s  + old, l);
1293                         if (deltanext == 0 && pos_before == b) {
1294                             /* What was added is a constant string */
1295                             if (mincount > 1) {
1296                                 SvGROW(last_str, (mincount * l) + 1);
1297                                 repeatcpy(SvPVX(last_str) + l,
1298                                           SvPVX(last_str), l, mincount - 1);
1299                                 SvCUR(last_str) *= mincount;
1300                                 /* Add additional parts. */
1301                                 SvCUR_set(data->last_found,
1302                                           SvCUR(data->last_found) - l);
1303                                 sv_catsv(data->last_found, last_str);
1304                                 data->last_end += l * (mincount - 1);
1305                             }
1306                         } else {
1307                             /* start offset must point into the last copy */
1308                             data->last_start_min += minnext * (mincount - 1);
1309                             data->last_start_max += is_inf ? 0 : (maxcount - 1)
1310                                 * (minnext + data->pos_delta);
1311                         }
1312                     }
1313                     /* It is counted once already... */
1314                     data->pos_min += minnext * (mincount - counted);
1315                     data->pos_delta += - counted * deltanext +
1316                         (minnext + deltanext) * maxcount - minnext * mincount;
1317                     if (mincount != maxcount) {
1318                          /* Cannot extend fixed substrings found inside
1319                             the group.  */
1320                         scan_commit(pRExC_state,data);
1321                         if (mincount && last_str) {
1322                             sv_setsv(data->last_found, last_str);
1323                             data->last_end = data->pos_min;
1324                             data->last_start_min =
1325                                 data->pos_min - CHR_SVLEN(last_str);
1326                             data->last_start_max = is_inf
1327                                 ? I32_MAX
1328                                 : data->pos_min + data->pos_delta
1329                                 - CHR_SVLEN(last_str);
1330                         }
1331                         data->longest = &(data->longest_float);
1332                     }
1333                     SvREFCNT_dec(last_str);
1334                 }
1335                 if (data && (fl & SF_HAS_EVAL))
1336                     data->flags |= SF_HAS_EVAL;
1337               optimize_curly_tail:
1338                 if (OP(oscan) != CURLYX) {
1339                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1340                            && NEXT_OFF(next))
1341                         NEXT_OFF(oscan) += NEXT_OFF(next);
1342                 }
1343                 continue;
1344             default:                    /* REF and CLUMP only? */
1345                 if (flags & SCF_DO_SUBSTR) {
1346                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1347                     data->longest = &(data->longest_float);
1348                 }
1349                 is_inf = is_inf_internal = 1;
1350                 if (flags & SCF_DO_STCLASS_OR)
1351                     cl_anything(pRExC_state, data->start_class);
1352                 flags &= ~SCF_DO_STCLASS;
1353                 break;
1354             }
1355         }
1356         else if (strchr((char*)PL_simple,OP(scan))) {
1357             int value = 0;
1358
1359             if (flags & SCF_DO_SUBSTR) {
1360                 scan_commit(pRExC_state,data);
1361                 data->pos_min++;
1362             }
1363             min++;
1364             if (flags & SCF_DO_STCLASS) {
1365                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1366
1367                 /* Some of the logic below assumes that switching
1368                    locale on will only add false positives. */
1369                 switch (PL_regkind[(U8)OP(scan)]) {
1370                 case SANY:
1371                 default:
1372                   do_default:
1373                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1375                         cl_anything(pRExC_state, data->start_class);
1376                     break;
1377                 case REG_ANY:
1378                     if (OP(scan) == SANY)
1379                         goto do_default;
1380                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382                                  || (data->start_class->flags & ANYOF_CLASS));
1383                         cl_anything(pRExC_state, data->start_class);
1384                     }
1385                     if (flags & SCF_DO_STCLASS_AND || !value)
1386                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387                     break;
1388                 case ANYOF:
1389                     if (flags & SCF_DO_STCLASS_AND)
1390                         cl_and(data->start_class,
1391                                (struct regnode_charclass_class*)scan);
1392                     else
1393                         cl_or(pRExC_state, data->start_class,
1394                               (struct regnode_charclass_class*)scan);
1395                     break;
1396                 case ALNUM:
1397                     if (flags & SCF_DO_STCLASS_AND) {
1398                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400                             for (value = 0; value < 256; value++)
1401                                 if (!isALNUM(value))
1402                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1403                         }
1404                     }
1405                     else {
1406                         if (data->start_class->flags & ANYOF_LOCALE)
1407                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1408                         else {
1409                             for (value = 0; value < 256; value++)
1410                                 if (isALNUM(value))
1411                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1412                         }
1413                     }
1414                     break;
1415                 case ALNUML:
1416                     if (flags & SCF_DO_STCLASS_AND) {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419                     }
1420                     else {
1421                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422                         data->start_class->flags |= ANYOF_LOCALE;
1423                     }
1424                     break;
1425                 case NALNUM:
1426                     if (flags & SCF_DO_STCLASS_AND) {
1427                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429                             for (value = 0; value < 256; value++)
1430                                 if (isALNUM(value))
1431                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1432                         }
1433                     }
1434                     else {
1435                         if (data->start_class->flags & ANYOF_LOCALE)
1436                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437                         else {
1438                             for (value = 0; value < 256; value++)
1439                                 if (!isALNUM(value))
1440                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1441                         }
1442                     }
1443                     break;
1444                 case NALNUML:
1445                     if (flags & SCF_DO_STCLASS_AND) {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448                     }
1449                     else {
1450                         data->start_class->flags |= ANYOF_LOCALE;
1451                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452                     }
1453                     break;
1454                 case SPACE:
1455                     if (flags & SCF_DO_STCLASS_AND) {
1456                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458                             for (value = 0; value < 256; value++)
1459                                 if (!isSPACE(value))
1460                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1461                         }
1462                     }
1463                     else {
1464                         if (data->start_class->flags & ANYOF_LOCALE)
1465                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466                         else {
1467                             for (value = 0; value < 256; value++)
1468                                 if (isSPACE(value))
1469                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1470                         }
1471                     }
1472                     break;
1473                 case SPACEL:
1474                     if (flags & SCF_DO_STCLASS_AND) {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477                     }
1478                     else {
1479                         data->start_class->flags |= ANYOF_LOCALE;
1480                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481                     }
1482                     break;
1483                 case NSPACE:
1484                     if (flags & SCF_DO_STCLASS_AND) {
1485                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487                             for (value = 0; value < 256; value++)
1488                                 if (isSPACE(value))
1489                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1490                         }
1491                     }
1492                     else {
1493                         if (data->start_class->flags & ANYOF_LOCALE)
1494                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1495                         else {
1496                             for (value = 0; value < 256; value++)
1497                                 if (!isSPACE(value))
1498                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1499                         }
1500                     }
1501                     break;
1502                 case NSPACEL:
1503                     if (flags & SCF_DO_STCLASS_AND) {
1504                         if (data->start_class->flags & ANYOF_LOCALE) {
1505                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506                             for (value = 0; value < 256; value++)
1507                                 if (!isSPACE(value))
1508                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1509                         }
1510                     }
1511                     else {
1512                         data->start_class->flags |= ANYOF_LOCALE;
1513                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514                     }
1515                     break;
1516                 case DIGIT:
1517                     if (flags & SCF_DO_STCLASS_AND) {
1518                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519                         for (value = 0; value < 256; value++)
1520                             if (!isDIGIT(value))
1521                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522                     }
1523                     else {
1524                         if (data->start_class->flags & ANYOF_LOCALE)
1525                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1526                         else {
1527                             for (value = 0; value < 256; value++)
1528                                 if (isDIGIT(value))
1529                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1530                         }
1531                     }
1532                     break;
1533                 case NDIGIT:
1534                     if (flags & SCF_DO_STCLASS_AND) {
1535                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536                         for (value = 0; value < 256; value++)
1537                             if (isDIGIT(value))
1538                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539                     }
1540                     else {
1541                         if (data->start_class->flags & ANYOF_LOCALE)
1542                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1543                         else {
1544                             for (value = 0; value < 256; value++)
1545                                 if (!isDIGIT(value))
1546                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1547                         }
1548                     }
1549                     break;
1550                 }
1551                 if (flags & SCF_DO_STCLASS_OR)
1552                     cl_and(data->start_class, &and_with);
1553                 flags &= ~SCF_DO_STCLASS;
1554             }
1555         }
1556         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1557             data->flags |= (OP(scan) == MEOL
1558                             ? SF_BEFORE_MEOL
1559                             : SF_BEFORE_SEOL);
1560         }
1561         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1562                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1563                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1564                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1565             /* Lookahead/lookbehind */
1566             I32 deltanext, minnext, fake = 0;
1567             regnode *nscan;
1568             struct regnode_charclass_class intrnl;
1569             int f = 0;
1570
1571             data_fake.flags = 0;
1572             if (data) {         
1573                 data_fake.whilem_c = data->whilem_c;
1574                 data_fake.last_closep = data->last_closep;
1575             }
1576             else
1577                 data_fake.last_closep = &fake;
1578             if ( flags & SCF_DO_STCLASS && !scan->flags
1579                  && OP(scan) == IFMATCH ) { /* Lookahead */
1580                 cl_init(pRExC_state, &intrnl);
1581                 data_fake.start_class = &intrnl;
1582                 f |= SCF_DO_STCLASS_AND;
1583             }
1584             if (flags & SCF_WHILEM_VISITED_POS)
1585                 f |= SCF_WHILEM_VISITED_POS;
1586             next = regnext(scan);
1587             nscan = NEXTOPER(NEXTOPER(scan));
1588             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589             if (scan->flags) {
1590                 if (deltanext) {
1591                     vFAIL("Variable length lookbehind not implemented");
1592                 }
1593                 else if (minnext > U8_MAX) {
1594                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1595                 }
1596                 scan->flags = (U8)minnext;
1597             }
1598             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1599                 pars++;
1600             if (data && (data_fake.flags & SF_HAS_EVAL))
1601                 data->flags |= SF_HAS_EVAL;
1602             if (data)
1603                 data->whilem_c = data_fake.whilem_c;
1604             if (f & SCF_DO_STCLASS_AND) {
1605                 int was = (data->start_class->flags & ANYOF_EOS);
1606
1607                 cl_and(data->start_class, &intrnl);
1608                 if (was)
1609                     data->start_class->flags |= ANYOF_EOS;
1610             }
1611         }
1612         else if (OP(scan) == OPEN) {
1613             pars++;
1614         }
1615         else if (OP(scan) == CLOSE) {
1616             if ((I32)ARG(scan) == is_par) {
1617                 next = regnext(scan);
1618
1619                 if ( next && (OP(next) != WHILEM) && next < last)
1620                     is_par = 0;         /* Disable optimization */
1621             }
1622             if (data)
1623                 *(data->last_closep) = ARG(scan);
1624         }
1625         else if (OP(scan) == EVAL) {
1626                 if (data)
1627                     data->flags |= SF_HAS_EVAL;
1628         }
1629         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1630                 if (flags & SCF_DO_SUBSTR) {
1631                     scan_commit(pRExC_state,data);
1632                     data->longest = &(data->longest_float);
1633                 }
1634                 is_inf = is_inf_internal = 1;
1635                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1636                     cl_anything(pRExC_state, data->start_class);
1637                 flags &= ~SCF_DO_STCLASS;
1638         }
1639         /* Else: zero-length, ignore. */
1640         scan = regnext(scan);
1641     }
1642
1643   finish:
1644     *scanp = scan;
1645     *deltap = is_inf_internal ? I32_MAX : delta;
1646     if (flags & SCF_DO_SUBSTR && is_inf)
1647         data->pos_delta = I32_MAX - data->pos_min;
1648     if (is_par > U8_MAX)
1649         is_par = 0;
1650     if (is_par && pars==1 && data) {
1651         data->flags |= SF_IN_PAR;
1652         data->flags &= ~SF_HAS_PAR;
1653     }
1654     else if (pars && data) {
1655         data->flags |= SF_HAS_PAR;
1656         data->flags &= ~SF_IN_PAR;
1657     }
1658     if (flags & SCF_DO_STCLASS_OR)
1659         cl_and(data->start_class, &and_with);
1660     return min;
1661 }
1662
1663 STATIC I32
1664 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1665 {
1666     if (RExC_rx->data) {
1667         Renewc(RExC_rx->data,
1668                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1669                char, struct reg_data);
1670         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671         RExC_rx->data->count += n;
1672     }
1673     else {
1674         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1675              char, struct reg_data);
1676         New(1208, RExC_rx->data->what, n, U8);
1677         RExC_rx->data->count = n;
1678     }
1679     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680     return RExC_rx->data->count - n;
1681 }
1682
1683 void
1684 Perl_reginitcolors(pTHX)
1685 {
1686     int i = 0;
1687     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688         
1689     if (s) {
1690         PL_colors[0] = s = savepv(s);
1691         while (++i < 6) {
1692             s = strchr(s, '\t');
1693             if (s) {
1694                 *s = '\0';
1695                 PL_colors[i] = ++s;
1696             }
1697             else
1698                 PL_colors[i] = s = "";
1699         }
1700     } else {
1701         while (i < 6)
1702             PL_colors[i++] = "";
1703     }
1704     PL_colorset = 1;
1705 }
1706
1707
1708 /*
1709  - pregcomp - compile a regular expression into internal code
1710  *
1711  * We can't allocate space until we know how big the compiled form will be,
1712  * but we can't compile it (and thus know how big it is) until we've got a
1713  * place to put the code.  So we cheat:  we compile it twice, once with code
1714  * generation turned off and size counting turned on, and once "for real".
1715  * This also means that we don't allocate space until we are sure that the
1716  * thing really will compile successfully, and we never have to move the
1717  * code and thus invalidate pointers into it.  (Note that it has to be in
1718  * one piece because free() must be able to free it all.) [NB: not true in perl]
1719  *
1720  * Beware that the optimization-preparation code in here knows about some
1721  * of the structure of the compiled regexp.  [I'll say.]
1722  */
1723 regexp *
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1725 {
1726     register regexp *r;
1727     regnode *scan;
1728     regnode *first;
1729     I32 flags;
1730     I32 minlen = 0;
1731     I32 sawplus = 0;
1732     I32 sawopen = 0;
1733     scan_data_t data;
1734     RExC_state_t RExC_state;
1735     RExC_state_t *pRExC_state = &RExC_state;
1736
1737     if (exp == NULL)
1738         FAIL("NULL regexp argument");
1739
1740     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1741
1742     RExC_precomp = exp;
1743     DEBUG_r({
1744          if (!PL_colorset) reginitcolors();
1745          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746                        PL_colors[4],PL_colors[5],PL_colors[0],
1747                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748     });
1749     RExC_flags = pm->op_pmflags;
1750     RExC_sawback = 0;
1751
1752     RExC_seen = 0;
1753     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754     RExC_seen_evals = 0;
1755     RExC_extralen = 0;
1756
1757     /* First pass: determine size, legality. */
1758     RExC_parse = exp;
1759     RExC_start = exp;
1760     RExC_end = xend;
1761     RExC_naughty = 0;
1762     RExC_npar = 1;
1763     RExC_size = 0L;
1764     RExC_emit = &PL_regdummy;
1765     RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767        * Clever compilers notice this and complain. --jhi */
1768     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1769 #endif
1770     if (reg(pRExC_state, 0, &flags) == NULL) {
1771         RExC_precomp = Nullch;
1772         return(NULL);
1773     }
1774     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1775
1776     /* Small enough for pointer-storage convention?
1777        If extralen==0, this means that we will not need long jumps. */
1778     if (RExC_size >= 0x10000L && RExC_extralen)
1779         RExC_size += RExC_extralen;
1780     else
1781         RExC_extralen = 0;
1782     if (RExC_whilem_seen > 15)
1783         RExC_whilem_seen = 15;
1784
1785     /* Allocate space and initialize. */
1786     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787          char, regexp);
1788     if (r == NULL)
1789         FAIL("Regexp out of space");
1790
1791 #ifdef DEBUGGING
1792     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 #endif
1795     r->refcnt = 1;
1796     r->prelen = xend - exp;
1797     r->precomp = savepvn(RExC_precomp, r->prelen);
1798     r->subbeg = NULL;
1799     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1801
1802     r->substrs = 0;                     /* Useful during FAIL. */
1803     r->startp = 0;                      /* Useful during FAIL. */
1804     r->endp = 0;                        /* Useful during FAIL. */
1805
1806     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1807     if (r->offsets) {
1808       r->offsets[0] = RExC_size; 
1809     }
1810     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1811                           "%s %"UVuf" bytes for offset annotations.\n", 
1812                           r->offsets ? "Got" : "Couldn't get", 
1813                           (UV)((2*RExC_size+1) * sizeof(U32))));
1814
1815     RExC_rx = r;
1816
1817     /* Second pass: emit code. */
1818     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1819     RExC_parse = exp;
1820     RExC_end = xend;
1821     RExC_naughty = 0;
1822     RExC_npar = 1;
1823     RExC_emit_start = r->program;
1824     RExC_emit = r->program;
1825     /* Store the count of eval-groups for security checks: */
1826     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1827     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1828     r->data = 0;
1829     if (reg(pRExC_state, 0, &flags) == NULL)
1830         return(NULL);
1831
1832     /* Dig out information for optimizations. */
1833     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1834     pm->op_pmflags = RExC_flags;
1835     if (UTF)
1836         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1837     r->regstclass = NULL;
1838     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1839         r->reganch |= ROPT_NAUGHTY;
1840     scan = r->program + 1;              /* First BRANCH. */
1841
1842     /* XXXX To minimize changes to RE engine we always allocate
1843        3-units-long substrs field. */
1844     Newz(1004, r->substrs, 1, struct reg_substr_data);
1845
1846     StructCopy(&zero_scan_data, &data, scan_data_t);
1847     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1848     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1849         I32 fake;
1850         STRLEN longest_float_length, longest_fixed_length;
1851         struct regnode_charclass_class ch_class;
1852         int stclass_flag;
1853         I32 last_close = 0;
1854
1855         first = scan;
1856         /* Skip introductions and multiplicators >= 1. */
1857         while ((OP(first) == OPEN && (sawopen = 1)) ||
1858                /* An OR of *one* alternative - should not happen now. */
1859             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860             (OP(first) == PLUS) ||
1861             (OP(first) == MINMOD) ||
1862                /* An {n,m} with n>0 */
1863             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1864                 if (OP(first) == PLUS)
1865                     sawplus = 1;
1866                 else
1867                     first += regarglen[(U8)OP(first)];
1868                 first = NEXTOPER(first);
1869         }
1870
1871         /* Starting-point info. */
1872       again:
1873         if (PL_regkind[(U8)OP(first)] == EXACT) {
1874             if (OP(first) == EXACT)
1875                 ;       /* Empty, get anchored substr later. */
1876             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1877                 r->regstclass = first;
1878         }
1879         else if (strchr((char*)PL_simple,OP(first)))
1880             r->regstclass = first;
1881         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882                  PL_regkind[(U8)OP(first)] == NBOUND)
1883             r->regstclass = first;
1884         else if (PL_regkind[(U8)OP(first)] == BOL) {
1885             r->reganch |= (OP(first) == MBOL
1886                            ? ROPT_ANCH_MBOL
1887                            : (OP(first) == SBOL
1888                               ? ROPT_ANCH_SBOL
1889                               : ROPT_ANCH_BOL));
1890             first = NEXTOPER(first);
1891             goto again;
1892         }
1893         else if (OP(first) == GPOS) {
1894             r->reganch |= ROPT_ANCH_GPOS;
1895             first = NEXTOPER(first);
1896             goto again;
1897         }
1898         else if (!sawopen && (OP(first) == STAR &&
1899             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1900             !(r->reganch & ROPT_ANCH) )
1901         {
1902             /* turn .* into ^.* with an implied $*=1 */
1903             int type = OP(NEXTOPER(first));
1904
1905             if (type == REG_ANY)
1906                 type = ROPT_ANCH_MBOL;
1907             else
1908                 type = ROPT_ANCH_SBOL;
1909
1910             r->reganch |= type | ROPT_IMPLICIT;
1911             first = NEXTOPER(first);
1912             goto again;
1913         }
1914         if (sawplus && (!sawopen || !RExC_sawback)
1915             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1916             /* x+ must match at the 1st pos of run of x's */
1917             r->reganch |= ROPT_SKIP;
1918
1919         /* Scan is after the zeroth branch, first is atomic matcher. */
1920         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1921                               (IV)(first - scan + 1)));
1922         /*
1923         * If there's something expensive in the r.e., find the
1924         * longest literal string that must appear and make it the
1925         * regmust.  Resolve ties in favor of later strings, since
1926         * the regstart check works with the beginning of the r.e.
1927         * and avoiding duplication strengthens checking.  Not a
1928         * strong reason, but sufficient in the absence of others.
1929         * [Now we resolve ties in favor of the earlier string if
1930         * it happens that c_offset_min has been invalidated, since the
1931         * earlier string may buy us something the later one won't.]
1932         */
1933         minlen = 0;
1934
1935         data.longest_fixed = newSVpvn("",0);
1936         data.longest_float = newSVpvn("",0);
1937         data.last_found = newSVpvn("",0);
1938         data.longest = &(data.longest_fixed);
1939         first = scan;
1940         if (!r->regstclass) {
1941             cl_init(pRExC_state, &ch_class);
1942             data.start_class = &ch_class;
1943             stclass_flag = SCF_DO_STCLASS_AND;
1944         } else                          /* XXXX Check for BOUND? */
1945             stclass_flag = 0;
1946         data.last_closep = &last_close;
1947
1948         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1949                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1950         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1951              && data.last_start_min == 0 && data.last_end > 0
1952              && !RExC_seen_zerolen
1953              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1954             r->reganch |= ROPT_CHECK_ALL;
1955         scan_commit(pRExC_state, &data);
1956         SvREFCNT_dec(data.last_found);
1957
1958         longest_float_length = CHR_SVLEN(data.longest_float);
1959         if (longest_float_length
1960             || (data.flags & SF_FL_BEFORE_EOL
1961                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1962                     || (RExC_flags & PMf_MULTILINE)))) {
1963             int t;
1964
1965             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1966                 && data.offset_fixed == data.offset_float_min
1967                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968                     goto remove_float;          /* As in (a)+. */
1969
1970             if (SvUTF8(data.longest_float)) {
1971                 r->float_utf8 = data.longest_float;
1972                 r->float_substr = Nullsv;
1973             } else {
1974                 r->float_substr = data.longest_float;
1975                 r->float_utf8 = Nullsv;
1976             }
1977             r->float_min_offset = data.offset_float_min;
1978             r->float_max_offset = data.offset_float_max;
1979             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1981                            || (RExC_flags & PMf_MULTILINE)));
1982             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1983         }
1984         else {
1985           remove_float:
1986             r->float_substr = r->float_utf8 = Nullsv;
1987             SvREFCNT_dec(data.longest_float);
1988             longest_float_length = 0;
1989         }
1990
1991         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1992         if (longest_fixed_length
1993             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1995                     || (RExC_flags & PMf_MULTILINE)))) {
1996             int t;
1997
1998             if (SvUTF8(data.longest_fixed)) {
1999                 r->anchored_utf8 = data.longest_fixed;
2000                 r->anchored_substr = Nullsv;
2001             } else {
2002                 r->anchored_substr = data.longest_fixed;
2003                 r->anchored_utf8 = Nullsv;
2004             }
2005             r->anchored_offset = data.offset_fixed;
2006             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2008                      || (RExC_flags & PMf_MULTILINE)));
2009             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010         }
2011         else {
2012             r->anchored_substr = r->anchored_utf8 = Nullsv;
2013             SvREFCNT_dec(data.longest_fixed);
2014             longest_fixed_length = 0;
2015         }
2016         if (r->regstclass
2017             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2018             r->regstclass = NULL;
2019         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020             && stclass_flag
2021             && !(data.start_class->flags & ANYOF_EOS)
2022             && !cl_is_anything(data.start_class))
2023         {
2024             I32 n = add_data(pRExC_state, 1, "f");
2025
2026             New(1006, RExC_rx->data->data[n], 1,
2027                 struct regnode_charclass_class);
2028             StructCopy(data.start_class,
2029                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2030                        struct regnode_charclass_class);
2031             r->regstclass = (regnode*)RExC_rx->data->data[n];
2032             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2033             PL_regdata = r->data; /* for regprop() */
2034             DEBUG_r({ SV *sv = sv_newmortal();
2035                       regprop(sv, (regnode*)data.start_class);
2036                       PerlIO_printf(Perl_debug_log,
2037                                     "synthetic stclass `%s'.\n",
2038                                     SvPVX(sv));});
2039         }
2040
2041         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2042         if (longest_fixed_length > longest_float_length) {
2043             r->check_substr = r->anchored_substr;
2044             r->check_utf8 = r->anchored_utf8;
2045             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2046             if (r->reganch & ROPT_ANCH_SINGLE)
2047                 r->reganch |= ROPT_NOSCAN;
2048         }
2049         else {
2050             r->check_substr = r->float_substr;
2051             r->check_utf8 = r->float_utf8;
2052             r->check_offset_min = data.offset_float_min;
2053             r->check_offset_max = data.offset_float_max;
2054         }
2055         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2056            This should be changed ASAP!  */
2057         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2058             r->reganch |= RE_USE_INTUIT;
2059             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2060                 r->reganch |= RE_INTUIT_TAIL;
2061         }
2062     }
2063     else {
2064         /* Several toplevels. Best we can is to set minlen. */
2065         I32 fake;
2066         struct regnode_charclass_class ch_class;
2067         I32 last_close = 0;
2068         
2069         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2070         scan = r->program + 1;
2071         cl_init(pRExC_state, &ch_class);
2072         data.start_class = &ch_class;
2073         data.last_closep = &last_close;
2074         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2075         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2076                 = r->float_substr = r->float_utf8 = Nullsv;
2077         if (!(data.start_class->flags & ANYOF_EOS)
2078             && !cl_is_anything(data.start_class))
2079         {
2080             I32 n = add_data(pRExC_state, 1, "f");
2081
2082             New(1006, RExC_rx->data->data[n], 1,
2083                 struct regnode_charclass_class);
2084             StructCopy(data.start_class,
2085                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2086                        struct regnode_charclass_class);
2087             r->regstclass = (regnode*)RExC_rx->data->data[n];
2088             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2089             DEBUG_r({ SV* sv = sv_newmortal();
2090                       regprop(sv, (regnode*)data.start_class);
2091                       PerlIO_printf(Perl_debug_log,
2092                                     "synthetic stclass `%s'.\n",
2093                                     SvPVX(sv));});
2094         }
2095     }
2096
2097     r->minlen = minlen;
2098     if (RExC_seen & REG_SEEN_GPOS)
2099         r->reganch |= ROPT_GPOS_SEEN;
2100     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2101         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2102     if (RExC_seen & REG_SEEN_EVAL)
2103         r->reganch |= ROPT_EVAL_SEEN;
2104     if (RExC_seen & REG_SEEN_CANY)
2105         r->reganch |= ROPT_CANY_SEEN;
2106     Newz(1002, r->startp, RExC_npar, I32);
2107     Newz(1002, r->endp, RExC_npar, I32);
2108     PL_regdata = r->data; /* for regprop() */
2109     DEBUG_r(regdump(r));
2110     return(r);
2111 }
2112
2113 /*
2114  - reg - regular expression, i.e. main body or parenthesized thing
2115  *
2116  * Caller must absorb opening parenthesis.
2117  *
2118  * Combining parenthesis handling with the base level of regular expression
2119  * is a trifle forced, but the need to tie the tails of the branches to what
2120  * follows makes it hard to avoid.
2121  */
2122 STATIC regnode *
2123 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2124     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2125 {
2126     register regnode *ret;              /* Will be the head of the group. */
2127     register regnode *br;
2128     register regnode *lastbr;
2129     register regnode *ender = 0;
2130     register I32 parno = 0;
2131     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2132
2133     /* for (?g), (?gc), and (?o) warnings; warning
2134        about (?c) will warn about (?g) -- japhy    */
2135
2136     I32 wastedflags = 0x00,
2137         wasted_o    = 0x01,
2138         wasted_g    = 0x02,
2139         wasted_gc   = 0x02 | 0x04,
2140         wasted_c    = 0x04;
2141
2142     char * parse_start = RExC_parse; /* MJD */
2143     char *oregcomp_parse = RExC_parse;
2144     char c;
2145
2146     *flagp = 0;                         /* Tentatively. */
2147
2148
2149     /* Make an OPEN node, if parenthesized. */
2150     if (paren) {
2151         if (*RExC_parse == '?') { /* (?...) */
2152             U32 posflags = 0, negflags = 0;
2153             U32 *flagsp = &posflags;
2154             int logical = 0;
2155             char *seqstart = RExC_parse;
2156
2157             RExC_parse++;
2158             paren = *RExC_parse++;
2159             ret = NULL;                 /* For look-ahead/behind. */
2160             switch (paren) {
2161             case '<':           /* (?<...) */
2162                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2163                 if (*RExC_parse == '!')
2164                     paren = ',';
2165                 if (*RExC_parse != '=' && *RExC_parse != '!')
2166                     goto unknown;
2167                 RExC_parse++;
2168             case '=':           /* (?=...) */
2169             case '!':           /* (?!...) */
2170                 RExC_seen_zerolen++;
2171             case ':':           /* (?:...) */
2172             case '>':           /* (?>...) */
2173                 break;
2174             case '$':           /* (?$...) */
2175             case '@':           /* (?@...) */
2176                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2177                 break;
2178             case '#':           /* (?#...) */
2179                 while (*RExC_parse && *RExC_parse != ')')
2180                     RExC_parse++;
2181                 if (*RExC_parse != ')')
2182                     FAIL("Sequence (?#... not terminated");
2183                 nextchar(pRExC_state);
2184                 *flagp = TRYAGAIN;
2185                 return NULL;
2186             case 'p':           /* (?p...) */
2187                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2188                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2189                 /* FALL THROUGH*/
2190             case '?':           /* (??...) */
2191                 logical = 1;
2192                 if (*RExC_parse != '{')
2193                     goto unknown;
2194                 paren = *RExC_parse++;
2195                 /* FALL THROUGH */
2196             case '{':           /* (?{...}) */
2197             {
2198                 I32 count = 1, n = 0;
2199                 char c;
2200                 char *s = RExC_parse;
2201                 SV *sv;
2202                 OP_4tree *sop, *rop;
2203
2204                 RExC_seen_zerolen++;
2205                 RExC_seen |= REG_SEEN_EVAL;
2206                 while (count && (c = *RExC_parse)) {
2207                     if (c == '\\' && RExC_parse[1])
2208                         RExC_parse++;
2209                     else if (c == '{')
2210                         count++;
2211                     else if (c == '}')
2212                         count--;
2213                     RExC_parse++;
2214                 }
2215                 if (*RExC_parse != ')')
2216                 {
2217                     RExC_parse = s;             
2218                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219                 }
2220                 if (!SIZE_ONLY) {
2221                     AV *av;
2222                 
2223                     if (RExC_parse - 1 - s)
2224                         sv = newSVpvn(s, RExC_parse - 1 - s);
2225                     else
2226                         sv = newSVpvn("", 0);
2227
2228                     ENTER;
2229                     Perl_save_re_context(aTHX);
2230                     rop = sv_compile_2op(sv, &sop, "re", &av);
2231                     sop->op_private |= OPpREFCOUNTED;
2232                     /* re_dup will OpREFCNT_inc */
2233                     OpREFCNT_set(sop, 1);
2234                     LEAVE;
2235
2236                     n = add_data(pRExC_state, 3, "nop");
2237                     RExC_rx->data->data[n] = (void*)rop;
2238                     RExC_rx->data->data[n+1] = (void*)sop;
2239                     RExC_rx->data->data[n+2] = (void*)av;
2240                     SvREFCNT_dec(sv);
2241                 }
2242                 else {                                          /* First pass */
2243                     if (PL_reginterp_cnt < ++RExC_seen_evals
2244                         && PL_curcop != &PL_compiling)
2245                         /* No compiled RE interpolated, has runtime
2246                            components ===> unsafe.  */
2247                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2248                     if (PL_tainting && PL_tainted)
2249                         FAIL("Eval-group in insecure regular expression");
2250                 }
2251                 
2252                 nextchar(pRExC_state);
2253                 if (logical) {
2254                     ret = reg_node(pRExC_state, LOGICAL);
2255                     if (!SIZE_ONLY)
2256                         ret->flags = 2;
2257                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2258                     /* deal with the length of this later - MJD */
2259                     return ret;
2260                 }
2261                 return reganode(pRExC_state, EVAL, n);
2262             }
2263             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2264             {
2265                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2266                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2267                         || RExC_parse[1] == '<'
2268                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2269                         I32 flag;
2270                         
2271                         ret = reg_node(pRExC_state, LOGICAL);
2272                         if (!SIZE_ONLY)
2273                             ret->flags = 1;
2274                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2275                         goto insert_if;
2276                     }
2277                 }
2278                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2279                     /* (?(1)...) */
2280                     parno = atoi(RExC_parse++);
2281
2282                     while (isDIGIT(*RExC_parse))
2283                         RExC_parse++;
2284                     ret = reganode(pRExC_state, GROUPP, parno);
2285                     
2286                     if ((c = *nextchar(pRExC_state)) != ')')
2287                         vFAIL("Switch condition not recognized");
2288                   insert_if:
2289                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2290                     br = regbranch(pRExC_state, &flags, 1);
2291                     if (br == NULL)
2292                         br = reganode(pRExC_state, LONGJMP, 0);
2293                     else
2294                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2295                     c = *nextchar(pRExC_state);
2296                     if (flags&HASWIDTH)
2297                         *flagp |= HASWIDTH;
2298                     if (c == '|') {
2299                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2300                         regbranch(pRExC_state, &flags, 1);
2301                         regtail(pRExC_state, ret, lastbr);
2302                         if (flags&HASWIDTH)
2303                             *flagp |= HASWIDTH;
2304                         c = *nextchar(pRExC_state);
2305                     }
2306                     else
2307                         lastbr = NULL;
2308                     if (c != ')')
2309                         vFAIL("Switch (?(condition)... contains too many branches");
2310                     ender = reg_node(pRExC_state, TAIL);
2311                     regtail(pRExC_state, br, ender);
2312                     if (lastbr) {
2313                         regtail(pRExC_state, lastbr, ender);
2314                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2315                     }
2316                     else
2317                         regtail(pRExC_state, ret, ender);
2318                     return ret;
2319                 }
2320                 else {
2321                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2322                 }
2323             }
2324             case 0:
2325                 RExC_parse--; /* for vFAIL to print correctly */
2326                 vFAIL("Sequence (? incomplete");
2327                 break;
2328             default:
2329                 --RExC_parse;
2330               parse_flags:      /* (?i) */
2331                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2332                     /* (?g), (?gc) and (?o) are useless here
2333                        and must be globally applied -- japhy */
2334
2335                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2336                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2337                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2338                             if (! (wastedflags & wflagbit) ) {
2339                                 wastedflags |= wflagbit;
2340                                 vWARN5(
2341                                     RExC_parse + 1,
2342                                     "Useless (%s%c) - %suse /%c modifier",
2343                                     flagsp == &negflags ? "?-" : "?",
2344                                     *RExC_parse,
2345                                     flagsp == &negflags ? "don't " : "",
2346                                     *RExC_parse
2347                                 );
2348                             }
2349                         }
2350                     }
2351                     else if (*RExC_parse == 'c') {
2352                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2353                             if (! (wastedflags & wasted_c) ) {
2354                                 wastedflags |= wasted_gc;
2355                                 vWARN3(
2356                                     RExC_parse + 1,
2357                                     "Useless (%sc) - %suse /gc modifier",
2358                                     flagsp == &negflags ? "?-" : "?",
2359                                     flagsp == &negflags ? "don't " : ""
2360                                 );
2361                             }
2362                         }
2363                     }
2364                     else { pmflag(flagsp, *RExC_parse); }
2365
2366                     ++RExC_parse;
2367                 }
2368                 if (*RExC_parse == '-') {
2369                     flagsp = &negflags;
2370                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2371                     ++RExC_parse;
2372                     goto parse_flags;
2373                 }
2374                 RExC_flags |= posflags;
2375                 RExC_flags &= ~negflags;
2376                 if (*RExC_parse == ':') {
2377                     RExC_parse++;
2378                     paren = ':';
2379                     break;
2380                 }               
2381               unknown:
2382                 if (*RExC_parse != ')') {
2383                     RExC_parse++;
2384                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2385                 }
2386                 nextchar(pRExC_state);
2387                 *flagp = TRYAGAIN;
2388                 return NULL;
2389             }
2390         }
2391         else {                  /* (...) */
2392             parno = RExC_npar;
2393             RExC_npar++;
2394             ret = reganode(pRExC_state, OPEN, parno);
2395             Set_Node_Length(ret, 1); /* MJD */
2396             Set_Node_Offset(ret, RExC_parse); /* MJD */
2397             open = 1;
2398         }
2399     }
2400     else                        /* ! paren */
2401         ret = NULL;
2402
2403     /* Pick up the branches, linking them together. */
2404     parse_start = RExC_parse;   /* MJD */
2405     br = regbranch(pRExC_state, &flags, 1);
2406     /*     branch_len = (paren != 0); */
2407     
2408     if (br == NULL)
2409         return(NULL);
2410     if (*RExC_parse == '|') {
2411         if (!SIZE_ONLY && RExC_extralen) {
2412             reginsert(pRExC_state, BRANCHJ, br);
2413         }
2414         else {                  /* MJD */
2415             reginsert(pRExC_state, BRANCH, br);
2416             Set_Node_Length(br, paren != 0);
2417             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2418         }
2419         have_branch = 1;
2420         if (SIZE_ONLY)
2421             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2422     }
2423     else if (paren == ':') {
2424         *flagp |= flags&SIMPLE;
2425     }
2426     if (open) {                         /* Starts with OPEN. */
2427         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2428     }
2429     else if (paren != '?')              /* Not Conditional */
2430         ret = br;
2431     *flagp |= flags & (SPSTART | HASWIDTH);
2432     lastbr = br;
2433     while (*RExC_parse == '|') {
2434         if (!SIZE_ONLY && RExC_extralen) {
2435             ender = reganode(pRExC_state, LONGJMP,0);
2436             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2437         }
2438         if (SIZE_ONLY)
2439             RExC_extralen += 2;         /* Account for LONGJMP. */
2440         nextchar(pRExC_state);
2441         br = regbranch(pRExC_state, &flags, 0);
2442         
2443         if (br == NULL)
2444             return(NULL);
2445         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2446         lastbr = br;
2447         if (flags&HASWIDTH)
2448             *flagp |= HASWIDTH;
2449         *flagp |= flags&SPSTART;
2450     }
2451
2452     if (have_branch || paren != ':') {
2453         /* Make a closing node, and hook it on the end. */
2454         switch (paren) {
2455         case ':':
2456             ender = reg_node(pRExC_state, TAIL);
2457             break;
2458         case 1:
2459             ender = reganode(pRExC_state, CLOSE, parno);
2460             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2461             Set_Node_Length(ender,1); /* MJD */
2462             break;
2463         case '<':
2464         case ',':
2465         case '=':
2466         case '!':
2467             *flagp &= ~HASWIDTH;
2468             /* FALL THROUGH */
2469         case '>':
2470             ender = reg_node(pRExC_state, SUCCEED);
2471             break;
2472         case 0:
2473             ender = reg_node(pRExC_state, END);
2474             break;
2475         }
2476         regtail(pRExC_state, lastbr, ender);
2477
2478         if (have_branch) {
2479             /* Hook the tails of the branches to the closing node. */
2480             for (br = ret; br != NULL; br = regnext(br)) {
2481                 regoptail(pRExC_state, br, ender);
2482             }
2483         }
2484     }
2485
2486     {
2487         char *p;
2488         static char parens[] = "=!<,>";
2489
2490         if (paren && (p = strchr(parens, paren))) {
2491             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2492             int flag = (p - parens) > 1;
2493
2494             if (paren == '>')
2495                 node = SUSPEND, flag = 0;
2496             reginsert(pRExC_state, node,ret);
2497             ret->flags = flag;
2498             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2499         }
2500     }
2501
2502     /* Check for proper termination. */
2503     if (paren) {
2504         RExC_flags = oregflags;
2505         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2506             RExC_parse = oregcomp_parse;
2507             vFAIL("Unmatched (");
2508         }
2509     }
2510     else if (!paren && RExC_parse < RExC_end) {
2511         if (*RExC_parse == ')') {
2512             RExC_parse++;
2513             vFAIL("Unmatched )");
2514         }
2515         else
2516             FAIL("Junk on end of regexp");      /* "Can't happen". */
2517         /* NOTREACHED */
2518     }
2519
2520     return(ret);
2521 }
2522
2523 /*
2524  - regbranch - one alternative of an | operator
2525  *
2526  * Implements the concatenation operator.
2527  */
2528 STATIC regnode *
2529 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2530 {
2531     register regnode *ret;
2532     register regnode *chain = NULL;
2533     register regnode *latest;
2534     I32 flags = 0, c = 0;
2535
2536     if (first)
2537         ret = NULL;
2538     else {
2539         if (!SIZE_ONLY && RExC_extralen)
2540             ret = reganode(pRExC_state, BRANCHJ,0);
2541         else {
2542             ret = reg_node(pRExC_state, BRANCH);
2543             Set_Node_Length(ret, 1);
2544         }
2545     }
2546         
2547     if (!first && SIZE_ONLY)
2548         RExC_extralen += 1;                     /* BRANCHJ */
2549
2550     *flagp = WORST;                     /* Tentatively. */
2551
2552     RExC_parse--;
2553     nextchar(pRExC_state);
2554     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2555         flags &= ~TRYAGAIN;
2556         latest = regpiece(pRExC_state, &flags);
2557         if (latest == NULL) {
2558             if (flags & TRYAGAIN)
2559                 continue;
2560             return(NULL);
2561         }
2562         else if (ret == NULL)
2563             ret = latest;
2564         *flagp |= flags&HASWIDTH;
2565         if (chain == NULL)      /* First piece. */
2566             *flagp |= flags&SPSTART;
2567         else {
2568             RExC_naughty++;
2569             regtail(pRExC_state, chain, latest);
2570         }
2571         chain = latest;
2572         c++;
2573     }
2574     if (chain == NULL) {        /* Loop ran zero times. */
2575         chain = reg_node(pRExC_state, NOTHING);
2576         if (ret == NULL)
2577             ret = chain;
2578     }
2579     if (c == 1) {
2580         *flagp |= flags&SIMPLE;
2581     }
2582
2583     return(ret);
2584 }
2585
2586 /*
2587  - regpiece - something followed by possible [*+?]
2588  *
2589  * Note that the branching code sequences used for ? and the general cases
2590  * of * and + are somewhat optimized:  they use the same NOTHING node as
2591  * both the endmarker for their branch list and the body of the last branch.
2592  * It might seem that this node could be dispensed with entirely, but the
2593  * endmarker role is not redundant.
2594  */
2595 STATIC regnode *
2596 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2597 {
2598     register regnode *ret;
2599     register char op;
2600     register char *next;
2601     I32 flags;
2602     char *origparse = RExC_parse;
2603     char *maxpos;
2604     I32 min;
2605     I32 max = REG_INFTY;
2606     char *parse_start;
2607
2608     ret = regatom(pRExC_state, &flags);
2609     if (ret == NULL) {
2610         if (flags & TRYAGAIN)
2611             *flagp |= TRYAGAIN;
2612         return(NULL);
2613     }
2614
2615     op = *RExC_parse;
2616
2617     if (op == '{' && regcurly(RExC_parse)) {
2618         parse_start = RExC_parse; /* MJD */
2619         next = RExC_parse + 1;
2620         maxpos = Nullch;
2621         while (isDIGIT(*next) || *next == ',') {
2622             if (*next == ',') {
2623                 if (maxpos)
2624                     break;
2625                 else
2626                     maxpos = next;
2627             }
2628             next++;
2629         }
2630         if (*next == '}') {             /* got one */
2631             if (!maxpos)
2632                 maxpos = next;
2633             RExC_parse++;
2634             min = atoi(RExC_parse);
2635             if (*maxpos == ',')
2636                 maxpos++;
2637             else
2638                 maxpos = RExC_parse;
2639             max = atoi(maxpos);
2640             if (!max && *maxpos != '0')
2641                 max = REG_INFTY;                /* meaning "infinity" */
2642             else if (max >= REG_INFTY)
2643                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2644             RExC_parse = next;
2645             nextchar(pRExC_state);
2646
2647         do_curly:
2648             if ((flags&SIMPLE)) {
2649                 RExC_naughty += 2 + RExC_naughty / 2;
2650                 reginsert(pRExC_state, CURLY, ret);
2651                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2652                 Set_Node_Cur_Length(ret);
2653             }
2654             else {
2655                 regnode *w = reg_node(pRExC_state, WHILEM);
2656
2657                 w->flags = 0;
2658                 regtail(pRExC_state, ret, w);
2659                 if (!SIZE_ONLY && RExC_extralen) {
2660                     reginsert(pRExC_state, LONGJMP,ret);
2661                     reginsert(pRExC_state, NOTHING,ret);
2662                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2663                 }
2664                 reginsert(pRExC_state, CURLYX,ret);
2665                                 /* MJD hk */
2666                 Set_Node_Offset(ret, parse_start+1);
2667                 Set_Node_Length(ret, 
2668                                 op == '{' ? (RExC_parse - parse_start) : 1);
2669                 
2670                 if (!SIZE_ONLY && RExC_extralen)
2671                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2672                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2673                 if (SIZE_ONLY)
2674                     RExC_whilem_seen++, RExC_extralen += 3;
2675                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2676             }
2677             ret->flags = 0;
2678
2679             if (min > 0)
2680                 *flagp = WORST;
2681             if (max > 0)
2682                 *flagp |= HASWIDTH;
2683             if (max && max < min)
2684                 vFAIL("Can't do {n,m} with n > m");
2685             if (!SIZE_ONLY) {
2686                 ARG1_SET(ret, (U16)min);
2687                 ARG2_SET(ret, (U16)max);
2688             }
2689
2690             goto nest_check;
2691         }
2692     }
2693
2694     if (!ISMULT1(op)) {
2695         *flagp = flags;
2696         return(ret);
2697     }
2698
2699 #if 0                           /* Now runtime fix should be reliable. */
2700
2701     /* if this is reinstated, don't forget to put this back into perldiag:
2702
2703             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2704
2705            (F) The part of the regexp subject to either the * or + quantifier
2706            could match an empty string. The {#} shows in the regular
2707            expression about where the problem was discovered.
2708
2709     */
2710
2711     if (!(flags&HASWIDTH) && op != '?')
2712       vFAIL("Regexp *+ operand could be empty");
2713 #endif
2714
2715     parse_start = RExC_parse;
2716     nextchar(pRExC_state);
2717
2718     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2719
2720     if (op == '*' && (flags&SIMPLE)) {
2721         reginsert(pRExC_state, STAR, ret);
2722         ret->flags = 0;
2723         RExC_naughty += 4;
2724     }
2725     else if (op == '*') {
2726         min = 0;
2727         goto do_curly;
2728     }
2729     else if (op == '+' && (flags&SIMPLE)) {
2730         reginsert(pRExC_state, PLUS, ret);
2731         ret->flags = 0;
2732         RExC_naughty += 3;
2733     }
2734     else if (op == '+') {
2735         min = 1;
2736         goto do_curly;
2737     }
2738     else if (op == '?') {
2739         min = 0; max = 1;
2740         goto do_curly;
2741     }
2742   nest_check:
2743     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2744         vWARN3(RExC_parse,
2745                "%.*s matches null string many times",
2746                RExC_parse - origparse,
2747                origparse);
2748     }
2749
2750     if (*RExC_parse == '?') {
2751         nextchar(pRExC_state);
2752         reginsert(pRExC_state, MINMOD, ret);
2753         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2754     }
2755     if (ISMULT2(RExC_parse)) {
2756         RExC_parse++;
2757         vFAIL("Nested quantifiers");
2758     }
2759
2760     return(ret);
2761 }
2762
2763 /*
2764  - regatom - the lowest level
2765  *
2766  * Optimization:  gobbles an entire sequence of ordinary characters so that
2767  * it can turn them into a single node, which is smaller to store and
2768  * faster to run.  Backslashed characters are exceptions, each becoming a
2769  * separate node; the code is simpler that way and it's not worth fixing.
2770  *
2771  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2772 STATIC regnode *
2773 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2774 {
2775     register regnode *ret = 0;
2776     I32 flags;
2777     char *parse_start = 0;
2778
2779     *flagp = WORST;             /* Tentatively. */
2780
2781 tryagain:
2782     switch (*RExC_parse) {
2783     case '^':
2784         RExC_seen_zerolen++;
2785         nextchar(pRExC_state);
2786         if (RExC_flags & PMf_MULTILINE)
2787             ret = reg_node(pRExC_state, MBOL);
2788         else if (RExC_flags & PMf_SINGLELINE)
2789             ret = reg_node(pRExC_state, SBOL);
2790         else
2791             ret = reg_node(pRExC_state, BOL);
2792         Set_Node_Length(ret, 1); /* MJD */
2793         break;
2794     case '$':
2795         nextchar(pRExC_state);
2796         if (*RExC_parse)
2797             RExC_seen_zerolen++;
2798         if (RExC_flags & PMf_MULTILINE)
2799             ret = reg_node(pRExC_state, MEOL);
2800         else if (RExC_flags & PMf_SINGLELINE)
2801             ret = reg_node(pRExC_state, SEOL);
2802         else
2803             ret = reg_node(pRExC_state, EOL);
2804         Set_Node_Length(ret, 1); /* MJD */
2805         break;
2806     case '.':
2807         nextchar(pRExC_state);
2808         if (RExC_flags & PMf_SINGLELINE)
2809             ret = reg_node(pRExC_state, SANY);
2810         else
2811             ret = reg_node(pRExC_state, REG_ANY);
2812         *flagp |= HASWIDTH|SIMPLE;
2813         RExC_naughty++;
2814         Set_Node_Length(ret, 1); /* MJD */
2815         break;
2816     case '[':
2817     {
2818         char *oregcomp_parse = ++RExC_parse;
2819         ret = regclass(pRExC_state);
2820         if (*RExC_parse != ']') {
2821             RExC_parse = oregcomp_parse;
2822             vFAIL("Unmatched [");
2823         }
2824         nextchar(pRExC_state);
2825         *flagp |= HASWIDTH|SIMPLE;
2826         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2827         break;
2828     }
2829     case '(':
2830         nextchar(pRExC_state);
2831         ret = reg(pRExC_state, 1, &flags);
2832         if (ret == NULL) {
2833                 if (flags & TRYAGAIN) {
2834                     if (RExC_parse == RExC_end) {
2835                          /* Make parent create an empty node if needed. */
2836                         *flagp |= TRYAGAIN;
2837                         return(NULL);
2838                     }
2839                     goto tryagain;
2840                 }
2841                 return(NULL);
2842         }
2843         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2844         break;
2845     case '|':
2846     case ')':
2847         if (flags & TRYAGAIN) {
2848             *flagp |= TRYAGAIN;
2849             return NULL;
2850         }
2851         vFAIL("Internal urp");
2852                                 /* Supposed to be caught earlier. */
2853         break;
2854     case '{':
2855         if (!regcurly(RExC_parse)) {
2856             RExC_parse++;
2857             goto defchar;
2858         }
2859         /* FALL THROUGH */
2860     case '?':
2861     case '+':
2862     case '*':
2863         RExC_parse++;
2864         vFAIL("Quantifier follows nothing");
2865         break;
2866     case '\\':
2867         switch (*++RExC_parse) {
2868         case 'A':
2869             RExC_seen_zerolen++;
2870             ret = reg_node(pRExC_state, SBOL);
2871             *flagp |= SIMPLE;
2872             nextchar(pRExC_state);
2873             Set_Node_Length(ret, 2); /* MJD */
2874             break;
2875         case 'G':
2876             ret = reg_node(pRExC_state, GPOS);
2877             RExC_seen |= REG_SEEN_GPOS;
2878             *flagp |= SIMPLE;
2879             nextchar(pRExC_state);
2880             Set_Node_Length(ret, 2); /* MJD */
2881             break;
2882         case 'Z':
2883             ret = reg_node(pRExC_state, SEOL);
2884             *flagp |= SIMPLE;
2885             RExC_seen_zerolen++;                /* Do not optimize RE away */
2886             nextchar(pRExC_state);
2887             break;
2888         case 'z':
2889             ret = reg_node(pRExC_state, EOS);
2890             *flagp |= SIMPLE;
2891             RExC_seen_zerolen++;                /* Do not optimize RE away */
2892             nextchar(pRExC_state);
2893             Set_Node_Length(ret, 2); /* MJD */
2894             break;
2895         case 'C':
2896             ret = reg_node(pRExC_state, CANY);
2897             RExC_seen |= REG_SEEN_CANY;
2898             *flagp |= HASWIDTH|SIMPLE;
2899             nextchar(pRExC_state);
2900             Set_Node_Length(ret, 2); /* MJD */
2901             break;
2902         case 'X':
2903             ret = reg_node(pRExC_state, CLUMP);
2904             *flagp |= HASWIDTH;
2905             nextchar(pRExC_state);
2906             Set_Node_Length(ret, 2); /* MJD */
2907             break;
2908         case 'w':
2909             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2910             *flagp |= HASWIDTH|SIMPLE;
2911             nextchar(pRExC_state);
2912             Set_Node_Length(ret, 2); /* MJD */
2913             break;
2914         case 'W':
2915             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2916             *flagp |= HASWIDTH|SIMPLE;
2917             nextchar(pRExC_state);
2918             Set_Node_Length(ret, 2); /* MJD */
2919             break;
2920         case 'b':
2921             RExC_seen_zerolen++;
2922             RExC_seen |= REG_SEEN_LOOKBEHIND;
2923             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2924             *flagp |= SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'B':
2929             RExC_seen_zerolen++;
2930             RExC_seen |= REG_SEEN_LOOKBEHIND;
2931             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2932             *flagp |= SIMPLE;
2933             nextchar(pRExC_state);
2934             Set_Node_Length(ret, 2); /* MJD */
2935             break;
2936         case 's':
2937             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2938             *flagp |= HASWIDTH|SIMPLE;
2939             nextchar(pRExC_state);
2940             Set_Node_Length(ret, 2); /* MJD */
2941             break;
2942         case 'S':
2943             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2944             *flagp |= HASWIDTH|SIMPLE;
2945             nextchar(pRExC_state);
2946             Set_Node_Length(ret, 2); /* MJD */
2947             break;
2948         case 'd':
2949             ret = reg_node(pRExC_state, DIGIT);
2950             *flagp |= HASWIDTH|SIMPLE;
2951             nextchar(pRExC_state);
2952             Set_Node_Length(ret, 2); /* MJD */
2953             break;
2954         case 'D':
2955             ret = reg_node(pRExC_state, NDIGIT);
2956             *flagp |= HASWIDTH|SIMPLE;
2957             nextchar(pRExC_state);
2958             Set_Node_Length(ret, 2); /* MJD */
2959             break;
2960         case 'p':
2961         case 'P':
2962             {   
2963                 char* oldregxend = RExC_end;
2964                 char* parse_start = RExC_parse;
2965
2966                 if (RExC_parse[1] == '{') {
2967                   /* a lovely hack--pretend we saw [\pX] instead */
2968                     RExC_end = strchr(RExC_parse, '}');
2969                     if (!RExC_end) {
2970                         U8 c = (U8)*RExC_parse;
2971                         RExC_parse += 2;
2972                         RExC_end = oldregxend;
2973                         vFAIL2("Missing right brace on \\%c{}", c);
2974                     }
2975                     RExC_end++;
2976                 }
2977                 else {
2978                     RExC_end = RExC_parse + 2;
2979                     if (RExC_end > oldregxend)
2980                         RExC_end = oldregxend;
2981                 }
2982                 RExC_parse--;
2983
2984                 ret = regclass(pRExC_state);
2985
2986                 RExC_end = oldregxend;
2987                 RExC_parse--;
2988                 Set_Node_Cur_Length(ret); /* MJD */
2989                 nextchar(pRExC_state);
2990                 *flagp |= HASWIDTH|SIMPLE;
2991             }
2992             break;
2993         case 'n':
2994         case 'r':
2995         case 't':
2996         case 'f':
2997         case 'e':
2998         case 'a':
2999         case 'x':
3000         case 'c':
3001         case '0':
3002             goto defchar;
3003         case '1': case '2': case '3': case '4':
3004         case '5': case '6': case '7': case '8': case '9':
3005             {
3006                 I32 num = atoi(RExC_parse);
3007
3008                 if (num > 9 && num >= RExC_npar)
3009                     goto defchar;
3010                 else {
3011                     char * parse_start = RExC_parse - 1; /* MJD */
3012                     while (isDIGIT(*RExC_parse))
3013                         RExC_parse++;
3014
3015                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3016                         vFAIL("Reference to nonexistent group");
3017                     RExC_sawback = 1;
3018                     ret = reganode(pRExC_state,
3019                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3020                                    num);
3021                     *flagp |= HASWIDTH;
3022                     
3023                     /* override incorrect value set in reganode MJD */
3024                     Set_Node_Offset(ret, parse_start+1); 
3025                     Set_Node_Cur_Length(ret); /* MJD */
3026                     RExC_parse--;
3027                     nextchar(pRExC_state);
3028                 }
3029             }
3030             break;
3031         case '\0':
3032             if (RExC_parse >= RExC_end)
3033                 FAIL("Trailing \\");
3034             /* FALL THROUGH */
3035         default:
3036             /* Do not generate `unrecognized' warnings here, we fall
3037                back into the quick-grab loop below */
3038             goto defchar;
3039         }
3040         break;
3041
3042     case '#':
3043         if (RExC_flags & PMf_EXTENDED) {
3044             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3045             if (RExC_parse < RExC_end)
3046                 goto tryagain;
3047         }
3048         /* FALL THROUGH */
3049
3050     default: {
3051             register STRLEN len;
3052             register UV ender;
3053             register char *p;
3054             char *oldp, *s;
3055             STRLEN numlen;
3056             STRLEN foldlen;
3057             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3058
3059             parse_start = RExC_parse - 1;
3060
3061             RExC_parse++;
3062
3063         defchar:
3064             ender = 0;
3065             ret = reg_node(pRExC_state,
3066                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3067             s = STRING(ret);
3068             for (len = 0, p = RExC_parse - 1;
3069               len < 127 && p < RExC_end;
3070               len++)
3071             {
3072                 oldp = p;
3073
3074                 if (RExC_flags & PMf_EXTENDED)
3075                     p = regwhite(p, RExC_end);
3076                 switch (*p) {
3077                 case '^':
3078                 case '$':
3079                 case '.':
3080                 case '[':
3081                 case '(':
3082                 case ')':
3083                 case '|':
3084                     goto loopdone;
3085                 case '\\':
3086                     switch (*++p) {
3087                     case 'A':
3088                     case 'C':
3089                     case 'X':
3090                     case 'G':
3091                     case 'Z':
3092                     case 'z':
3093                     case 'w':
3094                     case 'W':
3095                     case 'b':
3096                     case 'B':
3097                     case 's':
3098                     case 'S':
3099                     case 'd':
3100                     case 'D':
3101                     case 'p':
3102                     case 'P':
3103                         --p;
3104                         goto loopdone;
3105                     case 'n':
3106                         ender = '\n';
3107                         p++;
3108                         break;
3109                     case 'r':
3110                         ender = '\r';
3111                         p++;
3112                         break;
3113                     case 't':
3114                         ender = '\t';
3115                         p++;
3116                         break;
3117                     case 'f':
3118                         ender = '\f';
3119                         p++;
3120                         break;
3121                     case 'e':
3122                           ender = ASCII_TO_NATIVE('\033');
3123                         p++;
3124                         break;
3125                     case 'a':
3126                           ender = ASCII_TO_NATIVE('\007');
3127                         p++;
3128                         break;
3129                     case 'x':
3130                         if (*++p == '{') {
3131                             char* e = strchr(p, '}');
3132         
3133                             if (!e) {
3134                                 RExC_parse = p + 1;
3135                                 vFAIL("Missing right brace on \\x{}");
3136                             }
3137                             else {
3138                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3139                                     | PERL_SCAN_DISALLOW_PREFIX;
3140                                 numlen = e - p - 1;
3141                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3142                                 if (ender > 0xff)
3143                                     RExC_utf8 = 1;
3144                                 /* numlen is generous */
3145                                 if (numlen + len >= 127) {
3146                                     p--;
3147                                     goto loopdone;
3148                                 }
3149                                 p = e + 1;
3150                             }
3151                         }
3152                         else {
3153                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3154                             numlen = 2;
3155                             ender = grok_hex(p, &numlen, &flags, NULL);
3156                             p += numlen;
3157                         }
3158                         break;
3159                     case 'c':
3160                         p++;
3161                         ender = UCHARAT(p++);
3162                         ender = toCTRL(ender);
3163                         break;
3164                     case '0': case '1': case '2': case '3':case '4':
3165                     case '5': case '6': case '7': case '8':case '9':
3166                         if (*p == '0' ||
3167                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3168                             I32 flags = 0;
3169                             numlen = 3;
3170                             ender = grok_oct(p, &numlen, &flags, NULL);
3171                             p += numlen;
3172                         }
3173                         else {
3174                             --p;
3175                             goto loopdone;
3176                         }
3177                         break;
3178                     case '\0':
3179                         if (p >= RExC_end)
3180                             FAIL("Trailing \\");
3181                         /* FALL THROUGH */
3182                     default:
3183                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3184                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3185                         goto normal_default;
3186                     }
3187                     break;
3188                 default:
3189                   normal_default:
3190                     if (UTF8_IS_START(*p) && UTF) {
3191                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3192                                                &numlen, 0);
3193                         p += numlen;
3194                     }
3195                     else
3196                         ender = *p++;
3197                     break;
3198                 }
3199                 if (RExC_flags & PMf_EXTENDED)
3200                     p = regwhite(p, RExC_end);
3201                 if (UTF && FOLD) {
3202                     /* Prime the casefolded buffer. */
3203                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3204                 }
3205                 if (ISMULT2(p)) { /* Back off on ?+*. */
3206                     if (len)
3207                         p = oldp;
3208                     else if (UTF) {
3209                          STRLEN unilen;
3210
3211                          if (FOLD) {
3212                               /* Emit all the Unicode characters. */
3213                               for (foldbuf = tmpbuf;
3214                                    foldlen;
3215                                    foldlen -= numlen) {
3216                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3217                                    if (numlen > 0) {
3218                                         reguni(pRExC_state, ender, s, &unilen);
3219                                         s       += unilen;
3220                                         len     += unilen;
3221                                         /* In EBCDIC the numlen
3222                                          * and unilen can differ. */
3223                                         foldbuf += numlen;
3224                                         if (numlen >= foldlen)
3225                                              break;
3226                                    }
3227                                    else
3228                                         break; /* "Can't happen." */
3229                               }
3230                          }
3231                          else {
3232                               reguni(pRExC_state, ender, s, &unilen);
3233                               if (unilen > 0) {
3234                                    s   += unilen;
3235                                    len += unilen;
3236                               }
3237                          }
3238                     }
3239                     else {
3240                         len++;
3241                         REGC((char)ender, s++);
3242                     }
3243                     break;
3244                 }
3245                 if (UTF) {
3246                      STRLEN unilen;
3247
3248                      if (FOLD) {
3249                           /* Emit all the Unicode characters. */
3250                           for (foldbuf = tmpbuf;
3251                                foldlen;
3252                                foldlen -= numlen) {
3253                                ender = utf8_to_uvchr(foldbuf, &numlen);
3254                                if (numlen > 0) {
3255                                     reguni(pRExC_state, ender, s, &unilen);
3256                                     len     += unilen;
3257                                     s       += unilen;
3258                                     /* In EBCDIC the numlen
3259                                      * and unilen can differ. */
3260                                     foldbuf += numlen;
3261                                     if (numlen >= foldlen)
3262                                          break;
3263                                }
3264                                else
3265                                     break;
3266                           }
3267                      }
3268                      else {
3269                           reguni(pRExC_state, ender, s, &unilen);
3270                           if (unilen > 0) {
3271                                s   += unilen;
3272                                len += unilen;
3273                           }
3274                      }
3275                      len--;
3276                 }
3277                 else
3278                     REGC((char)ender, s++);
3279             }
3280         loopdone:
3281             RExC_parse = p - 1;
3282             Set_Node_Cur_Length(ret); /* MJD */
3283             nextchar(pRExC_state);
3284             {
3285                 /* len is STRLEN which is unsigned, need to copy to signed */
3286                 IV iv = len;
3287                 if (iv < 0)
3288                     vFAIL("Internal disaster");
3289             }
3290             if (len > 0)
3291                 *flagp |= HASWIDTH;
3292             if (len == 1)
3293                 *flagp |= SIMPLE;
3294             if (!SIZE_ONLY)
3295                 STR_LEN(ret) = len;
3296             if (SIZE_ONLY)
3297                 RExC_size += STR_SZ(len);
3298             else
3299                 RExC_emit += STR_SZ(len);
3300         }
3301         break;
3302     }
3303
3304     /* If the encoding pragma is in effect recode the text of
3305      * any EXACT-kind nodes. */
3306     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3307          STRLEN oldlen = STR_LEN(ret);
3308          SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3309
3310          if (RExC_utf8)
3311               SvUTF8_on(sv);
3312          if (sv_utf8_downgrade(sv, TRUE)) {
3313               char *s       = sv_recode_to_utf8(sv, PL_encoding);
3314               STRLEN newlen = SvCUR(sv);
3315          
3316               if (!SIZE_ONLY) {
3317                    DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3318                                          (int)oldlen, STRING(ret),
3319                                          (int)newlen, s));
3320                    Copy(s, STRING(ret), newlen, char);
3321                    STR_LEN(ret) += newlen - oldlen;
3322                    RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3323               } else
3324                    RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3325          }
3326     }
3327
3328     return(ret);
3329 }
3330
3331 STATIC char *
3332 S_regwhite(pTHX_ char *p, char *e)
3333 {
3334     while (p < e) {
3335         if (isSPACE(*p))
3336             ++p;
3337         else if (*p == '#') {
3338             do {
3339                 p++;
3340             } while (p < e && *p != '\n');
3341         }
3342         else
3343             break;
3344     }
3345     return p;
3346 }
3347
3348 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3349    Character classes ([:foo:]) can also be negated ([:^foo:]).
3350    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3351    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3352    but trigger failures because they are currently unimplemented. */
3353
3354 #define POSIXCC_DONE(c)   ((c) == ':')
3355 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3356 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3357
3358 STATIC I32
3359 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3360 {
3361     char *posixcc = 0;
3362     I32 namedclass = OOB_NAMEDCLASS;
3363
3364     if (value == '[' && RExC_parse + 1 < RExC_end &&
3365         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3366         POSIXCC(UCHARAT(RExC_parse))) {
3367         char  c = UCHARAT(RExC_parse);
3368         char* s = RExC_parse++;
3369         
3370         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3371             RExC_parse++;
3372         if (RExC_parse == RExC_end)
3373             /* Grandfather lone [:, [=, [. */
3374             RExC_parse = s;
3375         else {
3376             char* t = RExC_parse++; /* skip over the c */
3377
3378             if (UCHARAT(RExC_parse) == ']') {
3379                 RExC_parse++; /* skip over the ending ] */
3380                 posixcc = s + 1;
3381                 if (*s == ':') {
3382                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3383                     I32 skip = 5; /* the most common skip */
3384
3385                     switch (*posixcc) {
3386                     case 'a':
3387                         if (strnEQ(posixcc, "alnum", 5))
3388                             namedclass =
3389                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3390                         else if (strnEQ(posixcc, "alpha", 5))
3391                             namedclass =
3392                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3393                         else if (strnEQ(posixcc, "ascii", 5))
3394                             namedclass =
3395                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3396                         break;
3397                     case 'b':
3398                         if (strnEQ(posixcc, "blank", 5))
3399                             namedclass =
3400                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3401                         break;
3402                     case 'c':
3403                         if (strnEQ(posixcc, "cntrl", 5))
3404                             namedclass =
3405                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3406                         break;
3407                     case 'd':
3408                         if (strnEQ(posixcc, "digit", 5))
3409                             namedclass =
3410                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3411                         break;
3412                     case 'g':
3413                         if (strnEQ(posixcc, "graph", 5))
3414                             namedclass =
3415                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3416                         break;
3417                     case 'l':
3418                         if (strnEQ(posixcc, "lower", 5))
3419                             namedclass =
3420                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3421                         break;
3422                     case 'p':
3423                         if (strnEQ(posixcc, "print", 5))
3424                             namedclass =
3425                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3426                         else if (strnEQ(posixcc, "punct", 5))
3427                             namedclass =
3428                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3429                         break;
3430                     case 's':
3431                         if (strnEQ(posixcc, "space", 5))
3432                             namedclass =
3433                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3434                         break;
3435                     case 'u':
3436                         if (strnEQ(posixcc, "upper", 5))
3437                             namedclass =
3438                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3439                         break;
3440                     case 'w': /* this is not POSIX, this is the Perl \w */
3441                         if (strnEQ(posixcc, "word", 4)) {
3442                             namedclass =
3443                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3444                             skip = 4;
3445                         }
3446                         break;
3447                     case 'x':
3448                         if (strnEQ(posixcc, "xdigit", 6)) {
3449                             namedclass =
3450                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3451                             skip = 6;
3452                         }
3453                         break;
3454                     }
3455                     if (namedclass == OOB_NAMEDCLASS ||
3456                         posixcc[skip] != ':' ||
3457                         posixcc[skip+1] != ']')
3458                     {
3459                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3460                                       t - s - 1, s + 1);
3461                     }
3462                 } else if (!SIZE_ONLY) {
3463                     /* [[=foo=]] and [[.foo.]] are still future. */
3464
3465                     /* adjust RExC_parse so the warning shows after
3466                        the class closes */
3467                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3468                         RExC_parse++;
3469                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3470                 }
3471             } else {
3472                 /* Maternal grandfather:
3473                  * "[:" ending in ":" but not in ":]" */
3474                 RExC_parse = s;
3475             }
3476         }
3477     }
3478
3479     return namedclass;
3480 }
3481
3482 STATIC void
3483 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3484 {
3485     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3486         char *s = RExC_parse;
3487         char  c = *s++;
3488
3489         while(*s && isALNUM(*s))
3490             s++;
3491         if (*s && c == *s && s[1] == ']') {
3492             if (ckWARN(WARN_REGEXP))
3493                 vWARN3(s+2,
3494                         "POSIX syntax [%c %c] belongs inside character classes",
3495                         c, c);
3496
3497             /* [[=foo=]] and [[.foo.]] are still future. */
3498             if (POSIXCC_NOTYET(c)) {
3499                 /* adjust RExC_parse so the error shows after
3500                    the class closes */
3501                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3502                     ;
3503                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3504             }
3505         }
3506     }
3507 }
3508
3509 STATIC regnode *
3510 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3511 {
3512     register UV value;
3513     register UV nextvalue;
3514     register IV prevvalue = OOB_UNICODE;
3515     register IV range = 0;
3516     register regnode *ret;
3517     STRLEN numlen;
3518     IV namedclass;
3519     char *rangebegin = 0;
3520     bool need_class = 0;
3521     SV *listsv = Nullsv;
3522     register char *e;
3523     UV n;
3524     bool optimize_invert   = TRUE;
3525     AV* unicode_alternate  = 0;
3526
3527     ret = reganode(pRExC_state, ANYOF, 0);
3528
3529     if (!SIZE_ONLY)
3530         ANYOF_FLAGS(ret) = 0;
3531
3532     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3533         RExC_naughty++;
3534         RExC_parse++;
3535         if (!SIZE_ONLY)
3536             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3537     }
3538
3539     if (SIZE_ONLY)
3540         RExC_size += ANYOF_SKIP;
3541     else {
3542         RExC_emit += ANYOF_SKIP;
3543         if (FOLD)
3544             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3545         if (LOC)
3546             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3547         ANYOF_BITMAP_ZERO(ret);
3548         listsv = newSVpvn("# comment\n", 10);
3549     }
3550
3551     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3552
3553     if (!SIZE_ONLY && POSIXCC(nextvalue))
3554         checkposixcc(pRExC_state);
3555
3556     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3557     if (UCHARAT(RExC_parse) == ']')
3558         goto charclassloop;
3559
3560     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3561
3562     charclassloop:
3563
3564         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3565
3566         if (!range)
3567             rangebegin = RExC_parse;
3568         if (UTF) {
3569             value = utf8n_to_uvchr((U8*)RExC_parse,
3570                                    RExC_end - RExC_parse,
3571                                    &numlen, 0);
3572             RExC_parse += numlen;
3573         }
3574         else
3575             value = UCHARAT(RExC_parse++);
3576         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3577         if (value == '[' && POSIXCC(nextvalue))
3578             namedclass = regpposixcc(pRExC_state, value);
3579         else if (value == '\\') {
3580             if (UTF) {
3581                 value = utf8n_to_uvchr((U8*)RExC_parse,
3582                                    RExC_end - RExC_parse,
3583                                    &numlen, 0);
3584                 RExC_parse += numlen;
3585             }
3586             else
3587                 value = UCHARAT(RExC_parse++);
3588             /* Some compilers cannot handle switching on 64-bit integer
3589              * values, therefore value cannot be an UV.  Yes, this will
3590              * be a problem later if we want switch on Unicode.
3591              * A similar issue a little bit later when switching on
3592              * namedclass. --jhi */
3593             switch ((I32)value) {
3594             case 'w':   namedclass = ANYOF_ALNUM;       break;
3595             case 'W':   namedclass = ANYOF_NALNUM;      break;
3596             case 's':   namedclass = ANYOF_SPACE;       break;
3597             case 'S':   namedclass = ANYOF_NSPACE;      break;
3598             case 'd':   namedclass = ANYOF_DIGIT;       break;
3599             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3600             case 'p':
3601             case 'P':
3602                 if (RExC_parse >= RExC_end)
3603                     vFAIL2("Empty \\%c{}", (U8)value);
3604                 if (*RExC_parse == '{') {
3605                     U8 c = (U8)value;
3606                     e = strchr(RExC_parse++, '}');
3607                     if (!e)
3608                         vFAIL2("Missing right brace on \\%c{}", c);
3609                     while (isSPACE(UCHARAT(RExC_parse)))
3610                         RExC_parse++;
3611                     if (e == RExC_parse)
3612                         vFAIL2("Empty \\%c{}", c);
3613                     n = e - RExC_parse;
3614                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3615                         n--;
3616                 }
3617                 else {
3618                     e = RExC_parse;
3619                     n = 1;
3620                 }
3621                 if (!SIZE_ONLY) {
3622                     if (UCHARAT(RExC_parse) == '^') {
3623                          RExC_parse++;
3624                          n--;
3625                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3626                          while (isSPACE(UCHARAT(RExC_parse))) {
3627                               RExC_parse++;
3628                               n--;
3629                          }
3630                     }
3631                     if (value == 'p')
3632                          Perl_sv_catpvf(aTHX_ listsv,
3633                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3634                     else
3635                          Perl_sv_catpvf(aTHX_ listsv,
3636                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3637                 }
3638                 RExC_parse = e + 1;
3639                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3640                 continue;
3641             case 'n':   value = '\n';                   break;
3642             case 'r':   value = '\r';                   break;
3643             case 't':   value = '\t';                   break;
3644             case 'f':   value = '\f';                   break;
3645             case 'b':   value = '\b';                   break;
3646             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3647             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3648             case 'x':
3649                 if (*RExC_parse == '{') {
3650                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3651                         | PERL_SCAN_DISALLOW_PREFIX;
3652                     e = strchr(RExC_parse++, '}');
3653                     if (!e)
3654                         vFAIL("Missing right brace on \\x{}");
3655
3656                     numlen = e - RExC_parse;
3657                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3658                     RExC_parse = e + 1;
3659                 }
3660                 else {
3661                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3662                     numlen = 2;
3663                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3664                     RExC_parse += numlen;
3665                 }
3666                 break;
3667             case 'c':
3668                 value = UCHARAT(RExC_parse++);
3669                 value = toCTRL(value);
3670                 break;
3671             case '0': case '1': case '2': case '3': case '4':
3672             case '5': case '6': case '7': case '8': case '9':
3673             {
3674                 I32 flags = 0;
3675                 numlen = 3;
3676                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3677                 RExC_parse += numlen;
3678                 break;
3679             }
3680             default:
3681                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3682                     vWARN2(RExC_parse,
3683                            "Unrecognized escape \\%c in character class passed through",
3684                            (int)value);
3685                 break;
3686             }
3687         } /* end of \blah */
3688
3689         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3690
3691             if (!SIZE_ONLY && !need_class)
3692                 ANYOF_CLASS_ZERO(ret);
3693
3694             need_class = 1;
3695
3696             /* a bad range like a-\d, a-[:digit:] ? */
3697             if (range) {
3698                 if (!SIZE_ONLY) {
3699                     if (ckWARN(WARN_REGEXP))
3700                         vWARN4(RExC_parse,
3701                                "False [] range \"%*.*s\"",
3702                                RExC_parse - rangebegin,
3703                                RExC_parse - rangebegin,
3704                                rangebegin);
3705                     if (prevvalue < 256) {
3706                         ANYOF_BITMAP_SET(ret, prevvalue);
3707                         ANYOF_BITMAP_SET(ret, '-');
3708                     }
3709                     else {
3710                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3711                         Perl_sv_catpvf(aTHX_ listsv,
3712                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3713                     }
3714                 }
3715
3716                 range = 0; /* this was not a true range */
3717             }
3718
3719             if (!SIZE_ONLY) {
3720                 if (namedclass > OOB_NAMEDCLASS)
3721                     optimize_invert = FALSE;
3722                 /* Possible truncation here but in some 64-bit environments
3723                  * the compiler gets heartburn about switch on 64-bit values.
3724                  * A similar issue a little earlier when switching on value.
3725                  * --jhi */
3726                 switch ((I32)namedclass) {
3727                 case ANYOF_ALNUM:
3728                     if (LOC)
3729                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3730                     else {
3731                         for (value = 0; value < 256; value++)
3732                             if (isALNUM(value))
3733                                 ANYOF_BITMAP_SET(ret, value);
3734                     }
3735                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3736                     break;
3737                 case ANYOF_NALNUM:
3738                     if (LOC)
3739                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3740                     else {
3741                         for (value = 0; value < 256; value++)
3742                             if (!isALNUM(value))
3743                                 ANYOF_BITMAP_SET(ret, value);
3744                     }
3745                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3746                     break;
3747                 case ANYOF_ALNUMC:
3748                     if (LOC)
3749                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3750                     else {
3751                         for (value = 0; value < 256; value++)
3752                             if (isALNUMC(value))
3753                                 ANYOF_BITMAP_SET(ret, value);
3754                     }
3755                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3756                     break;
3757                 case ANYOF_NALNUMC:
3758                     if (LOC)
3759                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3760                     else {
3761                         for (value = 0; value < 256; value++)
3762                             if (!isALNUMC(value))
3763                                 ANYOF_BITMAP_SET(ret, value);
3764                     }
3765                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3766                     break;
3767                 case ANYOF_ALPHA:
3768                     if (LOC)
3769                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3770                     else {
3771                         for (value = 0; value < 256; value++)
3772                             if (isALPHA(value))
3773                                 ANYOF_BITMAP_SET(ret, value);
3774                     }
3775                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3776                     break;
3777                 case ANYOF_NALPHA:
3778                     if (LOC)
3779                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3780                     else {
3781                         for (value = 0; value < 256; value++)
3782                             if (!isALPHA(value))
3783                                 ANYOF_BITMAP_SET(ret, value);
3784                     }
3785                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3786                     break;
3787                 case ANYOF_ASCII:
3788                     if (LOC)
3789                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3790                     else {
3791 #ifndef EBCDIC
3792                         for (value = 0; value < 128; value++)
3793                             ANYOF_BITMAP_SET(ret, value);
3794 #else  /* EBCDIC */
3795                         for (value = 0; value < 256; value++) {
3796                             if (isASCII(value))
3797                                 ANYOF_BITMAP_SET(ret, value);
3798                         }
3799 #endif /* EBCDIC */
3800                     }
3801                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3802                     break;
3803                 case ANYOF_NASCII:
3804                     if (LOC)
3805                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3806                     else {
3807 #ifndef EBCDIC
3808                         for (value = 128; value < 256; value++)
3809                             ANYOF_BITMAP_SET(ret, value);
3810 #else  /* EBCDIC */
3811                         for (value = 0; value < 256; value++) {
3812                             if (!isASCII(value))
3813                                 ANYOF_BITMAP_SET(ret, value);
3814                         }
3815 #endif /* EBCDIC */
3816                     }
3817                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3818                     break;
3819                 case ANYOF_BLANK:
3820                     if (LOC)
3821                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3822                     else {
3823                         for (value = 0; value < 256; value++)
3824                             if (isBLANK(value))
3825                                 ANYOF_BITMAP_SET(ret, value);
3826                     }
3827                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3828                     break;
3829                 case ANYOF_NBLANK:
3830                     if (LOC)
3831                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3832                     else {
3833                         for (value = 0; value < 256; value++)
3834                             if (!isBLANK(value))
3835                                 ANYOF_BITMAP_SET(ret, value);
3836                     }
3837                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3838                     break;
3839                 case ANYOF_CNTRL:
3840                     if (LOC)
3841                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3842                     else {
3843                         for (value = 0; value < 256; value++)
3844                             if (isCNTRL(value))
3845                                 ANYOF_BITMAP_SET(ret, value);
3846                     }
3847                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3848                     break;
3849                 case ANYOF_NCNTRL:
3850                     if (LOC)
3851                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3852                     else {
3853                         for (value = 0; value < 256; value++)
3854                             if (!isCNTRL(value))
3855                                 ANYOF_BITMAP_SET(ret, value);
3856                     }
3857                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3858                     break;
3859                 case ANYOF_DIGIT:
3860                     if (LOC)
3861                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3862                     else {
3863                         /* consecutive digits assumed */
3864                         for (value = '0'; value <= '9'; value++)
3865                             ANYOF_BITMAP_SET(ret, value);
3866                     }
3867                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3868                     break;
3869                 case ANYOF_NDIGIT:
3870                     if (LOC)
3871                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3872                     else {
3873                         /* consecutive digits assumed */
3874                         for (value = 0; value < '0'; value++)
3875                             ANYOF_BITMAP_SET(ret, value);
3876                         for (value = '9' + 1; value < 256; value++)
3877                             ANYOF_BITMAP_SET(ret, value);
3878                     }
3879                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3880                     break;
3881                 case ANYOF_GRAPH:
3882                     if (LOC)
3883                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3884                     else {
3885                         for (value = 0; value < 256; value++)
3886                             if (isGRAPH(value))
3887                                 ANYOF_BITMAP_SET(ret, value);
3888                     }
3889                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3890                     break;
3891                 case ANYOF_NGRAPH:
3892                     if (LOC)
3893                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3894                     else {
3895                         for (value = 0; value < 256; value++)
3896                             if (!isGRAPH(value))
3897                                 ANYOF_BITMAP_SET(ret, value);
3898                     }
3899                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3900                     break;
3901                 case ANYOF_LOWER:
3902                     if (LOC)
3903                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3904                     else {
3905                         for (value = 0; value < 256; value++)
3906                             if (isLOWER(value))
3907                                 ANYOF_BITMAP_SET(ret, value);
3908                     }
3909                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3910                     break;
3911                 case ANYOF_NLOWER:
3912                     if (LOC)
3913                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3914                     else {
3915                         for (value = 0; value < 256; value++)
3916                             if (!isLOWER(value))
3917                                 ANYOF_BITMAP_SET(ret, value);
3918                     }
3919                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3920                     break;
3921                 case ANYOF_PRINT:
3922                     if (LOC)
3923                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3924                     else {
3925                         for (value = 0; value < 256; value++)
3926                             if (isPRINT(value))
3927                                 ANYOF_BITMAP_SET(ret, value);
3928                     }
3929                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3930                     break;
3931                 case ANYOF_NPRINT:
3932                     if (LOC)
3933                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3934                     else {
3935                         for (value = 0; value < 256; value++)
3936                             if (!isPRINT(value))
3937                                 ANYOF_BITMAP_SET(ret, value);
3938                     }
3939                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3940                     break;
3941                 case ANYOF_PSXSPC:
3942                     if (LOC)
3943                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3944                     else {
3945                         for (value = 0; value < 256; value++)
3946                             if (isPSXSPC(value))
3947                                 ANYOF_BITMAP_SET(ret, value);
3948                     }
3949                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3950                     break;
3951                 case ANYOF_NPSXSPC:
3952                     if (LOC)
3953                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3954                     else {
3955                         for (value = 0; value < 256; value++)
3956                             if (!isPSXSPC(value))
3957                                 ANYOF_BITMAP_SET(ret, value);
3958                     }
3959                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3960                     break;
3961                 case ANYOF_PUNCT:
3962                     if (LOC)
3963                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3964                     else {
3965                         for (value = 0; value < 256; value++)
3966                             if (isPUNCT(value))
3967                                 ANYOF_BITMAP_SET(ret, value);
3968                     }
3969                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3970                     break;
3971                 case ANYOF_NPUNCT:
3972                     if (LOC)
3973                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3974                     else {
3975                         for (value = 0; value < 256; value++)
3976                             if (!isPUNCT(value))
3977                                 ANYOF_BITMAP_SET(ret, value);
3978                     }
3979                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3980                     break;
3981                 case ANYOF_SPACE:
3982                     if (LOC)
3983                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3984                     else {
3985                         for (value = 0; value < 256; value++)
3986                             if (isSPACE(value))
3987                                 ANYOF_BITMAP_SET(ret, value);
3988                     }
3989                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3990                     break;
3991                 case ANYOF_NSPACE:
3992                     if (LOC)
3993                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3994                     else {
3995                         for (value = 0; value < 256; value++)
3996                             if (!isSPACE(value))
3997                                 ANYOF_BITMAP_SET(ret, value);
3998                     }
3999                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4000                     break;
4001                 case ANYOF_UPPER:
4002                     if (LOC)
4003                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4004                     else {
4005                         for (value = 0; value < 256; value++)
4006                             if (isUPPER(value))
4007                                 ANYOF_BITMAP_SET(ret, value);
4008                     }
4009                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4010                     break;
4011                 case ANYOF_NUPPER:
4012                     if (LOC)
4013                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4014                     else {
4015                         for (value = 0; value < 256; value++)
4016                             if (!isUPPER(value))
4017                                 ANYOF_BITMAP_SET(ret, value);
4018                     }
4019                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4020                     break;
4021                 case ANYOF_XDIGIT:
4022                     if (LOC)
4023                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4024                     else {
4025                         for (value = 0; value < 256; value++)
4026                             if (isXDIGIT(value))
4027                                 ANYOF_BITMAP_SET(ret, value);
4028                     }
4029                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4030                     break;
4031                 case ANYOF_NXDIGIT:
4032                     if (LOC)
4033                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4034                     else {
4035                         for (value = 0; value < 256; value++)
4036                             if (!isXDIGIT(value))
4037                                 ANYOF_BITMAP_SET(ret, value);
4038                     }
4039                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4040                     break;
4041                 default:
4042                     vFAIL("Invalid [::] class");
4043                     break;
4044                 }
4045                 if (LOC)
4046                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4047                 continue;
4048             }
4049         } /* end of namedclass \blah */
4050
4051         if (range) {
4052             if (prevvalue > (IV)value) /* b-a */ {
4053                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4054                               RExC_parse - rangebegin,
4055                               RExC_parse - rangebegin,
4056                               rangebegin);
4057                 range = 0; /* not a valid range */
4058             }
4059         }
4060         else {
4061             prevvalue = value; /* save the beginning of the range */
4062             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4063                 RExC_parse[1] != ']') {
4064                 RExC_parse++;
4065
4066                 /* a bad range like \w-, [:word:]- ? */
4067                 if (namedclass > OOB_NAMEDCLASS) {
4068                     if (ckWARN(WARN_REGEXP))
4069                         vWARN4(RExC_parse,
4070                                "False [] range \"%*.*s\"",
4071                                RExC_parse - rangebegin,
4072                                RExC_parse - rangebegin,
4073                                rangebegin);
4074                     if (!SIZE_ONLY)
4075                         ANYOF_BITMAP_SET(ret, '-');
4076                 } else
4077                     range = 1;  /* yeah, it's a range! */
4078                 continue;       /* but do it the next time */
4079             }
4080         }
4081
4082         /* now is the next time */
4083         if (!SIZE_ONLY) {
4084             IV i;
4085
4086             if (prevvalue < 256) {
4087                 IV ceilvalue = value < 256 ? value : 255;
4088
4089 #ifdef EBCDIC
4090                 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4091                     (isUPPER(prevvalue) && isUPPER(ceilvalue)))
4092                 {
4093                     if (isLOWER(prevvalue)) {
4094                         for (i = prevvalue; i <= ceilvalue; i++)
4095                             if (isLOWER(i))
4096                                 ANYOF_BITMAP_SET(ret, i);
4097                     } else {
4098                         for (i = prevvalue; i <= ceilvalue; i++)
4099                             if (isUPPER(i))
4100                                 ANYOF_BITMAP_SET(ret, i);
4101                     }
4102                 }
4103                 else
4104 #endif
4105                       for (i = prevvalue; i <= ceilvalue; i++)
4106                           ANYOF_BITMAP_SET(ret, i);
4107           }
4108           if (value > 255 || UTF) {
4109                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4110                 UV natvalue      = NATIVE_TO_UNI(value);
4111
4112                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4113                 if (prevnatvalue < natvalue) { /* what about > ? */
4114                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4115                                    prevnatvalue, natvalue);
4116                 }
4117                 else if (prevnatvalue == natvalue) {
4118                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4119                     if (FOLD) {
4120                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4121                          STRLEN foldlen;
4122                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4123
4124                          /* If folding and foldable and a single
4125                           * character, insert also the folded version
4126                           * to the charclass. */
4127                          if (f != value) {
4128                               if (foldlen == (STRLEN)UNISKIP(f))
4129                                   Perl_sv_catpvf(aTHX_ listsv,
4130                                                  "%04"UVxf"\n", f);
4131                               else {
4132                                   /* Any multicharacter foldings
4133                                    * require the following transform:
4134                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4135                                    * where E folds into "pq" and F folds
4136                                    * into "rst", all other characters
4137                                    * fold to single characters.  We save
4138                                    * away these multicharacter foldings,
4139                                    * to be later saved as part of the
4140                                    * additional "s" data. */
4141                                   SV *sv;
4142
4143                                   if (!unicode_alternate)
4144                                       unicode_alternate = newAV();
4145                                   sv = newSVpvn((char*)foldbuf, foldlen);
4146                                   SvUTF8_on(sv);
4147                                   av_push(unicode_alternate, sv);
4148                               }
4149                          }
4150
4151                          /* If folding and the value is one of the Greek
4152                           * sigmas insert a few more sigmas to make the
4153                           * folding rules of the sigmas to work right.
4154                           * Note that not all the possible combinations
4155                           * are handled here: some of them are handled
4156                           * by the standard folding rules, and some of
4157                           * them (literal or EXACTF cases) are handled
4158                           * during runtime in regexec.c:S_find_byclass(). */
4159                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4160                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4161                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4162                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4163                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4164                          }
4165                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4166                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4167                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4168                     }
4169                 }
4170             }
4171         }
4172
4173         range = 0; /* this range (if it was one) is done now */
4174     }
4175
4176     if (need_class) {
4177         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4178         if (SIZE_ONLY)
4179             RExC_size += ANYOF_CLASS_ADD_SKIP;
4180         else
4181             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4182     }
4183
4184     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4185     if (!SIZE_ONLY &&
4186          /* If the only flag is folding (plus possibly inversion). */
4187         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4188        ) {
4189         for (value = 0; value < 256; ++value) {
4190             if (ANYOF_BITMAP_TEST(ret, value)) {
4191                 UV fold = PL_fold[value];
4192
4193                 if (fold != value)
4194                     ANYOF_BITMAP_SET(ret, fold);
4195             }
4196         }
4197         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4198     }
4199
4200     /* optimize inverted simple patterns (e.g. [^a-z]) */
4201     if (!SIZE_ONLY && optimize_invert &&
4202         /* If the only flag is inversion. */
4203         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4204         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4205             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4206         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4207     }
4208
4209     if (!SIZE_ONLY) {
4210         AV *av = newAV();
4211         SV *rv;
4212
4213         /* The 0th element stores the character class description
4214          * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4215          * to initialize the appropriate swash (which gets stored in
4216          * the 1st element), and also useful for dumping the regnode.
4217          * The 2nd element stores the multicharacter foldings,
4218          * used later (regexec.c:s_reginclasslen()). */
4219         av_store(av, 0, listsv);
4220         av_store(av, 1, NULL);
4221         av_store(av, 2, (SV*)unicode_alternate);
4222         rv = newRV_noinc((SV*)av);
4223         n = add_data(pRExC_state, 1, "s");
4224         RExC_rx->data->data[n] = (void*)rv;
4225         ARG_SET(ret, n);
4226     }
4227
4228     return ret;
4229 }
4230
4231 STATIC char*
4232 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4233 {
4234     char* retval = RExC_parse++;
4235
4236     for (;;) {
4237         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4238                 RExC_parse[2] == '#') {
4239             while (*RExC_parse && *RExC_parse != ')')
4240                 RExC_parse++;
4241             RExC_parse++;
4242             continue;
4243         }
4244         if (RExC_flags & PMf_EXTENDED) {
4245             if (isSPACE(*RExC_parse)) {
4246                 RExC_parse++;
4247                 continue;
4248             }
4249             else if (*RExC_parse == '#') {
4250                 while (*RExC_parse && *RExC_parse != '\n')
4251                     RExC_parse++;
4252                 RExC_parse++;
4253                 continue;
4254             }
4255         }
4256         return retval;
4257     }
4258 }
4259
4260 /*
4261 - reg_node - emit a node
4262 */
4263 STATIC regnode *                        /* Location. */
4264 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4265 {
4266     register regnode *ret;
4267     register regnode *ptr;
4268
4269     ret = RExC_emit;
4270     if (SIZE_ONLY) {
4271         SIZE_ALIGN(RExC_size);
4272         RExC_size += 1;
4273         return(ret);
4274     }
4275
4276     NODE_ALIGN_FILL(ret);
4277     ptr = ret;
4278     FILL_ADVANCE_NODE(ptr, op);
4279     if (RExC_offsets) {         /* MJD */
4280       MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4281               "reg_node", __LINE__, 
4282               reg_name[op],
4283               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4284               ? "Overwriting end of array!\n" : "OK",
4285               RExC_emit - RExC_emit_start,
4286               RExC_parse - RExC_start,
4287               RExC_offsets[0])); 
4288       Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4289     }
4290             
4291     RExC_emit = ptr;
4292
4293     return(ret);
4294 }
4295
4296 /*
4297 - reganode - emit a node with an argument
4298 */
4299 STATIC regnode *                        /* Location. */
4300 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4301 {
4302     register regnode *ret;
4303     register regnode *ptr;
4304
4305     ret = RExC_emit;
4306     if (SIZE_ONLY) {
4307         SIZE_ALIGN(RExC_size);
4308         RExC_size += 2;
4309         return(ret);
4310     }
4311
4312     NODE_ALIGN_FILL(ret);
4313     ptr = ret;
4314     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4315     if (RExC_offsets) {         /* MJD */
4316       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4317               "reganode",
4318               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4319               "Overwriting end of array!\n" : "OK",
4320               RExC_emit - RExC_emit_start,
4321               RExC_parse - RExC_start,
4322               RExC_offsets[0])); 
4323       Set_Cur_Node_Offset;
4324     }
4325             
4326     RExC_emit = ptr;
4327
4328     return(ret);
4329 }
4330
4331 /*
4332 - reguni - emit (if appropriate) a Unicode character
4333 */
4334 STATIC void
4335 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4336 {
4337     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4338 }
4339
4340 /*
4341 - reginsert - insert an operator in front of already-emitted operand
4342 *
4343 * Means relocating the operand.
4344 */
4345 STATIC void
4346 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4347 {
4348     register regnode *src;
4349     register regnode *dst;
4350     register regnode *place;
4351     register int offset = regarglen[(U8)op];
4352
4353 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4354
4355     if (SIZE_ONLY) {
4356         RExC_size += NODE_STEP_REGNODE + offset;
4357         return;
4358     }
4359
4360     src = RExC_emit;
4361     RExC_emit += NODE_STEP_REGNODE + offset;
4362     dst = RExC_emit;
4363     while (src > opnd) {
4364         StructCopy(--src, --dst, regnode);
4365         if (RExC_offsets) {     /* MJD 20010112 */
4366           MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
4367                   "reg_insert",
4368                   dst - RExC_emit_start > RExC_offsets[0] 
4369                   ? "Overwriting end of array!\n" : "OK",
4370                   src - RExC_emit_start,
4371                   dst - RExC_emit_start,
4372                   RExC_offsets[0])); 
4373           Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4374           Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4375         }
4376     }
4377     
4378
4379     place = opnd;               /* Op node, where operand used to be. */
4380     if (RExC_offsets) {         /* MJD */
4381       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4382               "reginsert",
4383               place - RExC_emit_start > RExC_offsets[0] 
4384               ? "Overwriting end of array!\n" : "OK",
4385               place - RExC_emit_start,
4386               RExC_parse - RExC_start,
4387               RExC_offsets[0])); 
4388       Set_Node_Offset(place, RExC_parse);
4389     }
4390     src = NEXTOPER(place);
4391     FILL_ADVANCE_NODE(place, op);
4392     Zero(src, offset, regnode);
4393 }
4394
4395 /*
4396 - regtail - set the next-pointer at the end of a node chain of p to val.
4397 */
4398 STATIC void
4399 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4400 {
4401     register regnode *scan;
4402     register regnode *temp;
4403
4404     if (SIZE_ONLY)
4405         return;
4406
4407     /* Find last node. */
4408     scan = p;
4409     for (;;) {
4410         temp = regnext(scan);
4411         if (temp == NULL)
4412             break;
4413         scan = temp;
4414     }
4415
4416     if (reg_off_by_arg[OP(scan)]) {
4417         ARG_SET(scan, val - scan);
4418     }
4419     else {
4420         NEXT_OFF(scan) = val - scan;
4421     }
4422 }
4423
4424 /*
4425 - regoptail - regtail on operand of first argument; nop if operandless
4426 */
4427 STATIC void
4428 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4429 {
4430     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4431     if (p == NULL || SIZE_ONLY)
4432         return;
4433     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4434         regtail(pRExC_state, NEXTOPER(p), val);
4435     }
4436     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4437         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4438     }
4439     else
4440         return;
4441 }
4442
4443 /*
4444  - regcurly - a little FSA that accepts {\d+,?\d*}
4445  */
4446 STATIC I32
4447 S_regcurly(pTHX_ register char *s)
4448 {
4449     if (*s++ != '{')
4450         return FALSE;
4451     if (!isDIGIT(*s))
4452         return FALSE;
4453     while (isDIGIT(*s))
4454         s++;
4455     if (*s == ',')
4456         s++;
4457     while (isDIGIT(*s))
4458         s++;
4459     if (*s != '}')
4460         return FALSE;
4461     return TRUE;
4462 }
4463
4464
4465 #ifdef DEBUGGING
4466
4467 STATIC regnode *
4468 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4469 {
4470     register U8 op = EXACT;     /* Arbitrary non-END op. */
4471     register regnode *next;
4472
4473     while (op != END && (!last || node < last)) {
4474         /* While that wasn't END last time... */
4475
4476         NODE_ALIGN(node);
4477         op = OP(node);
4478         if (op == CLOSE)
4479             l--;        
4480         next = regnext(node);
4481         /* Where, what. */
4482         if (OP(node) == OPTIMIZED)
4483             goto after_print;
4484         regprop(sv, node);
4485         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4486                       (int)(2*l + 1), "", SvPVX(sv));
4487         if (next == NULL)               /* Next ptr. */
4488             PerlIO_printf(Perl_debug_log, "(0)");
4489         else
4490             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4491         (void)PerlIO_putc(Perl_debug_log, '\n');
4492       after_print:
4493         if (PL_regkind[(U8)op] == BRANCHJ) {
4494             register regnode *nnode = (OP(next) == LONGJMP
4495                                        ? regnext(next)
4496                                        : next);
4497             if (last && nnode > last)
4498                 nnode = last;
4499             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4500         }
4501         else if (PL_regkind[(U8)op] == BRANCH) {
4502             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4503         }
4504         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4505             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4506                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4507         }
4508         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4509             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4510                              next, sv, l + 1);
4511         }
4512         else if ( op == PLUS || op == STAR) {
4513             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4514         }
4515         else if (op == ANYOF) {
4516             /* arglen 1 + class block */
4517             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4518                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4519             node = NEXTOPER(node);
4520         }
4521         else if (PL_regkind[(U8)op] == EXACT) {
4522             /* Literal string, where present. */
4523             node += NODE_SZ_STR(node) - 1;
4524             node = NEXTOPER(node);
4525         }
4526         else {
4527             node = NEXTOPER(node);
4528             node += regarglen[(U8)op];
4529         }
4530         if (op == CURLYX || op == OPEN)
4531             l++;
4532         else if (op == WHILEM)
4533             l--;
4534     }
4535     return node;
4536 }
4537
4538 #endif  /* DEBUGGING */
4539
4540 /*
4541  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4542  */
4543 void
4544 Perl_regdump(pTHX_ regexp *r)
4545 {
4546 #ifdef DEBUGGING
4547     SV *sv = sv_newmortal();
4548
4549     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4550
4551     /* Header fields of interest. */
4552     if (r->anchored_substr)
4553         PerlIO_printf(Perl_debug_log,
4554                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4555                       PL_colors[0],
4556                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4557                       SvPVX(r->anchored_substr),
4558                       PL_colors[1],
4559                       SvTAIL(r->anchored_substr) ? "$" : "",
4560                       (IV)r->anchored_offset);
4561     else if (r->anchored_utf8)
4562         PerlIO_printf(Perl_debug_log,
4563                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4564                       PL_colors[0],
4565                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4566                       SvPVX(r->anchored_utf8),
4567                       PL_colors[1],
4568                       SvTAIL(r->anchored_utf8) ? "$" : "",
4569                       (IV)r->anchored_offset);
4570     if (r->float_substr)
4571         PerlIO_printf(Perl_debug_log,
4572                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4573                       PL_colors[0],
4574                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4575                       SvPVX(r->float_substr),
4576                       PL_colors[1],
4577                       SvTAIL(r->float_substr) ? "$" : "",
4578                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4579     else if (r->float_utf8)
4580         PerlIO_printf(Perl_debug_log,
4581                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4582                       PL_colors[0],
4583                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4584                       SvPVX(r->float_utf8),
4585                       PL_colors[1],
4586                       SvTAIL(r->float_utf8) ? "$" : "",
4587                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4588     if (r->check_substr || r->check_utf8)
4589         PerlIO_printf(Perl_debug_log,
4590                       r->check_substr == r->float_substr
4591                       && r->check_utf8 == r->float_utf8
4592                       ? "(checking floating" : "(checking anchored");
4593     if (r->reganch & ROPT_NOSCAN)
4594         PerlIO_printf(Perl_debug_log, " noscan");
4595     if (r->reganch & ROPT_CHECK_ALL)
4596         PerlIO_printf(Perl_debug_log, " isall");
4597     if (r->check_substr || r->check_utf8)
4598         PerlIO_printf(Perl_debug_log, ") ");
4599
4600     if (r->regstclass) {
4601         regprop(sv, r->regstclass);
4602         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4603     }
4604     if (r->reganch & ROPT_ANCH) {
4605         PerlIO_printf(Perl_debug_log, "anchored");
4606         if (r->reganch & ROPT_ANCH_BOL)
4607             PerlIO_printf(Perl_debug_log, "(BOL)");
4608         if (r->reganch & ROPT_ANCH_MBOL)
4609             PerlIO_printf(Perl_debug_log, "(MBOL)");
4610         if (r->reganch & ROPT_ANCH_SBOL)
4611             PerlIO_printf(Perl_debug_log, "(SBOL)");
4612         if (r->reganch & ROPT_ANCH_GPOS)
4613             PerlIO_printf(Perl_debug_log, "(GPOS)");
4614         PerlIO_putc(Perl_debug_log, ' ');
4615     }
4616     if (r->reganch & ROPT_GPOS_SEEN)
4617         PerlIO_printf(Perl_debug_log, "GPOS ");
4618     if (r->reganch & ROPT_SKIP)
4619         PerlIO_printf(Perl_debug_log, "plus ");
4620     if (r->reganch & ROPT_IMPLICIT)
4621         PerlIO_printf(Perl_debug_log, "implicit ");
4622     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4623     if (r->reganch & ROPT_EVAL_SEEN)
4624         PerlIO_printf(Perl_debug_log, "with eval ");
4625     PerlIO_printf(Perl_debug_log, "\n");
4626     if (r->offsets) {
4627       U32 i;
4628       U32 len = r->offsets[0];
4629       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4630       for (i = 1; i <= len; i++)
4631         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4632                       (UV)r->offsets[i*2-1], 
4633                       (UV)r->offsets[i*2]);
4634       PerlIO_printf(Perl_debug_log, "\n");
4635     }
4636 #endif  /* DEBUGGING */
4637 }
4638
4639 #ifdef DEBUGGING
4640
4641 STATIC void
4642 S_put_byte(pTHX_ SV *sv, int c)
4643 {
4644     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4645         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4646     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4647         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4648     else
4649         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4650 }
4651
4652 #endif  /* DEBUGGING */
4653
4654 /*
4655 - regprop - printable representation of opcode
4656 */
4657 void
4658 Perl_regprop(pTHX_ SV *sv, regnode *o)
4659 {
4660 #ifdef DEBUGGING
4661     register int k;
4662
4663     sv_setpvn(sv, "", 0);
4664     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4665         /* It would be nice to FAIL() here, but this may be called from
4666            regexec.c, and it would be hard to supply pRExC_state. */
4667         Perl_croak(aTHX_ "Corrupted regexp opcode");
4668     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4669
4670     k = PL_regkind[(U8)OP(o)];
4671
4672     if (k == EXACT) {
4673         SV *dsv = sv_2mortal(newSVpvn("", 0));
4674         /* Using is_utf8_string() is a crude hack but it may
4675          * be the best for now since we have no flag "this EXACTish
4676          * node was UTF-8" --jhi */
4677         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4678         char *s    = do_utf8 ?
4679           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4680                          UNI_DISPLAY_REGEX) :
4681           STRING(o);
4682         int len = do_utf8 ?
4683           strlen(s) :
4684           STR_LEN(o);
4685         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4686                        PL_colors[0],
4687                        len, s,
4688                        PL_colors[1]);
4689     }
4690     else if (k == CURLY) {
4691         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4692             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4693         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4694     }
4695     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4696         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4697     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4698         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4699     else if (k == LOGICAL)
4700         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4701     else if (k == ANYOF) {
4702         int i, rangestart = -1;
4703         U8 flags = ANYOF_FLAGS(o);
4704         const char * const anyofs[] = { /* Should be syncronized with
4705                                          * ANYOF_ #xdefines in regcomp.h */
4706             "\\w",
4707             "\\W",
4708             "\\s",
4709             "\\S",
4710             "\\d",
4711             "\\D",
4712             "[:alnum:]",
4713             "[:^alnum:]",
4714             "[:alpha:]",
4715             "[:^alpha:]",
4716             "[:ascii:]",
4717             "[:^ascii:]",
4718             "[:ctrl:]",
4719             "[:^ctrl:]",
4720             "[:graph:]",
4721             "[:^graph:]",
4722             "[:lower:]",
4723             "[:^lower:]",
4724             "[:print:]",
4725             "[:^print:]",
4726             "[:punct:]",
4727             "[:^punct:]",
4728             "[:upper:]",
4729             "[:^upper:]",
4730             "[:xdigit:]",
4731             "[:^xdigit:]",
4732             "[:space:]",
4733             "[:^space:]",
4734             "[:blank:]",
4735             "[:^blank:]"
4736         };
4737
4738         if (flags & ANYOF_LOCALE)
4739             sv_catpv(sv, "{loc}");
4740         if (flags & ANYOF_FOLD)
4741             sv_catpv(sv, "{i}");
4742         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4743         if (flags & ANYOF_INVERT)
4744             sv_catpv(sv, "^");
4745         for (i = 0; i <= 256; i++) {
4746             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4747                 if (rangestart == -1)
4748                     rangestart = i;
4749             } else if (rangestart != -1) {
4750                 if (i <= rangestart + 3)
4751                     for (; rangestart < i; rangestart++)
4752                         put_byte(sv, rangestart);
4753                 else {
4754                     put_byte(sv, rangestart);
4755                     sv_catpv(sv, "-");
4756                     put_byte(sv, i - 1);
4757                 }
4758                 rangestart = -1;
4759             }
4760         }
4761
4762         if (o->flags & ANYOF_CLASS)
4763             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4764                 if (ANYOF_CLASS_TEST(o,i))
4765                     sv_catpv(sv, anyofs[i]);
4766
4767         if (flags & ANYOF_UNICODE)
4768             sv_catpv(sv, "{unicode}");
4769         else if (flags & ANYOF_UNICODE_ALL)
4770             sv_catpv(sv, "{unicode_all}");
4771
4772         {
4773             SV *lv;
4774             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4775         
4776             if (lv) {
4777                 if (sw) {
4778                     U8 s[UTF8_MAXLEN+1];
4779                 
4780                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4781                         U8 *e = uvchr_to_utf8(s, i);
4782                         
4783                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4784                             if (rangestart == -1)
4785                                 rangestart = i;
4786                         } else if (rangestart != -1) {
4787                             U8 *p;
4788                         
4789                             if (i <= rangestart + 3)
4790                                 for (; rangestart < i; rangestart++) {
4791                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4792                                         put_byte(sv, *p);
4793                                 }
4794                             else {
4795                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4796                                     put_byte(sv, *p);
4797                                 sv_catpv(sv, "-");
4798                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4799                                         put_byte(sv, *p);
4800                                 }
4801                                 rangestart = -1;
4802                             }
4803                         }
4804                         
4805                     sv_catpv(sv, "..."); /* et cetera */
4806                 }
4807
4808                 {
4809                     char *s = savepv(SvPVX(lv));
4810                     char *origs = s;
4811                 
4812                     while(*s && *s != '\n') s++;
4813                 
4814                     if (*s == '\n') {
4815                         char *t = ++s;
4816                         
4817                         while (*s) {
4818                             if (*s == '\n')
4819                                 *s = ' ';
4820                             s++;
4821                         }
4822                         if (s[-1] == ' ')
4823                             s[-1] = 0;
4824                         
4825                         sv_catpv(sv, t);
4826                     }
4827                 
4828                     Safefree(origs);
4829                 }
4830             }
4831         }
4832
4833         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4834     }
4835     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4836         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4837 #endif  /* DEBUGGING */
4838 }
4839
4840 SV *
4841 Perl_re_intuit_string(pTHX_ regexp *prog)
4842 {                               /* Assume that RE_INTUIT is set */
4843     DEBUG_r(
4844         {   STRLEN n_a;
4845             char *s = SvPV(prog->check_substr
4846                       ? prog->check_substr : prog->check_utf8, n_a);
4847
4848             if (!PL_colorset) reginitcolors();
4849             PerlIO_printf(Perl_debug_log,
4850                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4851                       PL_colors[4],
4852                       prog->check_substr ? "" : "utf8 ",
4853                       PL_colors[5],PL_colors[0],
4854                       s,
4855                       PL_colors[1],
4856                       (strlen(s) > 60 ? "..." : ""));
4857         } );
4858
4859     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4860 }
4861
4862 void
4863 Perl_pregfree(pTHX_ struct regexp *r)
4864 {
4865 #ifdef DEBUGGING
4866     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4867 #endif
4868
4869     if (!r || (--r->refcnt > 0))
4870         return;
4871     DEBUG_r({
4872          char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
4873                                   UNI_DISPLAY_REGEX);
4874          int len = SvCUR(dsv);
4875          if (!PL_colorset)
4876               reginitcolors();
4877          PerlIO_printf(Perl_debug_log,
4878                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4879                        PL_colors[4],PL_colors[5],PL_colors[0],
4880                        len, len, s,
4881                        PL_colors[1],
4882                        len > 60 ? "..." : "");
4883     });
4884
4885     if (r->precomp)
4886         Safefree(r->precomp);
4887     if (r->offsets)             /* 20010421 MJD */
4888         Safefree(r->offsets);
4889     if (RX_MATCH_COPIED(r))
4890         Safefree(r->subbeg);
4891     if (r->substrs) {
4892         if (r->anchored_substr)
4893             SvREFCNT_dec(r->anchored_substr);
4894         if (r->anchored_utf8)
4895             SvREFCNT_dec(r->anchored_utf8);
4896         if (r->float_substr)
4897             SvREFCNT_dec(r->float_substr);
4898         if (r->float_utf8)
4899             SvREFCNT_dec(r->float_utf8);
4900         Safefree(r->substrs);
4901     }
4902     if (r->data) {
4903         int n = r->data->count;
4904         AV* new_comppad = NULL;
4905         AV* old_comppad;
4906         SV** old_curpad;
4907
4908         while (--n >= 0) {
4909           /* If you add a ->what type here, update the comment in regcomp.h */
4910             switch (r->data->what[n]) {
4911             case 's':
4912                 SvREFCNT_dec((SV*)r->data->data[n]);
4913                 break;
4914             case 'f':
4915                 Safefree(r->data->data[n]);
4916                 break;
4917             case 'p':
4918                 new_comppad = (AV*)r->data->data[n];
4919                 break;
4920             case 'o':
4921                 if (new_comppad == NULL)
4922                     Perl_croak(aTHX_ "panic: pregfree comppad");
4923                 old_comppad = PL_comppad;
4924                 old_curpad = PL_curpad;
4925                 /* Watch out for global destruction's random ordering. */
4926                 if (SvTYPE(new_comppad) == SVt_PVAV) {
4927                     PL_comppad = new_comppad;
4928                     PL_curpad = AvARRAY(new_comppad);
4929                 }
4930                 else
4931                     PL_curpad = NULL;
4932
4933                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4934                     op_free((OP_4tree*)r->data->data[n]);
4935                 }
4936
4937                 PL_comppad = old_comppad;
4938                 PL_curpad = old_curpad;
4939                 SvREFCNT_dec((SV*)new_comppad);
4940                 new_comppad = NULL;
4941                 break;
4942             case 'n':
4943                 break;
4944             default:
4945                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4946             }
4947         }
4948         Safefree(r->data->what);
4949         Safefree(r->data);
4950     }
4951     Safefree(r->startp);
4952     Safefree(r->endp);
4953     Safefree(r);
4954 }
4955
4956 /*
4957  - regnext - dig the "next" pointer out of a node
4958  *
4959  * [Note, when REGALIGN is defined there are two places in regmatch()
4960  * that bypass this code for speed.]
4961  */
4962 regnode *
4963 Perl_regnext(pTHX_ register regnode *p)
4964 {
4965     register I32 offset;
4966
4967     if (p == &PL_regdummy)
4968         return(NULL);
4969
4970     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4971     if (offset == 0)
4972         return(NULL);
4973
4974     return(p+offset);
4975 }
4976
4977 STATIC void     
4978 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4979 {
4980     va_list args;
4981     STRLEN l1 = strlen(pat1);
4982     STRLEN l2 = strlen(pat2);
4983     char buf[512];
4984     SV *msv;
4985     char *message;
4986
4987     if (l1 > 510)
4988         l1 = 510;
4989     if (l1 + l2 > 510)
4990         l2 = 510 - l1;
4991     Copy(pat1, buf, l1 , char);
4992     Copy(pat2, buf + l1, l2 , char);
4993     buf[l1 + l2] = '\n';
4994     buf[l1 + l2 + 1] = '\0';
4995 #ifdef I_STDARG
4996     /* ANSI variant takes additional second argument */
4997     va_start(args, pat2);
4998 #else
4999     va_start(args);
5000 #endif
5001     msv = vmess(buf, &args);
5002     va_end(args);
5003     message = SvPV(msv,l1);
5004     if (l1 > 512)
5005         l1 = 512;
5006     Copy(message, buf, l1 , char);
5007     buf[l1] = '\0';                     /* Overwrite \n */
5008     Perl_croak(aTHX_ "%s", buf);
5009 }
5010
5011 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5012
5013 void
5014 Perl_save_re_context(pTHX)
5015 {
5016 #if 0
5017     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
5018     SAVEI32(RExC_npar);         /* () count. */
5019     SAVEI32(RExC_size);         /* Code size. */
5020     SAVEI32(RExC_flags);                /* are we folding, multilining? */
5021     SAVEVPTR(RExC_rx);          /* from regcomp.c */
5022     SAVEI32(RExC_seen);         /* from regcomp.c */
5023     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
5024     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
5025     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
5026     SAVEPPTR(RExC_end);         /* End of input for compile */
5027     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
5028 #endif
5029
5030     SAVEI32(PL_reg_flags);              /* from regexec.c */
5031     SAVEPPTR(PL_bostr);
5032     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5033     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5034     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5035     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5036     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5037     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5038     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5039     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5040     PL_reg_start_tmp = 0;
5041     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5042     PL_reg_start_tmpl = 0;
5043     SAVEVPTR(PL_regdata);
5044     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5045     SAVEI32(PL_regnarrate);             /* from regexec.c */
5046     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5047     SAVEINT(PL_regindent);              /* from regexec.c */
5048     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5049     SAVEVPTR(PL_curcop);
5050     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5051     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5052     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5053     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5054     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5055     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5056     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5057     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5058     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5059     SAVEI32(PL_regnpar);                /* () count. */
5060     SAVEI32(PL_regsize);                /* from regexec.c */
5061 #ifdef DEBUGGING
5062     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5063 #endif
5064 }
5065
5066 static void
5067 clear_re(pTHX_ void *r)
5068 {
5069     ReREFCNT_dec((regexp *)r);
5070 }
5071