This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Class::Struct, simple patch, tests
[perl5.git] / regcomp.c
1 /*    regcomp.c
2  */
3
4 /*
5  * "A fair jaw-cracker dwarf-language must be."  --Samwise Gamgee
6  */
7
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9  * confused with the original package (see point 3 below).  Thanks, Henry!
10  */
11
12 /* Additional note: this code is very heavily munged from Henry's version
13  * in places.  In some spots I've traded clarity for efficiency, so don't
14  * blame Henry for some of the lack of readability.
15  */
16
17 /* The names of the functions have been changed from regcomp and
18  * regexec to  pregcomp and pregexec in order to avoid conflicts
19  * with the POSIX routines of the same names.
20 */
21
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 #  ifndef PERL_IN_XSUB_RE
25 #    define PERL_IN_XSUB_RE
26 #  endif
27 /* need access to debugger hooks */
28 #  if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
29 #    define DEBUGGING
30 #  endif
31 #endif
32
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 #  define Perl_pregcomp my_regcomp
36 #  define Perl_regdump my_regdump
37 #  define Perl_regprop my_regprop
38 #  define Perl_pregfree my_regfree
39 #  define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 #  define Perl_regnext my_regnext
42 #  define Perl_save_re_context my_save_re_context
43 #  define Perl_reginitcolors my_reginitcolors
44
45 #  define PERL_NO_GET_CONTEXT
46 #endif
47
48 /*SUPPRESS 112*/
49 /*
50  * pregcomp and pregexec -- regsub and regerror are not used in perl
51  *
52  *      Copyright (c) 1986 by University of Toronto.
53  *      Written by Henry Spencer.  Not derived from licensed software.
54  *
55  *      Permission is granted to anyone to use this software for any
56  *      purpose on any computer system, and to redistribute it freely,
57  *      subject to the following restrictions:
58  *
59  *      1. The author is not responsible for the consequences of use of
60  *              this software, no matter how awful, even if they arise
61  *              from defects in it.
62  *
63  *      2. The origin of this software must not be misrepresented, either
64  *              by explicit claim or by omission.
65  *
66  *      3. Altered versions must be plainly marked as such, and must not
67  *              be misrepresented as being the original software.
68  *
69  *
70  ****    Alterations to Henry's code are...
71  ****
72  ****    Copyright (c) 1991-2002, Larry Wall
73  ****
74  ****    You may distribute under the terms of either the GNU General Public
75  ****    License or the Artistic License, as specified in the README file.
76
77  *
78  * Beware that some of this code is subtly aware of the way operator
79  * precedence is structured in regular expressions.  Serious changes in
80  * regular-expression syntax might require a total rethink.
81  */
82 #include "EXTERN.h"
83 #define PERL_IN_REGCOMP_C
84 #include "perl.h"
85
86 #ifndef PERL_IN_XSUB_RE
87 #  include "INTERN.h"
88 #endif
89
90 #define REG_COMP_C
91 #include "regcomp.h"
92
93 #ifdef op
94 #undef op
95 #endif /* op */
96
97 #ifdef MSDOS
98 # if defined(BUGGY_MSC6)
99  /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100  # pragma optimize("a",off)
101  /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102  # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
104 #endif /* MSDOS */
105
106 #ifndef STATIC
107 #define STATIC  static
108 #endif
109
110 typedef struct RExC_state_t {
111     U32         flags;                  /* are we folding, multilining? */
112     char        *precomp;               /* uncompiled string. */
113     regexp      *rx;
114     char        *start;                 /* Start of input for compile */
115     char        *end;                   /* End of input for compile */
116     char        *parse;                 /* Input-scan pointer. */
117     I32         whilem_seen;            /* number of WHILEM in this expr */
118     regnode     *emit_start;            /* Start of emitted-code area */
119     regnode     *emit;                  /* Code-emit pointer; &regdummy = don't = compiling */
120     I32         naughty;                /* How bad is this pattern? */
121     I32         sawback;                /* Did we see \1, ...? */
122     U32         seen;
123     I32         size;                   /* Code size. */
124     I32         npar;                   /* () count. */
125     I32         extralen;
126     I32         seen_zerolen;
127     I32         seen_evals;
128     I32         utf8;
129 #if ADD_TO_REGEXEC
130     char        *starttry;              /* -Dr: where regtry was called. */
131 #define RExC_starttry   (pRExC_state->starttry)
132 #endif
133 } RExC_state_t;
134
135 #define RExC_flags      (pRExC_state->flags)
136 #define RExC_precomp    (pRExC_state->precomp)
137 #define RExC_rx         (pRExC_state->rx)
138 #define RExC_start      (pRExC_state->start)
139 #define RExC_end        (pRExC_state->end)
140 #define RExC_parse      (pRExC_state->parse)
141 #define RExC_whilem_seen        (pRExC_state->whilem_seen)
142 #define RExC_offsets    (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit       (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty    (pRExC_state->naughty)
146 #define RExC_sawback    (pRExC_state->sawback)
147 #define RExC_seen       (pRExC_state->seen)
148 #define RExC_size       (pRExC_state->size)
149 #define RExC_npar       (pRExC_state->npar)
150 #define RExC_extralen   (pRExC_state->extralen)
151 #define RExC_seen_zerolen       (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8       (pRExC_state->utf8)
154
155 #define ISMULT1(c)      ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s)      ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157         ((*s) == '{' && regcurly(s)))
158
159 #ifdef SPSTART
160 #undef SPSTART          /* dratted cpp namespace... */
161 #endif
162 /*
163  * Flags to be passed up and down.
164  */
165 #define WORST           0       /* Worst case. */
166 #define HASWIDTH        0x1     /* Known to match non-null strings. */
167 #define SIMPLE          0x2     /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART         0x4     /* Starts with * or +. */
169 #define TRYAGAIN        0x8     /* Weeded out a declaration. */
170
171 /* Length of a variant. */
172
173 typedef struct scan_data_t {
174     I32 len_min;
175     I32 len_delta;
176     I32 pos_min;
177     I32 pos_delta;
178     SV *last_found;
179     I32 last_end;                       /* min value, <0 unless valid. */
180     I32 last_start_min;
181     I32 last_start_max;
182     SV **longest;                       /* Either &l_fixed, or &l_float. */
183     SV *longest_fixed;
184     I32 offset_fixed;
185     SV *longest_float;
186     I32 offset_float_min;
187     I32 offset_float_max;
188     I32 flags;
189     I32 whilem_c;
190     I32 *last_closep;
191     struct regnode_charclass_class *start_class;
192 } scan_data_t;
193
194 /*
195  * Forward declarations for pregcomp()'s friends.
196  */
197
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199                                       0, 0, 0, 0, 0, 0};
200
201 #define SF_BEFORE_EOL           (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL          0x1
203 #define SF_BEFORE_MEOL          0x2
204 #define SF_FIX_BEFORE_EOL       (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL        (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
207 #ifdef NO_UNARY_PLUS
208 #  define SF_FIX_SHIFT_EOL      (0+2)
209 #  define SF_FL_SHIFT_EOL               (0+4)
210 #else
211 #  define SF_FIX_SHIFT_EOL      (+2)
212 #  define SF_FL_SHIFT_EOL               (+4)
213 #endif
214
215 #define SF_FIX_BEFORE_SEOL      (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL      (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218 #define SF_FL_BEFORE_SEOL       (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL       (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF               0x40
221 #define SF_HAS_PAR              0x80
222 #define SF_IN_PAR               0x100
223 #define SF_HAS_EVAL             0x200
224 #define SCF_DO_SUBSTR           0x400
225 #define SCF_DO_STCLASS_AND      0x0800
226 #define SCF_DO_STCLASS_OR       0x1000
227 #define SCF_DO_STCLASS          (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS  0x2000
229
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
233
234 #define OOB_UNICODE             12345678
235 #define OOB_NAMEDCLASS          -1
236
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
240
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
243
244 /*
245  * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246  * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247  * op/pragma/warn/regcomp.
248  */
249 #define MARKER1 "<-- HERE"    /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE "  /* marker as it appears within the regex */
251
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
253
254 /*
255  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256  * arg. Show regex, up to a maximum length. If it's too long, chop and add
257  * "...".
258  */
259 #define FAIL(msg)                                                             \
260     STMT_START {                                                             \
261         char *ellipses = "";                                                 \
262         IV len = RExC_end - RExC_precomp;                                \
263                                                                              \
264         if (!SIZE_ONLY)                                                      \
265             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
266                                                                              \
267         if (len > RegexLengthToShowInErrorMessages) {                        \
268             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
269             len = RegexLengthToShowInErrorMessages - 10;                     \
270             ellipses = "...";                                                \
271         }                                                                    \
272         Perl_croak(aTHX_ "%s in regex m/%.*s%s/",                            \
273                    msg, (int)len, RExC_precomp, ellipses);                  \
274     } STMT_END
275
276 /*
277  * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278  * args. Show regex, up to a maximum length. If it's too long, chop and add
279  * "...".
280  */
281 #define FAIL2(pat,msg)                                                        \
282     STMT_START {                                                             \
283         char *ellipses = "";                                                 \
284         IV len = RExC_end - RExC_precomp;                                \
285                                                                              \
286         if (!SIZE_ONLY)                                                      \
287             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
288                                                                              \
289         if (len > RegexLengthToShowInErrorMessages) {                        \
290             /* chop 10 shorter than the max, to ensure meaning of "..." */   \
291             len = RegexLengthToShowInErrorMessages - 10;                     \
292             ellipses = "...";                                                \
293         }                                                                    \
294         S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/",                        \
295                     msg, (int)len, RExC_precomp, ellipses);                \
296     } STMT_END
297
298
299 /*
300  * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301  */
302 #define Simple_vFAIL(m)                                                      \
303     STMT_START {                                                             \
304       IV offset = RExC_parse - RExC_precomp; \
305                                                                              \
306       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
307                  m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
308     } STMT_END
309
310 /*
311  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312  */
313 #define vFAIL(m)                                                             \
314     STMT_START {                                                             \
315       if (!SIZE_ONLY)                                                        \
316             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
317       Simple_vFAIL(m);                                                       \
318     } STMT_END
319
320 /*
321  * Like Simple_vFAIL(), but accepts two arguments.
322  */
323 #define Simple_vFAIL2(m,a1)                                                  \
324     STMT_START {                                                             \
325       IV offset = RExC_parse - RExC_precomp; \
326                                                                              \
327       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
328                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
329     } STMT_END
330
331 /*
332  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333  */
334 #define vFAIL2(m,a1)                                                         \
335     STMT_START {                                                             \
336       if (!SIZE_ONLY)                                                        \
337             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
338       Simple_vFAIL2(m, a1);                                                  \
339     } STMT_END
340
341
342 /*
343  * Like Simple_vFAIL(), but accepts three arguments.
344  */
345 #define Simple_vFAIL3(m, a1, a2)                                             \
346     STMT_START {                                                             \
347       IV offset = RExC_parse - RExC_precomp; \
348                                                                              \
349       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
350                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
351     } STMT_END
352
353 /*
354  * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355  */
356 #define vFAIL3(m,a1,a2)                                                      \
357     STMT_START {                                                             \
358       if (!SIZE_ONLY)                                                        \
359             SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
360       Simple_vFAIL3(m, a1, a2);                                              \
361     } STMT_END
362
363 /*
364  * Like Simple_vFAIL(), but accepts four arguments.
365  */
366 #define Simple_vFAIL4(m, a1, a2, a3)                                         \
367     STMT_START {                                                             \
368       IV offset = RExC_parse - RExC_precomp; \
369                                                                              \
370       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
371                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
372     } STMT_END
373
374 /*
375  * Like Simple_vFAIL(), but accepts five arguments.
376  */
377 #define Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
378     STMT_START {                                                             \
379       IV offset = RExC_parse - RExC_precomp; \
380       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
381                   (int)offset, RExC_precomp, RExC_precomp + offset);       \
382     } STMT_END
383
384
385 #define vWARN(loc,m)                                                         \
386     STMT_START {                                                             \
387         IV offset = loc - RExC_precomp;          \
388         Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
389                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
390     } STMT_END                                                               \
391
392 #define vWARNdep(loc,m)                                                         \
393     STMT_START {                                                             \
394         IV offset = loc - RExC_precomp;          \
395         Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
396                  m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
397     } STMT_END                                                               \
398
399
400 #define vWARN2(loc, m, a1)                                                   \
401     STMT_START {                                                             \
402         IV offset = loc - RExC_precomp;          \
403         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
404                  a1,                                                         \
405                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
406     } STMT_END
407
408 #define vWARN3(loc, m, a1, a2)                                               \
409     STMT_START {                                                             \
410       IV offset = loc - RExC_precomp;        \
411         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,                    \
412                  a1, a2,                                                     \
413                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
414     } STMT_END
415
416 #define vWARN4(loc, m, a1, a2, a3)                                           \
417     STMT_START {                                                             \
418       IV offset = loc - RExC_precomp;            \
419         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
420                  a1, a2, a3,                                                 \
421                  (int)offset, RExC_precomp, RExC_precomp + offset);        \
422     } STMT_END
423
424 /* used for the parse_flags section for (?c) -- japhy */
425 #define vWARN5(loc, m, a1, a2, a3, a4)                                       \
426   STMT_START {                                                   \
427       IV offset = loc - RExC_precomp;   \
428         Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,      \
429                  a1, a2, a3, a4,                                 \
430                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
431     } STMT_END
432
433
434 /* Allow for side effects in s */
435 #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
436
437 /* Macros for recording node offsets.   20001227 mjd@plover.com 
438  * Nodes are numbered 1, 2, 3, 4.  Node #n's position is recorded in
439  * element 2*n-1 of the array.  Element #2n holds the byte length node #n.
440  * Element 0 holds the number n.
441  */
442
443 #define MJD_OFFSET_DEBUG(x)
444 /* #define MJD_OFFSET_DEBUG(x) fprintf x */
445
446
447 #  define Set_Node_Offset_To_R(node,byte)                           \
448    STMT_START {                                        \
449      if (! SIZE_ONLY) {                                  \
450        if((node) < 0) {                   \
451          Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
452        } else {                                                        \
453          RExC_offsets[2*(node)-1] = (byte);                               \
454        }                                                               \
455      }                                                                 \
456    } STMT_END
457
458 #  define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
459 #  define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
460
461 #  define Set_Node_Length_To_R(node,len)                            \
462    STMT_START {                                        \
463      if (! SIZE_ONLY) {                                  \
464        MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
465        if((node) < 0) {                   \
466          Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
467        } else {                                                        \
468          RExC_offsets[2*(node)] = (len);                               \
469        }                                                               \
470      }                                                                 \
471    } STMT_END
472
473 #  define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
474 #  define Set_Cur_Node_Length(len)  Set_Node_Length(RExC_emit, len)
475 #  define Set_Node_Cur_Length(node)   Set_Node_Length(node, RExC_parse - parse_start)
476
477 /* Get offsets and lengths */
478 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
479 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
480
481 static void clear_re(pTHX_ void *r);
482
483 /* Mark that we cannot extend a found fixed substring at this point.
484    Updata the longest found anchored substring and the longest found
485    floating substrings if needed. */
486
487 STATIC void
488 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
489 {
490     STRLEN l = CHR_SVLEN(data->last_found);
491     STRLEN old_l = CHR_SVLEN(*data->longest);
492
493     if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
494         sv_setsv(*data->longest, data->last_found);
495         if (*data->longest == data->longest_fixed) {
496             data->offset_fixed = l ? data->last_start_min : data->pos_min;
497             if (data->flags & SF_BEFORE_EOL)
498                 data->flags
499                     |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
500             else
501                 data->flags &= ~SF_FIX_BEFORE_EOL;
502         }
503         else {
504             data->offset_float_min = l ? data->last_start_min : data->pos_min;
505             data->offset_float_max = (l
506                                       ? data->last_start_max
507                                       : data->pos_min + data->pos_delta);
508             if ((U32)data->offset_float_max > (U32)I32_MAX)
509                 data->offset_float_max = I32_MAX;
510             if (data->flags & SF_BEFORE_EOL)
511                 data->flags
512                     |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
513             else
514                 data->flags &= ~SF_FL_BEFORE_EOL;
515         }
516     }
517     SvCUR_set(data->last_found, 0);
518     data->last_end = -1;
519     data->flags &= ~SF_BEFORE_EOL;
520 }
521
522 /* Can match anything (initialization) */
523 STATIC void
524 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
525 {
526     ANYOF_CLASS_ZERO(cl);
527     ANYOF_BITMAP_SETALL(cl);
528     cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
529     if (LOC)
530         cl->flags |= ANYOF_LOCALE;
531 }
532
533 /* Can match anything (initialization) */
534 STATIC int
535 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
536 {
537     int value;
538
539     for (value = 0; value <= ANYOF_MAX; value += 2)
540         if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
541             return 1;
542     if (!(cl->flags & ANYOF_UNICODE_ALL))
543         return 0;
544     if (!ANYOF_BITMAP_TESTALLSET(cl))
545         return 0;
546     return 1;
547 }
548
549 /* Can match anything (initialization) */
550 STATIC void
551 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
552 {
553     Zero(cl, 1, struct regnode_charclass_class);
554     cl->type = ANYOF;
555     cl_anything(pRExC_state, cl);
556 }
557
558 STATIC void
559 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
560 {
561     Zero(cl, 1, struct regnode_charclass_class);
562     cl->type = ANYOF;
563     cl_anything(pRExC_state, cl);
564     if (LOC)
565         cl->flags |= ANYOF_LOCALE;
566 }
567
568 /* 'And' a given class with another one.  Can create false positives */
569 /* We assume that cl is not inverted */
570 STATIC void
571 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
572          struct regnode_charclass_class *and_with)
573 {
574     if (!(and_with->flags & ANYOF_CLASS)
575         && !(cl->flags & ANYOF_CLASS)
576         && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
577         && !(and_with->flags & ANYOF_FOLD)
578         && !(cl->flags & ANYOF_FOLD)) {
579         int i;
580
581         if (and_with->flags & ANYOF_INVERT)
582             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
583                 cl->bitmap[i] &= ~and_with->bitmap[i];
584         else
585             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
586                 cl->bitmap[i] &= and_with->bitmap[i];
587     } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
588     if (!(and_with->flags & ANYOF_EOS))
589         cl->flags &= ~ANYOF_EOS;
590
591     if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
592         cl->flags &= ~ANYOF_UNICODE_ALL;
593         cl->flags |= ANYOF_UNICODE;
594         ARG_SET(cl, ARG(and_with));
595     }
596     if (!(and_with->flags & ANYOF_UNICODE_ALL))
597         cl->flags &= ~ANYOF_UNICODE_ALL;
598     if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
599         cl->flags &= ~ANYOF_UNICODE;
600 }
601
602 /* 'OR' a given class with another one.  Can create false positives */
603 /* We assume that cl is not inverted */
604 STATIC void
605 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
606 {
607     if (or_with->flags & ANYOF_INVERT) {
608         /* We do not use
609          * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
610          *   <= (B1 | !B2) | (CL1 | !CL2)
611          * which is wasteful if CL2 is small, but we ignore CL2:
612          *   (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
613          * XXXX Can we handle case-fold?  Unclear:
614          *   (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
615          *   (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
616          */
617         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618              && !(or_with->flags & ANYOF_FOLD)
619              && !(cl->flags & ANYOF_FOLD) ) {
620             int i;
621
622             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
623                 cl->bitmap[i] |= ~or_with->bitmap[i];
624         } /* XXXX: logic is complicated otherwise */
625         else {
626             cl_anything(pRExC_state, cl);
627         }
628     } else {
629         /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
630         if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
631              && (!(or_with->flags & ANYOF_FOLD)
632                  || (cl->flags & ANYOF_FOLD)) ) {
633             int i;
634
635             /* OR char bitmap and class bitmap separately */
636             for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
637                 cl->bitmap[i] |= or_with->bitmap[i];
638             if (or_with->flags & ANYOF_CLASS) {
639                 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
640                     cl->classflags[i] |= or_with->classflags[i];
641                 cl->flags |= ANYOF_CLASS;
642             }
643         }
644         else { /* XXXX: logic is complicated, leave it along for a moment. */
645             cl_anything(pRExC_state, cl);
646         }
647     }
648     if (or_with->flags & ANYOF_EOS)
649         cl->flags |= ANYOF_EOS;
650
651     if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
652         ARG(cl) != ARG(or_with)) {
653         cl->flags |= ANYOF_UNICODE_ALL;
654         cl->flags &= ~ANYOF_UNICODE;
655     }
656     if (or_with->flags & ANYOF_UNICODE_ALL) {
657         cl->flags |= ANYOF_UNICODE_ALL;
658         cl->flags &= ~ANYOF_UNICODE;
659     }
660 }
661
662 /*
663  * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
664  * These need to be revisited when a newer toolchain becomes available.
665  */
666 #if defined(__sparc64__) && defined(__GNUC__)
667 #   if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
668 #       undef  SPARC64_GCC_WORKAROUND
669 #       define SPARC64_GCC_WORKAROUND 1
670 #   endif
671 #endif
672
673 /* REx optimizer.  Converts nodes into quickier variants "in place".
674    Finds fixed substrings.  */
675
676 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
677    to the position after last scanned or to NULL. */
678
679 STATIC I32
680 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
681                         /* scanp: Start here (read-write). */
682                         /* deltap: Write maxlen-minlen here. */
683                         /* last: Stop before this one. */
684 {
685     I32 min = 0, pars = 0, code;
686     regnode *scan = *scanp, *next;
687     I32 delta = 0;
688     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
689     int is_inf_internal = 0;            /* The studied chunk is infinite */
690     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
691     scan_data_t data_fake;
692     struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
693
694     while (scan && OP(scan) != END && scan < last) {
695         /* Peephole optimizer: */
696
697         if (PL_regkind[(U8)OP(scan)] == EXACT) {
698             /* Merge several consecutive EXACTish nodes into one. */
699             regnode *n = regnext(scan);
700             U32 stringok = 1;
701 #ifdef DEBUGGING
702             regnode *stop = scan;
703 #endif
704
705             next = scan + NODE_SZ_STR(scan);
706             /* Skip NOTHING, merge EXACT*. */
707             while (n &&
708                    ( PL_regkind[(U8)OP(n)] == NOTHING ||
709                      (stringok && (OP(n) == OP(scan))))
710                    && NEXT_OFF(n)
711                    && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
712                 if (OP(n) == TAIL || n > next)
713                     stringok = 0;
714                 if (PL_regkind[(U8)OP(n)] == NOTHING) {
715                     NEXT_OFF(scan) += NEXT_OFF(n);
716                     next = n + NODE_STEP_REGNODE;
717 #ifdef DEBUGGING
718                     if (stringok)
719                         stop = n;
720 #endif
721                     n = regnext(n);
722                 }
723                 else if (stringok) {
724                     int oldl = STR_LEN(scan);
725                     regnode *nnext = regnext(n);
726
727                     if (oldl + STR_LEN(n) > U8_MAX)
728                         break;
729                     NEXT_OFF(scan) += NEXT_OFF(n);
730                     STR_LEN(scan) += STR_LEN(n);
731                     next = n + NODE_SZ_STR(n);
732                     /* Now we can overwrite *n : */
733                     Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
734 #ifdef DEBUGGING
735                     stop = next - 1;
736 #endif
737                     n = nnext;
738                 }
739             }
740
741             if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
742 /*
743   Two problematic code points in Unicode casefolding of EXACT nodes:
744
745    U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
746    U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
747
748    which casefold to
749
750    Unicode                      UTF-8
751
752    U+03B9 U+0308 U+0301         0xCE 0xB9 0xCC 0x88 0xCC 0x81
753    U+03C5 U+0308 U+0301         0xCF 0x85 0xCC 0x88 0xCC 0x81
754
755    This means that in case-insensitive matching (or "loose matching",
756    as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
757    length of the above casefolded versions) can match a target string
758    of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
759    This would rather mess up the minimum length computation.
760
761    What we'll do is to look for the tail four bytes, and then peek
762    at the preceding two bytes to see whether we need to decrease
763    the minimum length by four (six minus two).
764
765    Thanks to the design of UTF-8, there cannot be false matches:
766    A sequence of valid UTF-8 bytes cannot be a subsequence of
767    another valid sequence of UTF-8 bytes.
768
769 */
770                  char *s0 = STRING(scan), *s, *t;
771                  char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
772                  char *t0 = "\xcc\x88\xcc\x81";
773                  char *t1 = t0 + 3;
774                  
775                  for (s = s0 + 2;
776                       s < s2 && (t = ninstr(s, s1, t0, t1));
777                       s = t + 4) {
778                       if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
779                           ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
780                            min -= 4;
781                  }
782             }
783
784 #ifdef DEBUGGING
785             /* Allow dumping */
786             n = scan + NODE_SZ_STR(scan);
787             while (n <= stop) {
788                 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
789                     OP(n) = OPTIMIZED;
790                     NEXT_OFF(n) = 0;
791                 }
792                 n++;
793             }
794 #endif
795         }
796         /* Follow the next-chain of the current node and optimize
797            away all the NOTHINGs from it.  */
798         if (OP(scan) != CURLYX) {
799             int max = (reg_off_by_arg[OP(scan)]
800                        ? I32_MAX
801                        /* I32 may be smaller than U16 on CRAYs! */
802                        : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
803             int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
804             int noff;
805             regnode *n = scan;
806         
807             /* Skip NOTHING and LONGJMP. */
808             while ((n = regnext(n))
809                    && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
810                        || ((OP(n) == LONGJMP) && (noff = ARG(n))))
811                    && off + noff < max)
812                 off += noff;
813             if (reg_off_by_arg[OP(scan)])
814                 ARG(scan) = off;
815             else
816                 NEXT_OFF(scan) = off;
817         }
818         /* The principal pseudo-switch.  Cannot be a switch, since we
819            look into several different things.  */
820         if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
821                    || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
822             next = regnext(scan);
823             code = OP(scan);
824         
825             if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
826                 I32 max1 = 0, min1 = I32_MAX, num = 0;
827                 struct regnode_charclass_class accum;
828                 
829                 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830                     scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
831                 if (flags & SCF_DO_STCLASS)
832                     cl_init_zero(pRExC_state, &accum);
833                 while (OP(scan) == code) {
834                     I32 deltanext, minnext, f = 0, fake;
835                     struct regnode_charclass_class this_class;
836
837                     num++;
838                     data_fake.flags = 0;
839                     if (data) {         
840                         data_fake.whilem_c = data->whilem_c;
841                         data_fake.last_closep = data->last_closep;
842                     }
843                     else
844                         data_fake.last_closep = &fake;
845                     next = regnext(scan);
846                     scan = NEXTOPER(scan);
847                     if (code != BRANCH)
848                         scan = NEXTOPER(scan);
849                     if (flags & SCF_DO_STCLASS) {
850                         cl_init(pRExC_state, &this_class);
851                         data_fake.start_class = &this_class;
852                         f = SCF_DO_STCLASS_AND;
853                     }           
854                     if (flags & SCF_WHILEM_VISITED_POS)
855                         f |= SCF_WHILEM_VISITED_POS;
856                     /* we suppose the run is continuous, last=next...*/
857                     minnext = study_chunk(pRExC_state, &scan, &deltanext,
858                                           next, &data_fake, f);
859                     if (min1 > minnext)
860                         min1 = minnext;
861                     if (max1 < minnext + deltanext)
862                         max1 = minnext + deltanext;
863                     if (deltanext == I32_MAX)
864                         is_inf = is_inf_internal = 1;
865                     scan = next;
866                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
867                         pars++;
868                     if (data && (data_fake.flags & SF_HAS_EVAL))
869                         data->flags |= SF_HAS_EVAL;
870                     if (data)
871                         data->whilem_c = data_fake.whilem_c;
872                     if (flags & SCF_DO_STCLASS)
873                         cl_or(pRExC_state, &accum, &this_class);
874                     if (code == SUSPEND)
875                         break;
876                 }
877                 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
878                     min1 = 0;
879                 if (flags & SCF_DO_SUBSTR) {
880                     data->pos_min += min1;
881                     data->pos_delta += max1 - min1;
882                     if (max1 != min1 || is_inf)
883                         data->longest = &(data->longest_float);
884                 }
885                 min += min1;
886                 delta += max1 - min1;
887                 if (flags & SCF_DO_STCLASS_OR) {
888                     cl_or(pRExC_state, data->start_class, &accum);
889                     if (min1) {
890                         cl_and(data->start_class, &and_with);
891                         flags &= ~SCF_DO_STCLASS;
892                     }
893                 }
894                 else if (flags & SCF_DO_STCLASS_AND) {
895                     if (min1) {
896                         cl_and(data->start_class, &accum);
897                         flags &= ~SCF_DO_STCLASS;
898                     }
899                     else {
900                         /* Switch to OR mode: cache the old value of
901                          * data->start_class */
902                         StructCopy(data->start_class, &and_with,
903                                    struct regnode_charclass_class);
904                         flags &= ~SCF_DO_STCLASS_AND;
905                         StructCopy(&accum, data->start_class,
906                                    struct regnode_charclass_class);
907                         flags |= SCF_DO_STCLASS_OR;
908                         data->start_class->flags |= ANYOF_EOS;
909                     }
910                 }
911             }
912             else if (code == BRANCHJ)   /* single branch is optimized. */
913                 scan = NEXTOPER(NEXTOPER(scan));
914             else                        /* single branch is optimized. */
915                 scan = NEXTOPER(scan);
916             continue;
917         }
918         else if (OP(scan) == EXACT) {
919             I32 l = STR_LEN(scan);
920             UV uc = *((U8*)STRING(scan));
921             if (UTF) {
922                 U8 *s = (U8*)STRING(scan);
923                 l = utf8_length(s, s + l);
924                 uc = utf8_to_uvchr(s, NULL);
925             }
926             min += l;
927             if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
928                 /* The code below prefers earlier match for fixed
929                    offset, later match for variable offset.  */
930                 if (data->last_end == -1) { /* Update the start info. */
931                     data->last_start_min = data->pos_min;
932                     data->last_start_max = is_inf
933                         ? I32_MAX : data->pos_min + data->pos_delta;
934                 }
935                 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
936                 if (UTF)
937                     SvUTF8_on(data->last_found);
938                 data->last_end = data->pos_min + l;
939                 data->pos_min += l; /* As in the first entry. */
940                 data->flags &= ~SF_BEFORE_EOL;
941             }
942             if (flags & SCF_DO_STCLASS_AND) {
943                 /* Check whether it is compatible with what we know already! */
944                 int compat = 1;
945
946                 if (uc >= 0x100 ||
947                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
948                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
949                     && (!(data->start_class->flags & ANYOF_FOLD)
950                         || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
951                     )
952                     compat = 0;
953                 ANYOF_CLASS_ZERO(data->start_class);
954                 ANYOF_BITMAP_ZERO(data->start_class);
955                 if (compat)
956                     ANYOF_BITMAP_SET(data->start_class, uc);
957                 data->start_class->flags &= ~ANYOF_EOS;
958                 if (uc < 0x100)
959                   data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960             }
961             else if (flags & SCF_DO_STCLASS_OR) {
962                 /* false positive possible if the class is case-folded */
963                 if (uc < 0x100)
964                     ANYOF_BITMAP_SET(data->start_class, uc);
965                 else
966                     data->start_class->flags |= ANYOF_UNICODE_ALL;
967                 data->start_class->flags &= ~ANYOF_EOS;
968                 cl_and(data->start_class, &and_with);
969             }
970             flags &= ~SCF_DO_STCLASS;
971         }
972         else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
973             I32 l = STR_LEN(scan);
974             UV uc = *((U8*)STRING(scan));
975
976             /* Search for fixed substrings supports EXACT only. */
977             if (flags & SCF_DO_SUBSTR)
978                 scan_commit(pRExC_state, data);
979             if (UTF) {
980                 U8 *s = (U8 *)STRING(scan);
981                 l = utf8_length(s, s + l);
982                 uc = utf8_to_uvchr(s, NULL);
983             }
984             min += l;
985             if (data && (flags & SCF_DO_SUBSTR))
986                 data->pos_min += l;
987             if (flags & SCF_DO_STCLASS_AND) {
988                 /* Check whether it is compatible with what we know already! */
989                 int compat = 1;
990
991                 if (uc >= 0x100 ||
992                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
993                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
994                      && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995                     compat = 0;
996                 ANYOF_CLASS_ZERO(data->start_class);
997                 ANYOF_BITMAP_ZERO(data->start_class);
998                 if (compat) {
999                     ANYOF_BITMAP_SET(data->start_class, uc);
1000                     data->start_class->flags &= ~ANYOF_EOS;
1001                     data->start_class->flags |= ANYOF_FOLD;
1002                     if (OP(scan) == EXACTFL)
1003                         data->start_class->flags |= ANYOF_LOCALE;
1004                 }
1005             }
1006             else if (flags & SCF_DO_STCLASS_OR) {
1007                 if (data->start_class->flags & ANYOF_FOLD) {
1008                     /* false positive possible if the class is case-folded.
1009                        Assume that the locale settings are the same... */
1010                     if (uc < 0x100)
1011                         ANYOF_BITMAP_SET(data->start_class, uc);
1012                     data->start_class->flags &= ~ANYOF_EOS;
1013                 }
1014                 cl_and(data->start_class, &and_with);
1015             }
1016             flags &= ~SCF_DO_STCLASS;
1017         }
1018         else if (strchr((char*)PL_varies,OP(scan))) {
1019             I32 mincount, maxcount, minnext, deltanext, fl = 0;
1020             I32 f = flags, pos_before = 0;
1021             regnode *oscan = scan;
1022             struct regnode_charclass_class this_class;
1023             struct regnode_charclass_class *oclass = NULL;
1024             I32 next_is_eval = 0;
1025
1026             switch (PL_regkind[(U8)OP(scan)]) {
1027             case WHILEM:                /* End of (?:...)* . */
1028                 scan = NEXTOPER(scan);
1029                 goto finish;
1030             case PLUS:
1031                 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1032                     next = NEXTOPER(scan);
1033                     if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034                         mincount = 1;
1035                         maxcount = REG_INFTY;
1036                         next = regnext(scan);
1037                         scan = NEXTOPER(scan);
1038                         goto do_curly;
1039                     }
1040                 }
1041                 if (flags & SCF_DO_SUBSTR)
1042                     data->pos_min++;
1043                 min++;
1044                 /* Fall through. */
1045             case STAR:
1046                 if (flags & SCF_DO_STCLASS) {
1047                     mincount = 0;
1048                     maxcount = REG_INFTY;
1049                     next = regnext(scan);
1050                     scan = NEXTOPER(scan);
1051                     goto do_curly;
1052                 }
1053                 is_inf = is_inf_internal = 1;
1054                 scan = regnext(scan);
1055                 if (flags & SCF_DO_SUBSTR) {
1056                     scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1057                     data->longest = &(data->longest_float);
1058                 }
1059                 goto optimize_curly_tail;
1060             case CURLY:
1061                 mincount = ARG1(scan);
1062                 maxcount = ARG2(scan);
1063                 next = regnext(scan);
1064                 if (OP(scan) == CURLYX) {
1065                     I32 lp = (data ? *(data->last_closep) : 0);
1066
1067                     scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068                 }
1069                 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1070                 next_is_eval = (OP(scan) == EVAL);
1071               do_curly:
1072                 if (flags & SCF_DO_SUBSTR) {
1073                     if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1074                     pos_before = data->pos_min;
1075                 }
1076                 if (data) {
1077                     fl = data->flags;
1078                     data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079                     if (is_inf)
1080                         data->flags |= SF_IS_INF;
1081                 }
1082                 if (flags & SCF_DO_STCLASS) {
1083                     cl_init(pRExC_state, &this_class);
1084                     oclass = data->start_class;
1085                     data->start_class = &this_class;
1086                     f |= SCF_DO_STCLASS_AND;
1087                     f &= ~SCF_DO_STCLASS_OR;
1088                 }
1089                 /* These are the cases when once a subexpression
1090                    fails at a particular position, it cannot succeed
1091                    even after backtracking at the enclosing scope.
1092                 
1093                    XXXX what if minimal match and we are at the
1094                         initial run of {n,m}? */
1095                 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1096                     f &= ~SCF_WHILEM_VISITED_POS;
1097
1098                 /* This will finish on WHILEM, setting scan, or on NULL: */
1099                 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100                                       mincount == 0
1101                                         ? (f & ~SCF_DO_SUBSTR) : f);
1102
1103                 if (flags & SCF_DO_STCLASS)
1104                     data->start_class = oclass;
1105                 if (mincount == 0 || minnext == 0) {
1106                     if (flags & SCF_DO_STCLASS_OR) {
1107                         cl_or(pRExC_state, data->start_class, &this_class);
1108                     }
1109                     else if (flags & SCF_DO_STCLASS_AND) {
1110                         /* Switch to OR mode: cache the old value of
1111                          * data->start_class */
1112                         StructCopy(data->start_class, &and_with,
1113                                    struct regnode_charclass_class);
1114                         flags &= ~SCF_DO_STCLASS_AND;
1115                         StructCopy(&this_class, data->start_class,
1116                                    struct regnode_charclass_class);
1117                         flags |= SCF_DO_STCLASS_OR;
1118                         data->start_class->flags |= ANYOF_EOS;
1119                     }
1120                 } else {                /* Non-zero len */
1121                     if (flags & SCF_DO_STCLASS_OR) {
1122                         cl_or(pRExC_state, data->start_class, &this_class);
1123                         cl_and(data->start_class, &and_with);
1124                     }
1125                     else if (flags & SCF_DO_STCLASS_AND)
1126                         cl_and(data->start_class, &this_class);
1127                     flags &= ~SCF_DO_STCLASS;
1128                 }
1129                 if (!scan)              /* It was not CURLYX, but CURLY. */
1130                     scan = next;
1131                 if (ckWARN(WARN_REGEXP)
1132                        /* ? quantifier ok, except for (?{ ... }) */
1133                     && (next_is_eval || !(mincount == 0 && maxcount == 1))
1134                     && (minnext == 0) && (deltanext == 0)
1135                     && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1136                     && maxcount <= REG_INFTY/3) /* Complement check for big count */
1137                 {
1138                     vWARN(RExC_parse,
1139                           "Quantifier unexpected on zero-length expression");
1140                 }
1141
1142                 min += minnext * mincount;
1143                 is_inf_internal |= ((maxcount == REG_INFTY
1144                                      && (minnext + deltanext) > 0)
1145                                     || deltanext == I32_MAX);
1146                 is_inf |= is_inf_internal;
1147                 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148
1149                 /* Try powerful optimization CURLYX => CURLYN. */
1150                 if (  OP(oscan) == CURLYX && data
1151                       && data->flags & SF_IN_PAR
1152                       && !(data->flags & SF_HAS_EVAL)
1153                       && !deltanext && minnext == 1 ) {
1154                     /* Try to optimize to CURLYN.  */
1155                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1156                     regnode *nxt1 = nxt;
1157 #ifdef DEBUGGING
1158                     regnode *nxt2;
1159 #endif
1160
1161                     /* Skip open. */
1162                     nxt = regnext(nxt);
1163                     if (!strchr((char*)PL_simple,OP(nxt))
1164                         && !(PL_regkind[(U8)OP(nxt)] == EXACT
1165                              && STR_LEN(nxt) == 1))
1166                         goto nogo;
1167 #ifdef DEBUGGING
1168                     nxt2 = nxt;
1169 #endif
1170                     nxt = regnext(nxt);
1171                     if (OP(nxt) != CLOSE)
1172                         goto nogo;
1173                     /* Now we know that nxt2 is the only contents: */
1174                     oscan->flags = (U8)ARG(nxt);
1175                     OP(oscan) = CURLYN;
1176                     OP(nxt1) = NOTHING; /* was OPEN. */
1177 #ifdef DEBUGGING
1178                     OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1179                     NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1180                     NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1181                     OP(nxt) = OPTIMIZED;        /* was CLOSE. */
1182                     OP(nxt + 1) = OPTIMIZED; /* was count. */
1183                     NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1184 #endif
1185                 }
1186               nogo:
1187
1188                 /* Try optimization CURLYX => CURLYM. */
1189                 if (  OP(oscan) == CURLYX && data
1190                       && !(data->flags & SF_HAS_PAR)
1191                       && !(data->flags & SF_HAS_EVAL)
1192                       && !deltanext  ) {
1193                     /* XXXX How to optimize if data == 0? */
1194                     /* Optimize to a simpler form.  */
1195                     regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1196                     regnode *nxt2;
1197
1198                     OP(oscan) = CURLYM;
1199                     while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1200                             && (OP(nxt2) != WHILEM))
1201                         nxt = nxt2;
1202                     OP(nxt2)  = SUCCEED; /* Whas WHILEM */
1203                     /* Need to optimize away parenths. */
1204                     if (data->flags & SF_IN_PAR) {
1205                         /* Set the parenth number.  */
1206                         regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207
1208                         if (OP(nxt) != CLOSE)
1209                             FAIL("Panic opt close");
1210                         oscan->flags = (U8)ARG(nxt);
1211                         OP(nxt1) = OPTIMIZED;   /* was OPEN. */
1212                         OP(nxt) = OPTIMIZED;    /* was CLOSE. */
1213 #ifdef DEBUGGING
1214                         OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1215                         OP(nxt + 1) = OPTIMIZED; /* was count. */
1216                         NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1217                         NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1218 #endif
1219 #if 0
1220                         while ( nxt1 && (OP(nxt1) != WHILEM)) {
1221                             regnode *nnxt = regnext(nxt1);
1222                         
1223                             if (nnxt == nxt) {
1224                                 if (reg_off_by_arg[OP(nxt1)])
1225                                     ARG_SET(nxt1, nxt2 - nxt1);
1226                                 else if (nxt2 - nxt1 < U16_MAX)
1227                                     NEXT_OFF(nxt1) = nxt2 - nxt1;
1228                                 else
1229                                     OP(nxt) = NOTHING;  /* Cannot beautify */
1230                             }
1231                             nxt1 = nnxt;
1232                         }
1233 #endif
1234                         /* Optimize again: */
1235                         study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1236                                     NULL, 0);
1237                     }
1238                     else
1239                         oscan->flags = 0;
1240                 }
1241                 else if ((OP(oscan) == CURLYX)
1242                          && (flags & SCF_WHILEM_VISITED_POS)
1243                          /* See the comment on a similar expression above.
1244                             However, this time it not a subexpression
1245                             we care about, but the expression itself. */
1246                          && (maxcount == REG_INFTY)
1247                          && data && ++data->whilem_c < 16) {
1248                     /* This stays as CURLYX, we can put the count/of pair. */
1249                     /* Find WHILEM (as in regexec.c) */
1250                     regnode *nxt = oscan + NEXT_OFF(oscan);
1251
1252                     if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253                         nxt += ARG(nxt);
1254                     PREVOPER(nxt)->flags = (U8)(data->whilem_c
1255                         | (RExC_whilem_seen << 4)); /* On WHILEM */
1256                 }
1257                 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1258                     pars++;
1259                 if (flags & SCF_DO_SUBSTR) {
1260                     SV *last_str = Nullsv;
1261                     int counted = mincount != 0;
1262
1263                     if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1264 #if defined(SPARC64_GCC_WORKAROUND)
1265                         I32 b = 0;
1266                         STRLEN l = 0;
1267                         char *s = NULL;
1268                         I32 old = 0;
1269
1270                         if (pos_before >= data->last_start_min)
1271                             b = pos_before;
1272                         else
1273                             b = data->last_start_min;
1274
1275                         l = 0;
1276                         s = SvPV(data->last_found, l);
1277                         old = b - data->last_start_min;
1278
1279 #else
1280                         I32 b = pos_before >= data->last_start_min
1281                             ? pos_before : data->last_start_min;
1282                         STRLEN l;
1283                         char *s = SvPV(data->last_found, l);
1284                         I32 old = b - data->last_start_min;
1285 #endif
1286
1287                         if (UTF)
1288                             old = utf8_hop((U8*)s, old) - (U8*)s;
1289                         
1290                         l -= old;
1291                         /* Get the added string: */
1292                         last_str = newSVpvn(s  + old, l);
1293                         if (deltanext == 0 && pos_before == b) {
1294                             /* What was added is a constant string */
1295                             if (mincount > 1) {
1296                                 SvGROW(last_str, (mincount * l) + 1);
1297                                 repeatcpy(SvPVX(last_str) + l,
1298                                           SvPVX(last_str), l, mincount - 1);
1299                                 SvCUR(last_str) *= mincount;
1300                                 /* Add additional parts. */
1301                                 SvCUR_set(data->last_found,
1302                                           SvCUR(data->last_found) - l);
1303                                 sv_catsv(data->last_found, last_str);
1304                                 data->last_end += l * (mincount - 1);
1305                             }
1306                         } else {
1307                             /* start offset must point into the last copy */
1308                             data->last_start_min += minnext * (mincount - 1);
1309                             data->last_start_max += is_inf ? I32_MAX
1310                                 : (maxcount - 1) * (minnext + data->pos_delta);
1311                         }
1312                     }
1313                     /* It is counted once already... */
1314                     data->pos_min += minnext * (mincount - counted);
1315                     data->pos_delta += - counted * deltanext +
1316                         (minnext + deltanext) * maxcount - minnext * mincount;
1317                     if (mincount != maxcount) {
1318                          /* Cannot extend fixed substrings found inside
1319                             the group.  */
1320                         scan_commit(pRExC_state,data);
1321                         if (mincount && last_str) {
1322                             sv_setsv(data->last_found, last_str);
1323                             data->last_end = data->pos_min;
1324                             data->last_start_min =
1325                                 data->pos_min - CHR_SVLEN(last_str);
1326                             data->last_start_max = is_inf
1327                                 ? I32_MAX
1328                                 : data->pos_min + data->pos_delta
1329                                 - CHR_SVLEN(last_str);
1330                         }
1331                         data->longest = &(data->longest_float);
1332                     }
1333                     SvREFCNT_dec(last_str);
1334                 }
1335                 if (data && (fl & SF_HAS_EVAL))
1336                     data->flags |= SF_HAS_EVAL;
1337               optimize_curly_tail:
1338                 if (OP(oscan) != CURLYX) {
1339                     while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1340                            && NEXT_OFF(next))
1341                         NEXT_OFF(oscan) += NEXT_OFF(next);
1342                 }
1343                 continue;
1344             default:                    /* REF and CLUMP only? */
1345                 if (flags & SCF_DO_SUBSTR) {
1346                     scan_commit(pRExC_state,data);      /* Cannot expect anything... */
1347                     data->longest = &(data->longest_float);
1348                 }
1349                 is_inf = is_inf_internal = 1;
1350                 if (flags & SCF_DO_STCLASS_OR)
1351                     cl_anything(pRExC_state, data->start_class);
1352                 flags &= ~SCF_DO_STCLASS;
1353                 break;
1354             }
1355         }
1356         else if (strchr((char*)PL_simple,OP(scan))) {
1357             int value = 0;
1358
1359             if (flags & SCF_DO_SUBSTR) {
1360                 scan_commit(pRExC_state,data);
1361                 data->pos_min++;
1362             }
1363             min++;
1364             if (flags & SCF_DO_STCLASS) {
1365                 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1366
1367                 /* Some of the logic below assumes that switching
1368                    locale on will only add false positives. */
1369                 switch (PL_regkind[(U8)OP(scan)]) {
1370                 case SANY:
1371                 default:
1372                   do_default:
1373                     /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1374                     if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1375                         cl_anything(pRExC_state, data->start_class);
1376                     break;
1377                 case REG_ANY:
1378                     if (OP(scan) == SANY)
1379                         goto do_default;
1380                     if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1381                         value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1382                                  || (data->start_class->flags & ANYOF_CLASS));
1383                         cl_anything(pRExC_state, data->start_class);
1384                     }
1385                     if (flags & SCF_DO_STCLASS_AND || !value)
1386                         ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1387                     break;
1388                 case ANYOF:
1389                     if (flags & SCF_DO_STCLASS_AND)
1390                         cl_and(data->start_class,
1391                                (struct regnode_charclass_class*)scan);
1392                     else
1393                         cl_or(pRExC_state, data->start_class,
1394                               (struct regnode_charclass_class*)scan);
1395                     break;
1396                 case ALNUM:
1397                     if (flags & SCF_DO_STCLASS_AND) {
1398                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1399                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1400                             for (value = 0; value < 256; value++)
1401                                 if (!isALNUM(value))
1402                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1403                         }
1404                     }
1405                     else {
1406                         if (data->start_class->flags & ANYOF_LOCALE)
1407                             ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1408                         else {
1409                             for (value = 0; value < 256; value++)
1410                                 if (isALNUM(value))
1411                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1412                         }
1413                     }
1414                     break;
1415                 case ALNUML:
1416                     if (flags & SCF_DO_STCLASS_AND) {
1417                         if (data->start_class->flags & ANYOF_LOCALE)
1418                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1419                     }
1420                     else {
1421                         ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1422                         data->start_class->flags |= ANYOF_LOCALE;
1423                     }
1424                     break;
1425                 case NALNUM:
1426                     if (flags & SCF_DO_STCLASS_AND) {
1427                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1428                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1429                             for (value = 0; value < 256; value++)
1430                                 if (isALNUM(value))
1431                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1432                         }
1433                     }
1434                     else {
1435                         if (data->start_class->flags & ANYOF_LOCALE)
1436                             ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1437                         else {
1438                             for (value = 0; value < 256; value++)
1439                                 if (!isALNUM(value))
1440                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1441                         }
1442                     }
1443                     break;
1444                 case NALNUML:
1445                     if (flags & SCF_DO_STCLASS_AND) {
1446                         if (data->start_class->flags & ANYOF_LOCALE)
1447                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1448                     }
1449                     else {
1450                         data->start_class->flags |= ANYOF_LOCALE;
1451                         ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1452                     }
1453                     break;
1454                 case SPACE:
1455                     if (flags & SCF_DO_STCLASS_AND) {
1456                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1457                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1458                             for (value = 0; value < 256; value++)
1459                                 if (!isSPACE(value))
1460                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1461                         }
1462                     }
1463                     else {
1464                         if (data->start_class->flags & ANYOF_LOCALE)
1465                             ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1466                         else {
1467                             for (value = 0; value < 256; value++)
1468                                 if (isSPACE(value))
1469                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1470                         }
1471                     }
1472                     break;
1473                 case SPACEL:
1474                     if (flags & SCF_DO_STCLASS_AND) {
1475                         if (data->start_class->flags & ANYOF_LOCALE)
1476                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1477                     }
1478                     else {
1479                         data->start_class->flags |= ANYOF_LOCALE;
1480                         ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1481                     }
1482                     break;
1483                 case NSPACE:
1484                     if (flags & SCF_DO_STCLASS_AND) {
1485                         if (!(data->start_class->flags & ANYOF_LOCALE)) {
1486                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1487                             for (value = 0; value < 256; value++)
1488                                 if (isSPACE(value))
1489                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1490                         }
1491                     }
1492                     else {
1493                         if (data->start_class->flags & ANYOF_LOCALE)
1494                             ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1495                         else {
1496                             for (value = 0; value < 256; value++)
1497                                 if (!isSPACE(value))
1498                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1499                         }
1500                     }
1501                     break;
1502                 case NSPACEL:
1503                     if (flags & SCF_DO_STCLASS_AND) {
1504                         if (data->start_class->flags & ANYOF_LOCALE) {
1505                             ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1506                             for (value = 0; value < 256; value++)
1507                                 if (!isSPACE(value))
1508                                     ANYOF_BITMAP_CLEAR(data->start_class, value);
1509                         }
1510                     }
1511                     else {
1512                         data->start_class->flags |= ANYOF_LOCALE;
1513                         ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1514                     }
1515                     break;
1516                 case DIGIT:
1517                     if (flags & SCF_DO_STCLASS_AND) {
1518                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1519                         for (value = 0; value < 256; value++)
1520                             if (!isDIGIT(value))
1521                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1522                     }
1523                     else {
1524                         if (data->start_class->flags & ANYOF_LOCALE)
1525                             ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1526                         else {
1527                             for (value = 0; value < 256; value++)
1528                                 if (isDIGIT(value))
1529                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1530                         }
1531                     }
1532                     break;
1533                 case NDIGIT:
1534                     if (flags & SCF_DO_STCLASS_AND) {
1535                         ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1536                         for (value = 0; value < 256; value++)
1537                             if (isDIGIT(value))
1538                                 ANYOF_BITMAP_CLEAR(data->start_class, value);
1539                     }
1540                     else {
1541                         if (data->start_class->flags & ANYOF_LOCALE)
1542                             ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1543                         else {
1544                             for (value = 0; value < 256; value++)
1545                                 if (!isDIGIT(value))
1546                                     ANYOF_BITMAP_SET(data->start_class, value);                 
1547                         }
1548                     }
1549                     break;
1550                 }
1551                 if (flags & SCF_DO_STCLASS_OR)
1552                     cl_and(data->start_class, &and_with);
1553                 flags &= ~SCF_DO_STCLASS;
1554             }
1555         }
1556         else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1557             data->flags |= (OP(scan) == MEOL
1558                             ? SF_BEFORE_MEOL
1559                             : SF_BEFORE_SEOL);
1560         }
1561         else if (  PL_regkind[(U8)OP(scan)] == BRANCHJ
1562                  /* Lookbehind, or need to calculate parens/evals/stclass: */
1563                    && (scan->flags || data || (flags & SCF_DO_STCLASS))
1564                    && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1565             /* Lookahead/lookbehind */
1566             I32 deltanext, minnext, fake = 0;
1567             regnode *nscan;
1568             struct regnode_charclass_class intrnl;
1569             int f = 0;
1570
1571             data_fake.flags = 0;
1572             if (data) {         
1573                 data_fake.whilem_c = data->whilem_c;
1574                 data_fake.last_closep = data->last_closep;
1575             }
1576             else
1577                 data_fake.last_closep = &fake;
1578             if ( flags & SCF_DO_STCLASS && !scan->flags
1579                  && OP(scan) == IFMATCH ) { /* Lookahead */
1580                 cl_init(pRExC_state, &intrnl);
1581                 data_fake.start_class = &intrnl;
1582                 f |= SCF_DO_STCLASS_AND;
1583             }
1584             if (flags & SCF_WHILEM_VISITED_POS)
1585                 f |= SCF_WHILEM_VISITED_POS;
1586             next = regnext(scan);
1587             nscan = NEXTOPER(NEXTOPER(scan));
1588             minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1589             if (scan->flags) {
1590                 if (deltanext) {
1591                     vFAIL("Variable length lookbehind not implemented");
1592                 }
1593                 else if (minnext > U8_MAX) {
1594                     vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1595                 }
1596                 scan->flags = (U8)minnext;
1597             }
1598             if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1599                 pars++;
1600             if (data && (data_fake.flags & SF_HAS_EVAL))
1601                 data->flags |= SF_HAS_EVAL;
1602             if (data)
1603                 data->whilem_c = data_fake.whilem_c;
1604             if (f & SCF_DO_STCLASS_AND) {
1605                 int was = (data->start_class->flags & ANYOF_EOS);
1606
1607                 cl_and(data->start_class, &intrnl);
1608                 if (was)
1609                     data->start_class->flags |= ANYOF_EOS;
1610             }
1611         }
1612         else if (OP(scan) == OPEN) {
1613             pars++;
1614         }
1615         else if (OP(scan) == CLOSE) {
1616             if ((I32)ARG(scan) == is_par) {
1617                 next = regnext(scan);
1618
1619                 if ( next && (OP(next) != WHILEM) && next < last)
1620                     is_par = 0;         /* Disable optimization */
1621             }
1622             if (data)
1623                 *(data->last_closep) = ARG(scan);
1624         }
1625         else if (OP(scan) == EVAL) {
1626                 if (data)
1627                     data->flags |= SF_HAS_EVAL;
1628         }
1629         else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1630                 if (flags & SCF_DO_SUBSTR) {
1631                     scan_commit(pRExC_state,data);
1632                     data->longest = &(data->longest_float);
1633                 }
1634                 is_inf = is_inf_internal = 1;
1635                 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1636                     cl_anything(pRExC_state, data->start_class);
1637                 flags &= ~SCF_DO_STCLASS;
1638         }
1639         /* Else: zero-length, ignore. */
1640         scan = regnext(scan);
1641     }
1642
1643   finish:
1644     *scanp = scan;
1645     *deltap = is_inf_internal ? I32_MAX : delta;
1646     if (flags & SCF_DO_SUBSTR && is_inf)
1647         data->pos_delta = I32_MAX - data->pos_min;
1648     if (is_par > U8_MAX)
1649         is_par = 0;
1650     if (is_par && pars==1 && data) {
1651         data->flags |= SF_IN_PAR;
1652         data->flags &= ~SF_HAS_PAR;
1653     }
1654     else if (pars && data) {
1655         data->flags |= SF_HAS_PAR;
1656         data->flags &= ~SF_IN_PAR;
1657     }
1658     if (flags & SCF_DO_STCLASS_OR)
1659         cl_and(data->start_class, &and_with);
1660     return min;
1661 }
1662
1663 STATIC I32
1664 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1665 {
1666     if (RExC_rx->data) {
1667         Renewc(RExC_rx->data,
1668                sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1669                char, struct reg_data);
1670         Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1671         RExC_rx->data->count += n;
1672     }
1673     else {
1674         Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1675              char, struct reg_data);
1676         New(1208, RExC_rx->data->what, n, U8);
1677         RExC_rx->data->count = n;
1678     }
1679     Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1680     return RExC_rx->data->count - n;
1681 }
1682
1683 void
1684 Perl_reginitcolors(pTHX)
1685 {
1686     int i = 0;
1687     char *s = PerlEnv_getenv("PERL_RE_COLORS");
1688         
1689     if (s) {
1690         PL_colors[0] = s = savepv(s);
1691         while (++i < 6) {
1692             s = strchr(s, '\t');
1693             if (s) {
1694                 *s = '\0';
1695                 PL_colors[i] = ++s;
1696             }
1697             else
1698                 PL_colors[i] = s = "";
1699         }
1700     } else {
1701         while (i < 6)
1702             PL_colors[i++] = "";
1703     }
1704     PL_colorset = 1;
1705 }
1706
1707
1708 /*
1709  - pregcomp - compile a regular expression into internal code
1710  *
1711  * We can't allocate space until we know how big the compiled form will be,
1712  * but we can't compile it (and thus know how big it is) until we've got a
1713  * place to put the code.  So we cheat:  we compile it twice, once with code
1714  * generation turned off and size counting turned on, and once "for real".
1715  * This also means that we don't allocate space until we are sure that the
1716  * thing really will compile successfully, and we never have to move the
1717  * code and thus invalidate pointers into it.  (Note that it has to be in
1718  * one piece because free() must be able to free it all.) [NB: not true in perl]
1719  *
1720  * Beware that the optimization-preparation code in here knows about some
1721  * of the structure of the compiled regexp.  [I'll say.]
1722  */
1723 regexp *
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1725 {
1726     register regexp *r;
1727     regnode *scan;
1728     regnode *first;
1729     I32 flags;
1730     I32 minlen = 0;
1731     I32 sawplus = 0;
1732     I32 sawopen = 0;
1733     scan_data_t data;
1734     RExC_state_t RExC_state;
1735     RExC_state_t *pRExC_state = &RExC_state;
1736
1737     if (exp == NULL)
1738         FAIL("NULL regexp argument");
1739
1740     RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1741
1742     RExC_precomp = exp;
1743     DEBUG_r({
1744          if (!PL_colorset) reginitcolors();
1745          PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746                        PL_colors[4],PL_colors[5],PL_colors[0],
1747                        (int)(xend - exp), RExC_precomp, PL_colors[1]);
1748     });
1749     RExC_flags = pm->op_pmflags;
1750     RExC_sawback = 0;
1751
1752     RExC_seen = 0;
1753     RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754     RExC_seen_evals = 0;
1755     RExC_extralen = 0;
1756
1757     /* First pass: determine size, legality. */
1758     RExC_parse = exp;
1759     RExC_start = exp;
1760     RExC_end = xend;
1761     RExC_naughty = 0;
1762     RExC_npar = 1;
1763     RExC_size = 0L;
1764     RExC_emit = &PL_regdummy;
1765     RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767        * Clever compilers notice this and complain. --jhi */
1768     REGC((U8)REG_MAGIC, (char*)RExC_emit);
1769 #endif
1770     if (reg(pRExC_state, 0, &flags) == NULL) {
1771         RExC_precomp = Nullch;
1772         return(NULL);
1773     }
1774     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1775
1776     /* Small enough for pointer-storage convention?
1777        If extralen==0, this means that we will not need long jumps. */
1778     if (RExC_size >= 0x10000L && RExC_extralen)
1779         RExC_size += RExC_extralen;
1780     else
1781         RExC_extralen = 0;
1782     if (RExC_whilem_seen > 15)
1783         RExC_whilem_seen = 15;
1784
1785     /* Allocate space and initialize. */
1786     Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1787          char, regexp);
1788     if (r == NULL)
1789         FAIL("Regexp out of space");
1790
1791 #ifdef DEBUGGING
1792     /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793     Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1794 #endif
1795     r->refcnt = 1;
1796     r->prelen = xend - exp;
1797     r->precomp = savepvn(RExC_precomp, r->prelen);
1798     r->subbeg = NULL;
1799     r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800     r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1801
1802     r->substrs = 0;                     /* Useful during FAIL. */
1803     r->startp = 0;                      /* Useful during FAIL. */
1804     r->endp = 0;                        /* Useful during FAIL. */
1805
1806     Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1807     if (r->offsets) {
1808       r->offsets[0] = RExC_size; 
1809     }
1810     DEBUG_r(PerlIO_printf(Perl_debug_log, 
1811                           "%s %"UVuf" bytes for offset annotations.\n", 
1812                           r->offsets ? "Got" : "Couldn't get", 
1813                           (UV)((2*RExC_size+1) * sizeof(U32))));
1814
1815     RExC_rx = r;
1816
1817     /* Second pass: emit code. */
1818     RExC_flags = pm->op_pmflags;        /* don't let top level (?i) bleed */
1819     RExC_parse = exp;
1820     RExC_end = xend;
1821     RExC_naughty = 0;
1822     RExC_npar = 1;
1823     RExC_emit_start = r->program;
1824     RExC_emit = r->program;
1825     /* Store the count of eval-groups for security checks: */
1826     RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1827     REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1828     r->data = 0;
1829     if (reg(pRExC_state, 0, &flags) == NULL)
1830         return(NULL);
1831
1832     /* Dig out information for optimizations. */
1833     r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1834     pm->op_pmflags = RExC_flags;
1835     if (UTF)
1836         r->reganch |= ROPT_UTF8;        /* Unicode in it? */
1837     r->regstclass = NULL;
1838     if (RExC_naughty >= 10)     /* Probably an expensive pattern. */
1839         r->reganch |= ROPT_NAUGHTY;
1840     scan = r->program + 1;              /* First BRANCH. */
1841
1842     /* XXXX To minimize changes to RE engine we always allocate
1843        3-units-long substrs field. */
1844     Newz(1004, r->substrs, 1, struct reg_substr_data);
1845
1846     StructCopy(&zero_scan_data, &data, scan_data_t);
1847     /* XXXX Should not we check for something else?  Usually it is OPEN1... */
1848     if (OP(scan) != BRANCH) {   /* Only one top-level choice. */
1849         I32 fake;
1850         STRLEN longest_float_length, longest_fixed_length;
1851         struct regnode_charclass_class ch_class;
1852         int stclass_flag;
1853         I32 last_close = 0;
1854
1855         first = scan;
1856         /* Skip introductions and multiplicators >= 1. */
1857         while ((OP(first) == OPEN && (sawopen = 1)) ||
1858                /* An OR of *one* alternative - should not happen now. */
1859             (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1860             (OP(first) == PLUS) ||
1861             (OP(first) == MINMOD) ||
1862                /* An {n,m} with n>0 */
1863             (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1864                 if (OP(first) == PLUS)
1865                     sawplus = 1;
1866                 else
1867                     first += regarglen[(U8)OP(first)];
1868                 first = NEXTOPER(first);
1869         }
1870
1871         /* Starting-point info. */
1872       again:
1873         if (PL_regkind[(U8)OP(first)] == EXACT) {
1874             if (OP(first) == EXACT)
1875                 ;       /* Empty, get anchored substr later. */
1876             else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1877                 r->regstclass = first;
1878         }
1879         else if (strchr((char*)PL_simple,OP(first)))
1880             r->regstclass = first;
1881         else if (PL_regkind[(U8)OP(first)] == BOUND ||
1882                  PL_regkind[(U8)OP(first)] == NBOUND)
1883             r->regstclass = first;
1884         else if (PL_regkind[(U8)OP(first)] == BOL) {
1885             r->reganch |= (OP(first) == MBOL
1886                            ? ROPT_ANCH_MBOL
1887                            : (OP(first) == SBOL
1888                               ? ROPT_ANCH_SBOL
1889                               : ROPT_ANCH_BOL));
1890             first = NEXTOPER(first);
1891             goto again;
1892         }
1893         else if (OP(first) == GPOS) {
1894             r->reganch |= ROPT_ANCH_GPOS;
1895             first = NEXTOPER(first);
1896             goto again;
1897         }
1898         else if (!sawopen && (OP(first) == STAR &&
1899             PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1900             !(r->reganch & ROPT_ANCH) )
1901         {
1902             /* turn .* into ^.* with an implied $*=1 */
1903             int type = OP(NEXTOPER(first));
1904
1905             if (type == REG_ANY)
1906                 type = ROPT_ANCH_MBOL;
1907             else
1908                 type = ROPT_ANCH_SBOL;
1909
1910             r->reganch |= type | ROPT_IMPLICIT;
1911             first = NEXTOPER(first);
1912             goto again;
1913         }
1914         if (sawplus && (!sawopen || !RExC_sawback)
1915             && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1916             /* x+ must match at the 1st pos of run of x's */
1917             r->reganch |= ROPT_SKIP;
1918
1919         /* Scan is after the zeroth branch, first is atomic matcher. */
1920         DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1921                               (IV)(first - scan + 1)));
1922         /*
1923         * If there's something expensive in the r.e., find the
1924         * longest literal string that must appear and make it the
1925         * regmust.  Resolve ties in favor of later strings, since
1926         * the regstart check works with the beginning of the r.e.
1927         * and avoiding duplication strengthens checking.  Not a
1928         * strong reason, but sufficient in the absence of others.
1929         * [Now we resolve ties in favor of the earlier string if
1930         * it happens that c_offset_min has been invalidated, since the
1931         * earlier string may buy us something the later one won't.]
1932         */
1933         minlen = 0;
1934
1935         data.longest_fixed = newSVpvn("",0);
1936         data.longest_float = newSVpvn("",0);
1937         data.last_found = newSVpvn("",0);
1938         data.longest = &(data.longest_fixed);
1939         first = scan;
1940         if (!r->regstclass) {
1941             cl_init(pRExC_state, &ch_class);
1942             data.start_class = &ch_class;
1943             stclass_flag = SCF_DO_STCLASS_AND;
1944         } else                          /* XXXX Check for BOUND? */
1945             stclass_flag = 0;
1946         data.last_closep = &last_close;
1947
1948         minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1949                              &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1950         if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1951              && data.last_start_min == 0 && data.last_end > 0
1952              && !RExC_seen_zerolen
1953              && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1954             r->reganch |= ROPT_CHECK_ALL;
1955         scan_commit(pRExC_state, &data);
1956         SvREFCNT_dec(data.last_found);
1957
1958         longest_float_length = CHR_SVLEN(data.longest_float);
1959         if (longest_float_length
1960             || (data.flags & SF_FL_BEFORE_EOL
1961                 && (!(data.flags & SF_FL_BEFORE_MEOL)
1962                     || (RExC_flags & PMf_MULTILINE)))) {
1963             int t;
1964
1965             if (SvCUR(data.longest_fixed)                       /* ok to leave SvCUR */
1966                 && data.offset_fixed == data.offset_float_min
1967                 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1968                     goto remove_float;          /* As in (a)+. */
1969
1970             if (SvUTF8(data.longest_float)) {
1971                 r->float_utf8 = data.longest_float;
1972                 r->float_substr = Nullsv;
1973             } else {
1974                 r->float_substr = data.longest_float;
1975                 r->float_utf8 = Nullsv;
1976             }
1977             r->float_min_offset = data.offset_float_min;
1978             r->float_max_offset = data.offset_float_max;
1979             t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1980                        && (!(data.flags & SF_FL_BEFORE_MEOL)
1981                            || (RExC_flags & PMf_MULTILINE)));
1982             fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1983         }
1984         else {
1985           remove_float:
1986             r->float_substr = r->float_utf8 = Nullsv;
1987             SvREFCNT_dec(data.longest_float);
1988             longest_float_length = 0;
1989         }
1990
1991         longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1992         if (longest_fixed_length
1993             || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1994                 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1995                     || (RExC_flags & PMf_MULTILINE)))) {
1996             int t;
1997
1998             if (SvUTF8(data.longest_fixed)) {
1999                 r->anchored_utf8 = data.longest_fixed;
2000                 r->anchored_substr = Nullsv;
2001             } else {
2002                 r->anchored_substr = data.longest_fixed;
2003                 r->anchored_utf8 = Nullsv;
2004             }
2005             r->anchored_offset = data.offset_fixed;
2006             t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2007                  && (!(data.flags & SF_FIX_BEFORE_MEOL)
2008                      || (RExC_flags & PMf_MULTILINE)));
2009             fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2010         }
2011         else {
2012             r->anchored_substr = r->anchored_utf8 = Nullsv;
2013             SvREFCNT_dec(data.longest_fixed);
2014             longest_fixed_length = 0;
2015         }
2016         if (r->regstclass
2017             && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2018             r->regstclass = NULL;
2019         if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020             && stclass_flag
2021             && !(data.start_class->flags & ANYOF_EOS)
2022             && !cl_is_anything(data.start_class))
2023         {
2024             I32 n = add_data(pRExC_state, 1, "f");
2025
2026             New(1006, RExC_rx->data->data[n], 1,
2027                 struct regnode_charclass_class);
2028             StructCopy(data.start_class,
2029                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2030                        struct regnode_charclass_class);
2031             r->regstclass = (regnode*)RExC_rx->data->data[n];
2032             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2033             PL_regdata = r->data; /* for regprop() */
2034             DEBUG_r({ SV *sv = sv_newmortal();
2035                       regprop(sv, (regnode*)data.start_class);
2036                       PerlIO_printf(Perl_debug_log,
2037                                     "synthetic stclass `%s'.\n",
2038                                     SvPVX(sv));});
2039         }
2040
2041         /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2042         if (longest_fixed_length > longest_float_length) {
2043             r->check_substr = r->anchored_substr;
2044             r->check_utf8 = r->anchored_utf8;
2045             r->check_offset_min = r->check_offset_max = r->anchored_offset;
2046             if (r->reganch & ROPT_ANCH_SINGLE)
2047                 r->reganch |= ROPT_NOSCAN;
2048         }
2049         else {
2050             r->check_substr = r->float_substr;
2051             r->check_utf8 = r->float_utf8;
2052             r->check_offset_min = data.offset_float_min;
2053             r->check_offset_max = data.offset_float_max;
2054         }
2055         /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2056            This should be changed ASAP!  */
2057         if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2058             r->reganch |= RE_USE_INTUIT;
2059             if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2060                 r->reganch |= RE_INTUIT_TAIL;
2061         }
2062     }
2063     else {
2064         /* Several toplevels. Best we can is to set minlen. */
2065         I32 fake;
2066         struct regnode_charclass_class ch_class;
2067         I32 last_close = 0;
2068         
2069         DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2070         scan = r->program + 1;
2071         cl_init(pRExC_state, &ch_class);
2072         data.start_class = &ch_class;
2073         data.last_closep = &last_close;
2074         minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2075         r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2076                 = r->float_substr = r->float_utf8 = Nullsv;
2077         if (!(data.start_class->flags & ANYOF_EOS)
2078             && !cl_is_anything(data.start_class))
2079         {
2080             I32 n = add_data(pRExC_state, 1, "f");
2081
2082             New(1006, RExC_rx->data->data[n], 1,
2083                 struct regnode_charclass_class);
2084             StructCopy(data.start_class,
2085                        (struct regnode_charclass_class*)RExC_rx->data->data[n],
2086                        struct regnode_charclass_class);
2087             r->regstclass = (regnode*)RExC_rx->data->data[n];
2088             r->reganch &= ~ROPT_SKIP;   /* Used in find_byclass(). */
2089             DEBUG_r({ SV* sv = sv_newmortal();
2090                       regprop(sv, (regnode*)data.start_class);
2091                       PerlIO_printf(Perl_debug_log,
2092                                     "synthetic stclass `%s'.\n",
2093                                     SvPVX(sv));});
2094         }
2095     }
2096
2097     r->minlen = minlen;
2098     if (RExC_seen & REG_SEEN_GPOS)
2099         r->reganch |= ROPT_GPOS_SEEN;
2100     if (RExC_seen & REG_SEEN_LOOKBEHIND)
2101         r->reganch |= ROPT_LOOKBEHIND_SEEN;
2102     if (RExC_seen & REG_SEEN_EVAL)
2103         r->reganch |= ROPT_EVAL_SEEN;
2104     if (RExC_seen & REG_SEEN_CANY)
2105         r->reganch |= ROPT_CANY_SEEN;
2106     Newz(1002, r->startp, RExC_npar, I32);
2107     Newz(1002, r->endp, RExC_npar, I32);
2108     PL_regdata = r->data; /* for regprop() */
2109     DEBUG_r(regdump(r));
2110     return(r);
2111 }
2112
2113 /*
2114  - reg - regular expression, i.e. main body or parenthesized thing
2115  *
2116  * Caller must absorb opening parenthesis.
2117  *
2118  * Combining parenthesis handling with the base level of regular expression
2119  * is a trifle forced, but the need to tie the tails of the branches to what
2120  * follows makes it hard to avoid.
2121  */
2122 STATIC regnode *
2123 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2124     /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2125 {
2126     register regnode *ret;              /* Will be the head of the group. */
2127     register regnode *br;
2128     register regnode *lastbr;
2129     register regnode *ender = 0;
2130     register I32 parno = 0;
2131     I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2132
2133     /* for (?g), (?gc), and (?o) warnings; warning
2134        about (?c) will warn about (?g) -- japhy    */
2135
2136     I32 wastedflags = 0x00,
2137         wasted_o    = 0x01,
2138         wasted_g    = 0x02,
2139         wasted_gc   = 0x02 | 0x04,
2140         wasted_c    = 0x04;
2141
2142     char * parse_start = RExC_parse; /* MJD */
2143     char *oregcomp_parse = RExC_parse;
2144     char c;
2145
2146     *flagp = 0;                         /* Tentatively. */
2147
2148
2149     /* Make an OPEN node, if parenthesized. */
2150     if (paren) {
2151         if (*RExC_parse == '?') { /* (?...) */
2152             U32 posflags = 0, negflags = 0;
2153             U32 *flagsp = &posflags;
2154             int logical = 0;
2155             char *seqstart = RExC_parse;
2156
2157             RExC_parse++;
2158             paren = *RExC_parse++;
2159             ret = NULL;                 /* For look-ahead/behind. */
2160             switch (paren) {
2161             case '<':           /* (?<...) */
2162                 RExC_seen |= REG_SEEN_LOOKBEHIND;
2163                 if (*RExC_parse == '!')
2164                     paren = ',';
2165                 if (*RExC_parse != '=' && *RExC_parse != '!')
2166                     goto unknown;
2167                 RExC_parse++;
2168             case '=':           /* (?=...) */
2169             case '!':           /* (?!...) */
2170                 RExC_seen_zerolen++;
2171             case ':':           /* (?:...) */
2172             case '>':           /* (?>...) */
2173                 break;
2174             case '$':           /* (?$...) */
2175             case '@':           /* (?@...) */
2176                 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2177                 break;
2178             case '#':           /* (?#...) */
2179                 while (*RExC_parse && *RExC_parse != ')')
2180                     RExC_parse++;
2181                 if (*RExC_parse != ')')
2182                     FAIL("Sequence (?#... not terminated");
2183                 nextchar(pRExC_state);
2184                 *flagp = TRYAGAIN;
2185                 return NULL;
2186             case 'p':           /* (?p...) */
2187                 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2188                     vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2189                 /* FALL THROUGH*/
2190             case '?':           /* (??...) */
2191                 logical = 1;
2192                 if (*RExC_parse != '{')
2193                     goto unknown;
2194                 paren = *RExC_parse++;
2195                 /* FALL THROUGH */
2196             case '{':           /* (?{...}) */
2197             {
2198                 I32 count = 1, n = 0;
2199                 char c;
2200                 char *s = RExC_parse;
2201                 SV *sv;
2202                 OP_4tree *sop, *rop;
2203
2204                 RExC_seen_zerolen++;
2205                 RExC_seen |= REG_SEEN_EVAL;
2206                 while (count && (c = *RExC_parse)) {
2207                     if (c == '\\' && RExC_parse[1])
2208                         RExC_parse++;
2209                     else if (c == '{')
2210                         count++;
2211                     else if (c == '}')
2212                         count--;
2213                     RExC_parse++;
2214                 }
2215                 if (*RExC_parse != ')')
2216                 {
2217                     RExC_parse = s;             
2218                     vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2219                 }
2220                 if (!SIZE_ONLY) {
2221                     PAD *pad;
2222                 
2223                     if (RExC_parse - 1 - s)
2224                         sv = newSVpvn(s, RExC_parse - 1 - s);
2225                     else
2226                         sv = newSVpvn("", 0);
2227
2228                     ENTER;
2229                     Perl_save_re_context(aTHX);
2230                     rop = sv_compile_2op(sv, &sop, "re", &pad);
2231                     sop->op_private |= OPpREFCOUNTED;
2232                     /* re_dup will OpREFCNT_inc */
2233                     OpREFCNT_set(sop, 1);
2234                     LEAVE;
2235
2236                     n = add_data(pRExC_state, 3, "nop");
2237                     RExC_rx->data->data[n] = (void*)rop;
2238                     RExC_rx->data->data[n+1] = (void*)sop;
2239                     RExC_rx->data->data[n+2] = (void*)pad;
2240                     SvREFCNT_dec(sv);
2241                 }
2242                 else {                                          /* First pass */
2243                     if (PL_reginterp_cnt < ++RExC_seen_evals
2244                         && PL_curcop != &PL_compiling)
2245                         /* No compiled RE interpolated, has runtime
2246                            components ===> unsafe.  */
2247                         FAIL("Eval-group not allowed at runtime, use re 'eval'");
2248                     if (PL_tainting && PL_tainted)
2249                         FAIL("Eval-group in insecure regular expression");
2250                 }
2251                 
2252                 nextchar(pRExC_state);
2253                 if (logical) {
2254                     ret = reg_node(pRExC_state, LOGICAL);
2255                     if (!SIZE_ONLY)
2256                         ret->flags = 2;
2257                     regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2258                     /* deal with the length of this later - MJD */
2259                     return ret;
2260                 }
2261                 return reganode(pRExC_state, EVAL, n);
2262             }
2263             case '(':           /* (?(?{...})...) and (?(?=...)...) */
2264             {
2265                 if (RExC_parse[0] == '?') {        /* (?(?...)) */
2266                     if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2267                         || RExC_parse[1] == '<'
2268                         || RExC_parse[1] == '{') { /* Lookahead or eval. */
2269                         I32 flag;
2270                         
2271                         ret = reg_node(pRExC_state, LOGICAL);
2272                         if (!SIZE_ONLY)
2273                             ret->flags = 1;
2274                         regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2275                         goto insert_if;
2276                     }
2277                 }
2278                 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2279                     /* (?(1)...) */
2280                     parno = atoi(RExC_parse++);
2281
2282                     while (isDIGIT(*RExC_parse))
2283                         RExC_parse++;
2284                     ret = reganode(pRExC_state, GROUPP, parno);
2285                     
2286                     if ((c = *nextchar(pRExC_state)) != ')')
2287                         vFAIL("Switch condition not recognized");
2288                   insert_if:
2289                     regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2290                     br = regbranch(pRExC_state, &flags, 1);
2291                     if (br == NULL)
2292                         br = reganode(pRExC_state, LONGJMP, 0);
2293                     else
2294                         regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2295                     c = *nextchar(pRExC_state);
2296                     if (flags&HASWIDTH)
2297                         *flagp |= HASWIDTH;
2298                     if (c == '|') {
2299                         lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2300                         regbranch(pRExC_state, &flags, 1);
2301                         regtail(pRExC_state, ret, lastbr);
2302                         if (flags&HASWIDTH)
2303                             *flagp |= HASWIDTH;
2304                         c = *nextchar(pRExC_state);
2305                     }
2306                     else
2307                         lastbr = NULL;
2308                     if (c != ')')
2309                         vFAIL("Switch (?(condition)... contains too many branches");
2310                     ender = reg_node(pRExC_state, TAIL);
2311                     regtail(pRExC_state, br, ender);
2312                     if (lastbr) {
2313                         regtail(pRExC_state, lastbr, ender);
2314                         regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2315                     }
2316                     else
2317                         regtail(pRExC_state, ret, ender);
2318                     return ret;
2319                 }
2320                 else {
2321                     vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2322                 }
2323             }
2324             case 0:
2325                 RExC_parse--; /* for vFAIL to print correctly */
2326                 vFAIL("Sequence (? incomplete");
2327                 break;
2328             default:
2329                 --RExC_parse;
2330               parse_flags:      /* (?i) */
2331                 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2332                     /* (?g), (?gc) and (?o) are useless here
2333                        and must be globally applied -- japhy */
2334
2335                     if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2336                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2337                             I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2338                             if (! (wastedflags & wflagbit) ) {
2339                                 wastedflags |= wflagbit;
2340                                 vWARN5(
2341                                     RExC_parse + 1,
2342                                     "Useless (%s%c) - %suse /%c modifier",
2343                                     flagsp == &negflags ? "?-" : "?",
2344                                     *RExC_parse,
2345                                     flagsp == &negflags ? "don't " : "",
2346                                     *RExC_parse
2347                                 );
2348                             }
2349                         }
2350                     }
2351                     else if (*RExC_parse == 'c') {
2352                         if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2353                             if (! (wastedflags & wasted_c) ) {
2354                                 wastedflags |= wasted_gc;
2355                                 vWARN3(
2356                                     RExC_parse + 1,
2357                                     "Useless (%sc) - %suse /gc modifier",
2358                                     flagsp == &negflags ? "?-" : "?",
2359                                     flagsp == &negflags ? "don't " : ""
2360                                 );
2361                             }
2362                         }
2363                     }
2364                     else { pmflag(flagsp, *RExC_parse); }
2365
2366                     ++RExC_parse;
2367                 }
2368                 if (*RExC_parse == '-') {
2369                     flagsp = &negflags;
2370                     wastedflags = 0;  /* reset so (?g-c) warns twice */
2371                     ++RExC_parse;
2372                     goto parse_flags;
2373                 }
2374                 RExC_flags |= posflags;
2375                 RExC_flags &= ~negflags;
2376                 if (*RExC_parse == ':') {
2377                     RExC_parse++;
2378                     paren = ':';
2379                     break;
2380                 }               
2381               unknown:
2382                 if (*RExC_parse != ')') {
2383                     RExC_parse++;
2384                     vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2385                 }
2386                 nextchar(pRExC_state);
2387                 *flagp = TRYAGAIN;
2388                 return NULL;
2389             }
2390         }
2391         else {                  /* (...) */
2392             parno = RExC_npar;
2393             RExC_npar++;
2394             ret = reganode(pRExC_state, OPEN, parno);
2395             Set_Node_Length(ret, 1); /* MJD */
2396             Set_Node_Offset(ret, RExC_parse); /* MJD */
2397             open = 1;
2398         }
2399     }
2400     else                        /* ! paren */
2401         ret = NULL;
2402
2403     /* Pick up the branches, linking them together. */
2404     parse_start = RExC_parse;   /* MJD */
2405     br = regbranch(pRExC_state, &flags, 1);
2406     /*     branch_len = (paren != 0); */
2407     
2408     if (br == NULL)
2409         return(NULL);
2410     if (*RExC_parse == '|') {
2411         if (!SIZE_ONLY && RExC_extralen) {
2412             reginsert(pRExC_state, BRANCHJ, br);
2413         }
2414         else {                  /* MJD */
2415             reginsert(pRExC_state, BRANCH, br);
2416             Set_Node_Length(br, paren != 0);
2417             Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2418         }
2419         have_branch = 1;
2420         if (SIZE_ONLY)
2421             RExC_extralen += 1;         /* For BRANCHJ-BRANCH. */
2422     }
2423     else if (paren == ':') {
2424         *flagp |= flags&SIMPLE;
2425     }
2426     if (open) {                         /* Starts with OPEN. */
2427         regtail(pRExC_state, ret, br);          /* OPEN -> first. */
2428     }
2429     else if (paren != '?')              /* Not Conditional */
2430         ret = br;
2431     *flagp |= flags & (SPSTART | HASWIDTH);
2432     lastbr = br;
2433     while (*RExC_parse == '|') {
2434         if (!SIZE_ONLY && RExC_extralen) {
2435             ender = reganode(pRExC_state, LONGJMP,0);
2436             regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2437         }
2438         if (SIZE_ONLY)
2439             RExC_extralen += 2;         /* Account for LONGJMP. */
2440         nextchar(pRExC_state);
2441         br = regbranch(pRExC_state, &flags, 0);
2442         
2443         if (br == NULL)
2444             return(NULL);
2445         regtail(pRExC_state, lastbr, br);               /* BRANCH -> BRANCH. */
2446         lastbr = br;
2447         if (flags&HASWIDTH)
2448             *flagp |= HASWIDTH;
2449         *flagp |= flags&SPSTART;
2450     }
2451
2452     if (have_branch || paren != ':') {
2453         /* Make a closing node, and hook it on the end. */
2454         switch (paren) {
2455         case ':':
2456             ender = reg_node(pRExC_state, TAIL);
2457             break;
2458         case 1:
2459             ender = reganode(pRExC_state, CLOSE, parno);
2460             Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2461             Set_Node_Length(ender,1); /* MJD */
2462             break;
2463         case '<':
2464         case ',':
2465         case '=':
2466         case '!':
2467             *flagp &= ~HASWIDTH;
2468             /* FALL THROUGH */
2469         case '>':
2470             ender = reg_node(pRExC_state, SUCCEED);
2471             break;
2472         case 0:
2473             ender = reg_node(pRExC_state, END);
2474             break;
2475         }
2476         regtail(pRExC_state, lastbr, ender);
2477
2478         if (have_branch) {
2479             /* Hook the tails of the branches to the closing node. */
2480             for (br = ret; br != NULL; br = regnext(br)) {
2481                 regoptail(pRExC_state, br, ender);
2482             }
2483         }
2484     }
2485
2486     {
2487         char *p;
2488         static char parens[] = "=!<,>";
2489
2490         if (paren && (p = strchr(parens, paren))) {
2491             U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2492             int flag = (p - parens) > 1;
2493
2494             if (paren == '>')
2495                 node = SUSPEND, flag = 0;
2496             reginsert(pRExC_state, node,ret);
2497             ret->flags = flag;
2498             regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2499         }
2500     }
2501
2502     /* Check for proper termination. */
2503     if (paren) {
2504         RExC_flags = oregflags;
2505         if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2506             RExC_parse = oregcomp_parse;
2507             vFAIL("Unmatched (");
2508         }
2509     }
2510     else if (!paren && RExC_parse < RExC_end) {
2511         if (*RExC_parse == ')') {
2512             RExC_parse++;
2513             vFAIL("Unmatched )");
2514         }
2515         else
2516             FAIL("Junk on end of regexp");      /* "Can't happen". */
2517         /* NOTREACHED */
2518     }
2519
2520     return(ret);
2521 }
2522
2523 /*
2524  - regbranch - one alternative of an | operator
2525  *
2526  * Implements the concatenation operator.
2527  */
2528 STATIC regnode *
2529 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2530 {
2531     register regnode *ret;
2532     register regnode *chain = NULL;
2533     register regnode *latest;
2534     I32 flags = 0, c = 0;
2535
2536     if (first)
2537         ret = NULL;
2538     else {
2539         if (!SIZE_ONLY && RExC_extralen)
2540             ret = reganode(pRExC_state, BRANCHJ,0);
2541         else {
2542             ret = reg_node(pRExC_state, BRANCH);
2543             Set_Node_Length(ret, 1);
2544         }
2545     }
2546         
2547     if (!first && SIZE_ONLY)
2548         RExC_extralen += 1;                     /* BRANCHJ */
2549
2550     *flagp = WORST;                     /* Tentatively. */
2551
2552     RExC_parse--;
2553     nextchar(pRExC_state);
2554     while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2555         flags &= ~TRYAGAIN;
2556         latest = regpiece(pRExC_state, &flags);
2557         if (latest == NULL) {
2558             if (flags & TRYAGAIN)
2559                 continue;
2560             return(NULL);
2561         }
2562         else if (ret == NULL)
2563             ret = latest;
2564         *flagp |= flags&HASWIDTH;
2565         if (chain == NULL)      /* First piece. */
2566             *flagp |= flags&SPSTART;
2567         else {
2568             RExC_naughty++;
2569             regtail(pRExC_state, chain, latest);
2570         }
2571         chain = latest;
2572         c++;
2573     }
2574     if (chain == NULL) {        /* Loop ran zero times. */
2575         chain = reg_node(pRExC_state, NOTHING);
2576         if (ret == NULL)
2577             ret = chain;
2578     }
2579     if (c == 1) {
2580         *flagp |= flags&SIMPLE;
2581     }
2582
2583     return(ret);
2584 }
2585
2586 /*
2587  - regpiece - something followed by possible [*+?]
2588  *
2589  * Note that the branching code sequences used for ? and the general cases
2590  * of * and + are somewhat optimized:  they use the same NOTHING node as
2591  * both the endmarker for their branch list and the body of the last branch.
2592  * It might seem that this node could be dispensed with entirely, but the
2593  * endmarker role is not redundant.
2594  */
2595 STATIC regnode *
2596 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2597 {
2598     register regnode *ret;
2599     register char op;
2600     register char *next;
2601     I32 flags;
2602     char *origparse = RExC_parse;
2603     char *maxpos;
2604     I32 min;
2605     I32 max = REG_INFTY;
2606     char *parse_start;
2607
2608     ret = regatom(pRExC_state, &flags);
2609     if (ret == NULL) {
2610         if (flags & TRYAGAIN)
2611             *flagp |= TRYAGAIN;
2612         return(NULL);
2613     }
2614
2615     op = *RExC_parse;
2616
2617     if (op == '{' && regcurly(RExC_parse)) {
2618         parse_start = RExC_parse; /* MJD */
2619         next = RExC_parse + 1;
2620         maxpos = Nullch;
2621         while (isDIGIT(*next) || *next == ',') {
2622             if (*next == ',') {
2623                 if (maxpos)
2624                     break;
2625                 else
2626                     maxpos = next;
2627             }
2628             next++;
2629         }
2630         if (*next == '}') {             /* got one */
2631             if (!maxpos)
2632                 maxpos = next;
2633             RExC_parse++;
2634             min = atoi(RExC_parse);
2635             if (*maxpos == ',')
2636                 maxpos++;
2637             else
2638                 maxpos = RExC_parse;
2639             max = atoi(maxpos);
2640             if (!max && *maxpos != '0')
2641                 max = REG_INFTY;                /* meaning "infinity" */
2642             else if (max >= REG_INFTY)
2643                 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2644             RExC_parse = next;
2645             nextchar(pRExC_state);
2646
2647         do_curly:
2648             if ((flags&SIMPLE)) {
2649                 RExC_naughty += 2 + RExC_naughty / 2;
2650                 reginsert(pRExC_state, CURLY, ret);
2651                 Set_Node_Offset(ret, parse_start+1); /* MJD */
2652                 Set_Node_Cur_Length(ret);
2653             }
2654             else {
2655                 regnode *w = reg_node(pRExC_state, WHILEM);
2656
2657                 w->flags = 0;
2658                 regtail(pRExC_state, ret, w);
2659                 if (!SIZE_ONLY && RExC_extralen) {
2660                     reginsert(pRExC_state, LONGJMP,ret);
2661                     reginsert(pRExC_state, NOTHING,ret);
2662                     NEXT_OFF(ret) = 3;  /* Go over LONGJMP. */
2663                 }
2664                 reginsert(pRExC_state, CURLYX,ret);
2665                                 /* MJD hk */
2666                 Set_Node_Offset(ret, parse_start+1);
2667                 Set_Node_Length(ret, 
2668                                 op == '{' ? (RExC_parse - parse_start) : 1);
2669                 
2670                 if (!SIZE_ONLY && RExC_extralen)
2671                     NEXT_OFF(ret) = 3;  /* Go over NOTHING to LONGJMP. */
2672                 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2673                 if (SIZE_ONLY)
2674                     RExC_whilem_seen++, RExC_extralen += 3;
2675                 RExC_naughty += 4 + RExC_naughty;       /* compound interest */
2676             }
2677             ret->flags = 0;
2678
2679             if (min > 0)
2680                 *flagp = WORST;
2681             if (max > 0)
2682                 *flagp |= HASWIDTH;
2683             if (max && max < min)
2684                 vFAIL("Can't do {n,m} with n > m");
2685             if (!SIZE_ONLY) {
2686                 ARG1_SET(ret, (U16)min);
2687                 ARG2_SET(ret, (U16)max);
2688             }
2689
2690             goto nest_check;
2691         }
2692     }
2693
2694     if (!ISMULT1(op)) {
2695         *flagp = flags;
2696         return(ret);
2697     }
2698
2699 #if 0                           /* Now runtime fix should be reliable. */
2700
2701     /* if this is reinstated, don't forget to put this back into perldiag:
2702
2703             =item Regexp *+ operand could be empty at {#} in regex m/%s/
2704
2705            (F) The part of the regexp subject to either the * or + quantifier
2706            could match an empty string. The {#} shows in the regular
2707            expression about where the problem was discovered.
2708
2709     */
2710
2711     if (!(flags&HASWIDTH) && op != '?')
2712       vFAIL("Regexp *+ operand could be empty");
2713 #endif
2714
2715     parse_start = RExC_parse;
2716     nextchar(pRExC_state);
2717
2718     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2719
2720     if (op == '*' && (flags&SIMPLE)) {
2721         reginsert(pRExC_state, STAR, ret);
2722         ret->flags = 0;
2723         RExC_naughty += 4;
2724     }
2725     else if (op == '*') {
2726         min = 0;
2727         goto do_curly;
2728     }
2729     else if (op == '+' && (flags&SIMPLE)) {
2730         reginsert(pRExC_state, PLUS, ret);
2731         ret->flags = 0;
2732         RExC_naughty += 3;
2733     }
2734     else if (op == '+') {
2735         min = 1;
2736         goto do_curly;
2737     }
2738     else if (op == '?') {
2739         min = 0; max = 1;
2740         goto do_curly;
2741     }
2742   nest_check:
2743     if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2744         vWARN3(RExC_parse,
2745                "%.*s matches null string many times",
2746                RExC_parse - origparse,
2747                origparse);
2748     }
2749
2750     if (*RExC_parse == '?') {
2751         nextchar(pRExC_state);
2752         reginsert(pRExC_state, MINMOD, ret);
2753         regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2754     }
2755     if (ISMULT2(RExC_parse)) {
2756         RExC_parse++;
2757         vFAIL("Nested quantifiers");
2758     }
2759
2760     return(ret);
2761 }
2762
2763 /*
2764  - regatom - the lowest level
2765  *
2766  * Optimization:  gobbles an entire sequence of ordinary characters so that
2767  * it can turn them into a single node, which is smaller to store and
2768  * faster to run.  Backslashed characters are exceptions, each becoming a
2769  * separate node; the code is simpler that way and it's not worth fixing.
2770  *
2771  * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2772 STATIC regnode *
2773 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2774 {
2775     register regnode *ret = 0;
2776     I32 flags;
2777     char *parse_start = 0;
2778
2779     *flagp = WORST;             /* Tentatively. */
2780
2781 tryagain:
2782     switch (*RExC_parse) {
2783     case '^':
2784         RExC_seen_zerolen++;
2785         nextchar(pRExC_state);
2786         if (RExC_flags & PMf_MULTILINE)
2787             ret = reg_node(pRExC_state, MBOL);
2788         else if (RExC_flags & PMf_SINGLELINE)
2789             ret = reg_node(pRExC_state, SBOL);
2790         else
2791             ret = reg_node(pRExC_state, BOL);
2792         Set_Node_Length(ret, 1); /* MJD */
2793         break;
2794     case '$':
2795         nextchar(pRExC_state);
2796         if (*RExC_parse)
2797             RExC_seen_zerolen++;
2798         if (RExC_flags & PMf_MULTILINE)
2799             ret = reg_node(pRExC_state, MEOL);
2800         else if (RExC_flags & PMf_SINGLELINE)
2801             ret = reg_node(pRExC_state, SEOL);
2802         else
2803             ret = reg_node(pRExC_state, EOL);
2804         Set_Node_Length(ret, 1); /* MJD */
2805         break;
2806     case '.':
2807         nextchar(pRExC_state);
2808         if (RExC_flags & PMf_SINGLELINE)
2809             ret = reg_node(pRExC_state, SANY);
2810         else
2811             ret = reg_node(pRExC_state, REG_ANY);
2812         *flagp |= HASWIDTH|SIMPLE;
2813         RExC_naughty++;
2814         Set_Node_Length(ret, 1); /* MJD */
2815         break;
2816     case '[':
2817     {
2818         char *oregcomp_parse = ++RExC_parse;
2819         ret = regclass(pRExC_state);
2820         if (*RExC_parse != ']') {
2821             RExC_parse = oregcomp_parse;
2822             vFAIL("Unmatched [");
2823         }
2824         nextchar(pRExC_state);
2825         *flagp |= HASWIDTH|SIMPLE;
2826         Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2827         break;
2828     }
2829     case '(':
2830         nextchar(pRExC_state);
2831         ret = reg(pRExC_state, 1, &flags);
2832         if (ret == NULL) {
2833                 if (flags & TRYAGAIN) {
2834                     if (RExC_parse == RExC_end) {
2835                          /* Make parent create an empty node if needed. */
2836                         *flagp |= TRYAGAIN;
2837                         return(NULL);
2838                     }
2839                     goto tryagain;
2840                 }
2841                 return(NULL);
2842         }
2843         *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2844         break;
2845     case '|':
2846     case ')':
2847         if (flags & TRYAGAIN) {
2848             *flagp |= TRYAGAIN;
2849             return NULL;
2850         }
2851         vFAIL("Internal urp");
2852                                 /* Supposed to be caught earlier. */
2853         break;
2854     case '{':
2855         if (!regcurly(RExC_parse)) {
2856             RExC_parse++;
2857             goto defchar;
2858         }
2859         /* FALL THROUGH */
2860     case '?':
2861     case '+':
2862     case '*':
2863         RExC_parse++;
2864         vFAIL("Quantifier follows nothing");
2865         break;
2866     case '\\':
2867         switch (*++RExC_parse) {
2868         case 'A':
2869             RExC_seen_zerolen++;
2870             ret = reg_node(pRExC_state, SBOL);
2871             *flagp |= SIMPLE;
2872             nextchar(pRExC_state);
2873             Set_Node_Length(ret, 2); /* MJD */
2874             break;
2875         case 'G':
2876             ret = reg_node(pRExC_state, GPOS);
2877             RExC_seen |= REG_SEEN_GPOS;
2878             *flagp |= SIMPLE;
2879             nextchar(pRExC_state);
2880             Set_Node_Length(ret, 2); /* MJD */
2881             break;
2882         case 'Z':
2883             ret = reg_node(pRExC_state, SEOL);
2884             *flagp |= SIMPLE;
2885             RExC_seen_zerolen++;                /* Do not optimize RE away */
2886             nextchar(pRExC_state);
2887             break;
2888         case 'z':
2889             ret = reg_node(pRExC_state, EOS);
2890             *flagp |= SIMPLE;
2891             RExC_seen_zerolen++;                /* Do not optimize RE away */
2892             nextchar(pRExC_state);
2893             Set_Node_Length(ret, 2); /* MJD */
2894             break;
2895         case 'C':
2896             ret = reg_node(pRExC_state, CANY);
2897             RExC_seen |= REG_SEEN_CANY;
2898             *flagp |= HASWIDTH|SIMPLE;
2899             nextchar(pRExC_state);
2900             Set_Node_Length(ret, 2); /* MJD */
2901             break;
2902         case 'X':
2903             ret = reg_node(pRExC_state, CLUMP);
2904             *flagp |= HASWIDTH;
2905             nextchar(pRExC_state);
2906             Set_Node_Length(ret, 2); /* MJD */
2907             break;
2908         case 'w':
2909             ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML     : ALNUM));
2910             *flagp |= HASWIDTH|SIMPLE;
2911             nextchar(pRExC_state);
2912             Set_Node_Length(ret, 2); /* MJD */
2913             break;
2914         case 'W':
2915             ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML    : NALNUM));
2916             *flagp |= HASWIDTH|SIMPLE;
2917             nextchar(pRExC_state);
2918             Set_Node_Length(ret, 2); /* MJD */
2919             break;
2920         case 'b':
2921             RExC_seen_zerolen++;
2922             RExC_seen |= REG_SEEN_LOOKBEHIND;
2923             ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL     : BOUND));
2924             *flagp |= SIMPLE;
2925             nextchar(pRExC_state);
2926             Set_Node_Length(ret, 2); /* MJD */
2927             break;
2928         case 'B':
2929             RExC_seen_zerolen++;
2930             RExC_seen |= REG_SEEN_LOOKBEHIND;
2931             ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL    : NBOUND));
2932             *flagp |= SIMPLE;
2933             nextchar(pRExC_state);
2934             Set_Node_Length(ret, 2); /* MJD */
2935             break;
2936         case 's':
2937             ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL     : SPACE));
2938             *flagp |= HASWIDTH|SIMPLE;
2939             nextchar(pRExC_state);
2940             Set_Node_Length(ret, 2); /* MJD */
2941             break;
2942         case 'S':
2943             ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL    : NSPACE));
2944             *flagp |= HASWIDTH|SIMPLE;
2945             nextchar(pRExC_state);
2946             Set_Node_Length(ret, 2); /* MJD */
2947             break;
2948         case 'd':
2949             ret = reg_node(pRExC_state, DIGIT);
2950             *flagp |= HASWIDTH|SIMPLE;
2951             nextchar(pRExC_state);
2952             Set_Node_Length(ret, 2); /* MJD */
2953             break;
2954         case 'D':
2955             ret = reg_node(pRExC_state, NDIGIT);
2956             *flagp |= HASWIDTH|SIMPLE;
2957             nextchar(pRExC_state);
2958             Set_Node_Length(ret, 2); /* MJD */
2959             break;
2960         case 'p':
2961         case 'P':
2962             {   
2963                 char* oldregxend = RExC_end;
2964                 char* parse_start = RExC_parse;
2965
2966                 if (RExC_parse[1] == '{') {
2967                   /* a lovely hack--pretend we saw [\pX] instead */
2968                     RExC_end = strchr(RExC_parse, '}');
2969                     if (!RExC_end) {
2970                         U8 c = (U8)*RExC_parse;
2971                         RExC_parse += 2;
2972                         RExC_end = oldregxend;
2973                         vFAIL2("Missing right brace on \\%c{}", c);
2974                     }
2975                     RExC_end++;
2976                 }
2977                 else {
2978                     RExC_end = RExC_parse + 2;
2979                     if (RExC_end > oldregxend)
2980                         RExC_end = oldregxend;
2981                 }
2982                 RExC_parse--;
2983
2984                 ret = regclass(pRExC_state);
2985
2986                 RExC_end = oldregxend;
2987                 RExC_parse--;
2988                 Set_Node_Cur_Length(ret); /* MJD */
2989                 nextchar(pRExC_state);
2990                 *flagp |= HASWIDTH|SIMPLE;
2991             }
2992             break;
2993         case 'n':
2994         case 'r':
2995         case 't':
2996         case 'f':
2997         case 'e':
2998         case 'a':
2999         case 'x':
3000         case 'c':
3001         case '0':
3002             goto defchar;
3003         case '1': case '2': case '3': case '4':
3004         case '5': case '6': case '7': case '8': case '9':
3005             {
3006                 I32 num = atoi(RExC_parse);
3007
3008                 if (num > 9 && num >= RExC_npar)
3009                     goto defchar;
3010                 else {
3011                     char * parse_start = RExC_parse - 1; /* MJD */
3012                     while (isDIGIT(*RExC_parse))
3013                         RExC_parse++;
3014
3015                     if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3016                         vFAIL("Reference to nonexistent group");
3017                     RExC_sawback = 1;
3018                     ret = reganode(pRExC_state,
3019                                    (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3020                                    num);
3021                     *flagp |= HASWIDTH;
3022                     
3023                     /* override incorrect value set in reganode MJD */
3024                     Set_Node_Offset(ret, parse_start+1); 
3025                     Set_Node_Cur_Length(ret); /* MJD */
3026                     RExC_parse--;
3027                     nextchar(pRExC_state);
3028                 }
3029             }
3030             break;
3031         case '\0':
3032             if (RExC_parse >= RExC_end)
3033                 FAIL("Trailing \\");
3034             /* FALL THROUGH */
3035         default:
3036             /* Do not generate `unrecognized' warnings here, we fall
3037                back into the quick-grab loop below */
3038             goto defchar;
3039         }
3040         break;
3041
3042     case '#':
3043         if (RExC_flags & PMf_EXTENDED) {
3044             while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3045             if (RExC_parse < RExC_end)
3046                 goto tryagain;
3047         }
3048         /* FALL THROUGH */
3049
3050     default: {
3051             register STRLEN len;
3052             register UV ender;
3053             register char *p;
3054             char *oldp, *s;
3055             STRLEN numlen;
3056             STRLEN foldlen;
3057             U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3058
3059             parse_start = RExC_parse - 1;
3060
3061             RExC_parse++;
3062
3063         defchar:
3064             ender = 0;
3065             ret = reg_node(pRExC_state,
3066                            (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3067             s = STRING(ret);
3068             for (len = 0, p = RExC_parse - 1;
3069               len < 127 && p < RExC_end;
3070               len++)
3071             {
3072                 oldp = p;
3073
3074                 if (RExC_flags & PMf_EXTENDED)
3075                     p = regwhite(p, RExC_end);
3076                 switch (*p) {
3077                 case '^':
3078                 case '$':
3079                 case '.':
3080                 case '[':
3081                 case '(':
3082                 case ')':
3083                 case '|':
3084                     goto loopdone;
3085                 case '\\':
3086                     switch (*++p) {
3087                     case 'A':
3088                     case 'C':
3089                     case 'X':
3090                     case 'G':
3091                     case 'Z':
3092                     case 'z':
3093                     case 'w':
3094                     case 'W':
3095                     case 'b':
3096                     case 'B':
3097                     case 's':
3098                     case 'S':
3099                     case 'd':
3100                     case 'D':
3101                     case 'p':
3102                     case 'P':
3103                         --p;
3104                         goto loopdone;
3105                     case 'n':
3106                         ender = '\n';
3107                         p++;
3108                         break;
3109                     case 'r':
3110                         ender = '\r';
3111                         p++;
3112                         break;
3113                     case 't':
3114                         ender = '\t';
3115                         p++;
3116                         break;
3117                     case 'f':
3118                         ender = '\f';
3119                         p++;
3120                         break;
3121                     case 'e':
3122                           ender = ASCII_TO_NATIVE('\033');
3123                         p++;
3124                         break;
3125                     case 'a':
3126                           ender = ASCII_TO_NATIVE('\007');
3127                         p++;
3128                         break;
3129                     case 'x':
3130                         if (*++p == '{') {
3131                             char* e = strchr(p, '}');
3132         
3133                             if (!e) {
3134                                 RExC_parse = p + 1;
3135                                 vFAIL("Missing right brace on \\x{}");
3136                             }
3137                             else {
3138                                 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3139                                     | PERL_SCAN_DISALLOW_PREFIX;
3140                                 numlen = e - p - 1;
3141                                 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3142                                 if (ender > 0xff)
3143                                     RExC_utf8 = 1;
3144                                 /* numlen is generous */
3145                                 if (numlen + len >= 127) {
3146                                     p--;
3147                                     goto loopdone;
3148                                 }
3149                                 p = e + 1;
3150                             }
3151                         }
3152                         else {
3153                             I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3154                             numlen = 2;
3155                             ender = grok_hex(p, &numlen, &flags, NULL);
3156                             p += numlen;
3157                         }
3158                         break;
3159                     case 'c':
3160                         p++;
3161                         ender = UCHARAT(p++);
3162                         ender = toCTRL(ender);
3163                         break;
3164                     case '0': case '1': case '2': case '3':case '4':
3165                     case '5': case '6': case '7': case '8':case '9':
3166                         if (*p == '0' ||
3167                           (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3168                             I32 flags = 0;
3169                             numlen = 3;
3170                             ender = grok_oct(p, &numlen, &flags, NULL);
3171                             p += numlen;
3172                         }
3173                         else {
3174                             --p;
3175                             goto loopdone;
3176                         }
3177                         break;
3178                     case '\0':
3179                         if (p >= RExC_end)
3180                             FAIL("Trailing \\");
3181                         /* FALL THROUGH */
3182                     default:
3183                         if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3184                             vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3185                         goto normal_default;
3186                     }
3187                     break;
3188                 default:
3189                   normal_default:
3190                     if (UTF8_IS_START(*p) && UTF) {
3191                         ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3192                                                &numlen, 0);
3193                         p += numlen;
3194                     }
3195                     else
3196                         ender = *p++;
3197                     break;
3198                 }
3199                 if (RExC_flags & PMf_EXTENDED)
3200                     p = regwhite(p, RExC_end);
3201                 if (UTF && FOLD) {
3202                     /* Prime the casefolded buffer. */
3203                     ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3204                 }
3205                 if (ISMULT2(p)) { /* Back off on ?+*. */
3206                     if (len)
3207                         p = oldp;
3208                     else if (UTF) {
3209                          STRLEN unilen;
3210
3211                          if (FOLD) {
3212                               /* Emit all the Unicode characters. */
3213                               for (foldbuf = tmpbuf;
3214                                    foldlen;
3215                                    foldlen -= numlen) {
3216                                    ender = utf8_to_uvchr(foldbuf, &numlen);
3217                                    if (numlen > 0) {
3218                                         reguni(pRExC_state, ender, s, &unilen);
3219                                         s       += unilen;
3220                                         len     += unilen;
3221                                         /* In EBCDIC the numlen
3222                                          * and unilen can differ. */
3223                                         foldbuf += numlen;
3224                                         if (numlen >= foldlen)
3225                                              break;
3226                                    }
3227                                    else
3228                                         break; /* "Can't happen." */
3229                               }
3230                          }
3231                          else {
3232                               reguni(pRExC_state, ender, s, &unilen);
3233                               if (unilen > 0) {
3234                                    s   += unilen;
3235                                    len += unilen;
3236                               }
3237                          }
3238                     }
3239                     else {
3240                         len++;
3241                         REGC((char)ender, s++);
3242                     }
3243                     break;
3244                 }
3245                 if (UTF) {
3246                      STRLEN unilen;
3247
3248                      if (FOLD) {
3249                           /* Emit all the Unicode characters. */
3250                           for (foldbuf = tmpbuf;
3251                                foldlen;
3252                                foldlen -= numlen) {
3253                                ender = utf8_to_uvchr(foldbuf, &numlen);
3254                                if (numlen > 0) {
3255                                     reguni(pRExC_state, ender, s, &unilen);
3256                                     len     += unilen;
3257                                     s       += unilen;
3258                                     /* In EBCDIC the numlen
3259                                      * and unilen can differ. */
3260                                     foldbuf += numlen;
3261                                     if (numlen >= foldlen)
3262                                          break;
3263                                }
3264                                else
3265                                     break;
3266                           }
3267                      }
3268                      else {
3269                           reguni(pRExC_state, ender, s, &unilen);
3270                           if (unilen > 0) {
3271                                s   += unilen;
3272                                len += unilen;
3273                           }
3274                      }
3275                      len--;
3276                 }
3277                 else
3278                     REGC((char)ender, s++);
3279             }
3280         loopdone:
3281             RExC_parse = p - 1;
3282             Set_Node_Cur_Length(ret); /* MJD */
3283             nextchar(pRExC_state);
3284             {
3285                 /* len is STRLEN which is unsigned, need to copy to signed */
3286                 IV iv = len;
3287                 if (iv < 0)
3288                     vFAIL("Internal disaster");
3289             }
3290             if (len > 0)
3291                 *flagp |= HASWIDTH;
3292             if (len == 1)
3293                 *flagp |= SIMPLE;
3294             if (!SIZE_ONLY)
3295                 STR_LEN(ret) = len;
3296             if (SIZE_ONLY)
3297                 RExC_size += STR_SZ(len);
3298             else
3299                 RExC_emit += STR_SZ(len);
3300         }
3301         break;
3302     }
3303
3304     /* If the encoding pragma is in effect recode the text of
3305      * any EXACT-kind nodes. */
3306     if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3307         STRLEN oldlen = STR_LEN(ret);
3308         SV *sv        = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3309
3310         if (RExC_utf8)
3311             SvUTF8_on(sv);
3312         if (sv_utf8_downgrade(sv, TRUE)) {
3313             char *s       = sv_recode_to_utf8(sv, PL_encoding);
3314             STRLEN newlen = SvCUR(sv);
3315
3316             if (SvUTF8(sv))
3317                 RExC_utf8 = 1;
3318             if (!SIZE_ONLY) {
3319                 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3320                                       (int)oldlen, STRING(ret),
3321                                       (int)newlen, s));
3322                 Copy(s, STRING(ret), newlen, char);
3323                 STR_LEN(ret) += newlen - oldlen;
3324                 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3325             } else
3326                 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3327         }
3328     }
3329
3330     return(ret);
3331 }
3332
3333 STATIC char *
3334 S_regwhite(pTHX_ char *p, char *e)
3335 {
3336     while (p < e) {
3337         if (isSPACE(*p))
3338             ++p;
3339         else if (*p == '#') {
3340             do {
3341                 p++;
3342             } while (p < e && *p != '\n');
3343         }
3344         else
3345             break;
3346     }
3347     return p;
3348 }
3349
3350 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3351    Character classes ([:foo:]) can also be negated ([:^foo:]).
3352    Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3353    Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3354    but trigger failures because they are currently unimplemented. */
3355
3356 #define POSIXCC_DONE(c)   ((c) == ':')
3357 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3358 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3359
3360 STATIC I32
3361 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3362 {
3363     char *posixcc = 0;
3364     I32 namedclass = OOB_NAMEDCLASS;
3365
3366     if (value == '[' && RExC_parse + 1 < RExC_end &&
3367         /* I smell either [: or [= or [. -- POSIX has been here, right? */
3368         POSIXCC(UCHARAT(RExC_parse))) {
3369         char  c = UCHARAT(RExC_parse);
3370         char* s = RExC_parse++;
3371         
3372         while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3373             RExC_parse++;
3374         if (RExC_parse == RExC_end)
3375             /* Grandfather lone [:, [=, [. */
3376             RExC_parse = s;
3377         else {
3378             char* t = RExC_parse++; /* skip over the c */
3379
3380             if (UCHARAT(RExC_parse) == ']') {
3381                 RExC_parse++; /* skip over the ending ] */
3382                 posixcc = s + 1;
3383                 if (*s == ':') {
3384                     I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3385                     I32 skip = 5; /* the most common skip */
3386
3387                     switch (*posixcc) {
3388                     case 'a':
3389                         if (strnEQ(posixcc, "alnum", 5))
3390                             namedclass =
3391                                 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3392                         else if (strnEQ(posixcc, "alpha", 5))
3393                             namedclass =
3394                                 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3395                         else if (strnEQ(posixcc, "ascii", 5))
3396                             namedclass =
3397                                 complement ? ANYOF_NASCII : ANYOF_ASCII;
3398                         break;
3399                     case 'b':
3400                         if (strnEQ(posixcc, "blank", 5))
3401                             namedclass =
3402                                 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3403                         break;
3404                     case 'c':
3405                         if (strnEQ(posixcc, "cntrl", 5))
3406                             namedclass =
3407                                 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3408                         break;
3409                     case 'd':
3410                         if (strnEQ(posixcc, "digit", 5))
3411                             namedclass =
3412                                 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3413                         break;
3414                     case 'g':
3415                         if (strnEQ(posixcc, "graph", 5))
3416                             namedclass =
3417                                 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3418                         break;
3419                     case 'l':
3420                         if (strnEQ(posixcc, "lower", 5))
3421                             namedclass =
3422                                 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3423                         break;
3424                     case 'p':
3425                         if (strnEQ(posixcc, "print", 5))
3426                             namedclass =
3427                                 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3428                         else if (strnEQ(posixcc, "punct", 5))
3429                             namedclass =
3430                                 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3431                         break;
3432                     case 's':
3433                         if (strnEQ(posixcc, "space", 5))
3434                             namedclass =
3435                                 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3436                         break;
3437                     case 'u':
3438                         if (strnEQ(posixcc, "upper", 5))
3439                             namedclass =
3440                                 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3441                         break;
3442                     case 'w': /* this is not POSIX, this is the Perl \w */
3443                         if (strnEQ(posixcc, "word", 4)) {
3444                             namedclass =
3445                                 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3446                             skip = 4;
3447                         }
3448                         break;
3449                     case 'x':
3450                         if (strnEQ(posixcc, "xdigit", 6)) {
3451                             namedclass =
3452                                 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3453                             skip = 6;
3454                         }
3455                         break;
3456                     }
3457                     if (namedclass == OOB_NAMEDCLASS ||
3458                         posixcc[skip] != ':' ||
3459                         posixcc[skip+1] != ']')
3460                     {
3461                         Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3462                                       t - s - 1, s + 1);
3463                     }
3464                 } else if (!SIZE_ONLY) {
3465                     /* [[=foo=]] and [[.foo.]] are still future. */
3466
3467                     /* adjust RExC_parse so the warning shows after
3468                        the class closes */
3469                     while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3470                         RExC_parse++;
3471                     Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3472                 }
3473             } else {
3474                 /* Maternal grandfather:
3475                  * "[:" ending in ":" but not in ":]" */
3476                 RExC_parse = s;
3477             }
3478         }
3479     }
3480
3481     return namedclass;
3482 }
3483
3484 STATIC void
3485 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3486 {
3487     if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3488         char *s = RExC_parse;
3489         char  c = *s++;
3490
3491         while(*s && isALNUM(*s))
3492             s++;
3493         if (*s && c == *s && s[1] == ']') {
3494             if (ckWARN(WARN_REGEXP))
3495                 vWARN3(s+2,
3496                         "POSIX syntax [%c %c] belongs inside character classes",
3497                         c, c);
3498
3499             /* [[=foo=]] and [[.foo.]] are still future. */
3500             if (POSIXCC_NOTYET(c)) {
3501                 /* adjust RExC_parse so the error shows after
3502                    the class closes */
3503                 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3504                     ;
3505                 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3506             }
3507         }
3508     }
3509 }
3510
3511 STATIC regnode *
3512 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3513 {
3514     register UV value;
3515     register UV nextvalue;
3516     register IV prevvalue = OOB_UNICODE;
3517     register IV range = 0;
3518     register regnode *ret;
3519     STRLEN numlen;
3520     IV namedclass;
3521     char *rangebegin = 0;
3522     bool need_class = 0;
3523     SV *listsv = Nullsv;
3524     register char *e;
3525     UV n;
3526     bool optimize_invert   = TRUE;
3527     AV* unicode_alternate  = 0;
3528 #ifdef EBCDIC
3529     UV literal_endpoint = 0;
3530 #endif
3531
3532     ret = reganode(pRExC_state, ANYOF, 0);
3533
3534     if (!SIZE_ONLY)
3535         ANYOF_FLAGS(ret) = 0;
3536
3537     if (UCHARAT(RExC_parse) == '^') {   /* Complement of range. */
3538         RExC_naughty++;
3539         RExC_parse++;
3540         if (!SIZE_ONLY)
3541             ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3542     }
3543
3544     if (SIZE_ONLY)
3545         RExC_size += ANYOF_SKIP;
3546     else {
3547         RExC_emit += ANYOF_SKIP;
3548         if (FOLD)
3549             ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3550         if (LOC)
3551             ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3552         ANYOF_BITMAP_ZERO(ret);
3553         listsv = newSVpvn("# comment\n", 10);
3554     }
3555
3556     nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3557
3558     if (!SIZE_ONLY && POSIXCC(nextvalue))
3559         checkposixcc(pRExC_state);
3560
3561     /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3562     if (UCHARAT(RExC_parse) == ']')
3563         goto charclassloop;
3564
3565     while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3566
3567     charclassloop:
3568
3569         namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3570
3571         if (!range)
3572             rangebegin = RExC_parse;
3573         if (UTF) {
3574             value = utf8n_to_uvchr((U8*)RExC_parse,
3575                                    RExC_end - RExC_parse,
3576                                    &numlen, 0);
3577             RExC_parse += numlen;
3578         }
3579         else
3580             value = UCHARAT(RExC_parse++);
3581         nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3582         if (value == '[' && POSIXCC(nextvalue))
3583             namedclass = regpposixcc(pRExC_state, value);
3584         else if (value == '\\') {
3585             if (UTF) {
3586                 value = utf8n_to_uvchr((U8*)RExC_parse,
3587                                    RExC_end - RExC_parse,
3588                                    &numlen, 0);
3589                 RExC_parse += numlen;
3590             }
3591             else
3592                 value = UCHARAT(RExC_parse++);
3593             /* Some compilers cannot handle switching on 64-bit integer
3594              * values, therefore value cannot be an UV.  Yes, this will
3595              * be a problem later if we want switch on Unicode.
3596              * A similar issue a little bit later when switching on
3597              * namedclass. --jhi */
3598             switch ((I32)value) {
3599             case 'w':   namedclass = ANYOF_ALNUM;       break;
3600             case 'W':   namedclass = ANYOF_NALNUM;      break;
3601             case 's':   namedclass = ANYOF_SPACE;       break;
3602             case 'S':   namedclass = ANYOF_NSPACE;      break;
3603             case 'd':   namedclass = ANYOF_DIGIT;       break;
3604             case 'D':   namedclass = ANYOF_NDIGIT;      break;
3605             case 'p':
3606             case 'P':
3607                 if (RExC_parse >= RExC_end)
3608                     vFAIL2("Empty \\%c{}", (U8)value);
3609                 if (*RExC_parse == '{') {
3610                     U8 c = (U8)value;
3611                     e = strchr(RExC_parse++, '}');
3612                     if (!e)
3613                         vFAIL2("Missing right brace on \\%c{}", c);
3614                     while (isSPACE(UCHARAT(RExC_parse)))
3615                         RExC_parse++;
3616                     if (e == RExC_parse)
3617                         vFAIL2("Empty \\%c{}", c);
3618                     n = e - RExC_parse;
3619                     while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3620                         n--;
3621                 }
3622                 else {
3623                     e = RExC_parse;
3624                     n = 1;
3625                 }
3626                 if (!SIZE_ONLY) {
3627                     if (UCHARAT(RExC_parse) == '^') {
3628                          RExC_parse++;
3629                          n--;
3630                          value = value == 'p' ? 'P' : 'p'; /* toggle */
3631                          while (isSPACE(UCHARAT(RExC_parse))) {
3632                               RExC_parse++;
3633                               n--;
3634                          }
3635                     }
3636                     if (value == 'p')
3637                          Perl_sv_catpvf(aTHX_ listsv,
3638                                         "+utf8::%.*s\n", (int)n, RExC_parse);
3639                     else
3640                          Perl_sv_catpvf(aTHX_ listsv,
3641                                         "!utf8::%.*s\n", (int)n, RExC_parse);
3642                 }
3643                 RExC_parse = e + 1;
3644                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3645                 continue;
3646             case 'n':   value = '\n';                   break;
3647             case 'r':   value = '\r';                   break;
3648             case 't':   value = '\t';                   break;
3649             case 'f':   value = '\f';                   break;
3650             case 'b':   value = '\b';                   break;
3651             case 'e':   value = ASCII_TO_NATIVE('\033');break;
3652             case 'a':   value = ASCII_TO_NATIVE('\007');break;
3653             case 'x':
3654                 if (*RExC_parse == '{') {
3655                     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3656                         | PERL_SCAN_DISALLOW_PREFIX;
3657                     e = strchr(RExC_parse++, '}');
3658                     if (!e)
3659                         vFAIL("Missing right brace on \\x{}");
3660
3661                     numlen = e - RExC_parse;
3662                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3663                     RExC_parse = e + 1;
3664                 }
3665                 else {
3666                     I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3667                     numlen = 2;
3668                     value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3669                     RExC_parse += numlen;
3670                 }
3671                 break;
3672             case 'c':
3673                 value = UCHARAT(RExC_parse++);
3674                 value = toCTRL(value);
3675                 break;
3676             case '0': case '1': case '2': case '3': case '4':
3677             case '5': case '6': case '7': case '8': case '9':
3678             {
3679                 I32 flags = 0;
3680                 numlen = 3;
3681                 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3682                 RExC_parse += numlen;
3683                 break;
3684             }
3685             default:
3686                 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3687                     vWARN2(RExC_parse,
3688                            "Unrecognized escape \\%c in character class passed through",
3689                            (int)value);
3690                 break;
3691             }
3692         } /* end of \blah */
3693 #ifdef EBCDIC
3694         else
3695             literal_endpoint++;
3696 #endif
3697
3698         if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3699
3700             if (!SIZE_ONLY && !need_class)
3701                 ANYOF_CLASS_ZERO(ret);
3702
3703             need_class = 1;
3704
3705             /* a bad range like a-\d, a-[:digit:] ? */
3706             if (range) {
3707                 if (!SIZE_ONLY) {
3708                     if (ckWARN(WARN_REGEXP))
3709                         vWARN4(RExC_parse,
3710                                "False [] range \"%*.*s\"",
3711                                RExC_parse - rangebegin,
3712                                RExC_parse - rangebegin,
3713                                rangebegin);
3714                     if (prevvalue < 256) {
3715                         ANYOF_BITMAP_SET(ret, prevvalue);
3716                         ANYOF_BITMAP_SET(ret, '-');
3717                     }
3718                     else {
3719                         ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3720                         Perl_sv_catpvf(aTHX_ listsv,
3721                                        "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3722                     }
3723                 }
3724
3725                 range = 0; /* this was not a true range */
3726             }
3727
3728             if (!SIZE_ONLY) {
3729                 if (namedclass > OOB_NAMEDCLASS)
3730                     optimize_invert = FALSE;
3731                 /* Possible truncation here but in some 64-bit environments
3732                  * the compiler gets heartburn about switch on 64-bit values.
3733                  * A similar issue a little earlier when switching on value.
3734                  * --jhi */
3735                 switch ((I32)namedclass) {
3736                 case ANYOF_ALNUM:
3737                     if (LOC)
3738                         ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3739                     else {
3740                         for (value = 0; value < 256; value++)
3741                             if (isALNUM(value))
3742                                 ANYOF_BITMAP_SET(ret, value);
3743                     }
3744                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");    
3745                     break;
3746                 case ANYOF_NALNUM:
3747                     if (LOC)
3748                         ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3749                     else {
3750                         for (value = 0; value < 256; value++)
3751                             if (!isALNUM(value))
3752                                 ANYOF_BITMAP_SET(ret, value);
3753                     }
3754                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3755                     break;
3756                 case ANYOF_ALNUMC:
3757                     if (LOC)
3758                         ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3759                     else {
3760                         for (value = 0; value < 256; value++)
3761                             if (isALNUMC(value))
3762                                 ANYOF_BITMAP_SET(ret, value);
3763                     }
3764                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3765                     break;
3766                 case ANYOF_NALNUMC:
3767                     if (LOC)
3768                         ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3769                     else {
3770                         for (value = 0; value < 256; value++)
3771                             if (!isALNUMC(value))
3772                                 ANYOF_BITMAP_SET(ret, value);
3773                     }
3774                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3775                     break;
3776                 case ANYOF_ALPHA:
3777                     if (LOC)
3778                         ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3779                     else {
3780                         for (value = 0; value < 256; value++)
3781                             if (isALPHA(value))
3782                                 ANYOF_BITMAP_SET(ret, value);
3783                     }
3784                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3785                     break;
3786                 case ANYOF_NALPHA:
3787                     if (LOC)
3788                         ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3789                     else {
3790                         for (value = 0; value < 256; value++)
3791                             if (!isALPHA(value))
3792                                 ANYOF_BITMAP_SET(ret, value);
3793                     }
3794                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3795                     break;
3796                 case ANYOF_ASCII:
3797                     if (LOC)
3798                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3799                     else {
3800 #ifndef EBCDIC
3801                         for (value = 0; value < 128; value++)
3802                             ANYOF_BITMAP_SET(ret, value);
3803 #else  /* EBCDIC */
3804                         for (value = 0; value < 256; value++) {
3805                             if (isASCII(value))
3806                                 ANYOF_BITMAP_SET(ret, value);
3807                         }
3808 #endif /* EBCDIC */
3809                     }
3810                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3811                     break;
3812                 case ANYOF_NASCII:
3813                     if (LOC)
3814                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3815                     else {
3816 #ifndef EBCDIC
3817                         for (value = 128; value < 256; value++)
3818                             ANYOF_BITMAP_SET(ret, value);
3819 #else  /* EBCDIC */
3820                         for (value = 0; value < 256; value++) {
3821                             if (!isASCII(value))
3822                                 ANYOF_BITMAP_SET(ret, value);
3823                         }
3824 #endif /* EBCDIC */
3825                     }
3826                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3827                     break;
3828                 case ANYOF_BLANK:
3829                     if (LOC)
3830                         ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3831                     else {
3832                         for (value = 0; value < 256; value++)
3833                             if (isBLANK(value))
3834                                 ANYOF_BITMAP_SET(ret, value);
3835                     }
3836                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3837                     break;
3838                 case ANYOF_NBLANK:
3839                     if (LOC)
3840                         ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3841                     else {
3842                         for (value = 0; value < 256; value++)
3843                             if (!isBLANK(value))
3844                                 ANYOF_BITMAP_SET(ret, value);
3845                     }
3846                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3847                     break;
3848                 case ANYOF_CNTRL:
3849                     if (LOC)
3850                         ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3851                     else {
3852                         for (value = 0; value < 256; value++)
3853                             if (isCNTRL(value))
3854                                 ANYOF_BITMAP_SET(ret, value);
3855                     }
3856                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3857                     break;
3858                 case ANYOF_NCNTRL:
3859                     if (LOC)
3860                         ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3861                     else {
3862                         for (value = 0; value < 256; value++)
3863                             if (!isCNTRL(value))
3864                                 ANYOF_BITMAP_SET(ret, value);
3865                     }
3866                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3867                     break;
3868                 case ANYOF_DIGIT:
3869                     if (LOC)
3870                         ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3871                     else {
3872                         /* consecutive digits assumed */
3873                         for (value = '0'; value <= '9'; value++)
3874                             ANYOF_BITMAP_SET(ret, value);
3875                     }
3876                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3877                     break;
3878                 case ANYOF_NDIGIT:
3879                     if (LOC)
3880                         ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3881                     else {
3882                         /* consecutive digits assumed */
3883                         for (value = 0; value < '0'; value++)
3884                             ANYOF_BITMAP_SET(ret, value);
3885                         for (value = '9' + 1; value < 256; value++)
3886                             ANYOF_BITMAP_SET(ret, value);
3887                     }
3888                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3889                     break;
3890                 case ANYOF_GRAPH:
3891                     if (LOC)
3892                         ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3893                     else {
3894                         for (value = 0; value < 256; value++)
3895                             if (isGRAPH(value))
3896                                 ANYOF_BITMAP_SET(ret, value);
3897                     }
3898                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3899                     break;
3900                 case ANYOF_NGRAPH:
3901                     if (LOC)
3902                         ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3903                     else {
3904                         for (value = 0; value < 256; value++)
3905                             if (!isGRAPH(value))
3906                                 ANYOF_BITMAP_SET(ret, value);
3907                     }
3908                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3909                     break;
3910                 case ANYOF_LOWER:
3911                     if (LOC)
3912                         ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3913                     else {
3914                         for (value = 0; value < 256; value++)
3915                             if (isLOWER(value))
3916                                 ANYOF_BITMAP_SET(ret, value);
3917                     }
3918                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3919                     break;
3920                 case ANYOF_NLOWER:
3921                     if (LOC)
3922                         ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3923                     else {
3924                         for (value = 0; value < 256; value++)
3925                             if (!isLOWER(value))
3926                                 ANYOF_BITMAP_SET(ret, value);
3927                     }
3928                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3929                     break;
3930                 case ANYOF_PRINT:
3931                     if (LOC)
3932                         ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3933                     else {
3934                         for (value = 0; value < 256; value++)
3935                             if (isPRINT(value))
3936                                 ANYOF_BITMAP_SET(ret, value);
3937                     }
3938                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3939                     break;
3940                 case ANYOF_NPRINT:
3941                     if (LOC)
3942                         ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3943                     else {
3944                         for (value = 0; value < 256; value++)
3945                             if (!isPRINT(value))
3946                                 ANYOF_BITMAP_SET(ret, value);
3947                     }
3948                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3949                     break;
3950                 case ANYOF_PSXSPC:
3951                     if (LOC)
3952                         ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3953                     else {
3954                         for (value = 0; value < 256; value++)
3955                             if (isPSXSPC(value))
3956                                 ANYOF_BITMAP_SET(ret, value);
3957                     }
3958                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3959                     break;
3960                 case ANYOF_NPSXSPC:
3961                     if (LOC)
3962                         ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3963                     else {
3964                         for (value = 0; value < 256; value++)
3965                             if (!isPSXSPC(value))
3966                                 ANYOF_BITMAP_SET(ret, value);
3967                     }
3968                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3969                     break;
3970                 case ANYOF_PUNCT:
3971                     if (LOC)
3972                         ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3973                     else {
3974                         for (value = 0; value < 256; value++)
3975                             if (isPUNCT(value))
3976                                 ANYOF_BITMAP_SET(ret, value);
3977                     }
3978                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3979                     break;
3980                 case ANYOF_NPUNCT:
3981                     if (LOC)
3982                         ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3983                     else {
3984                         for (value = 0; value < 256; value++)
3985                             if (!isPUNCT(value))
3986                                 ANYOF_BITMAP_SET(ret, value);
3987                     }
3988                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3989                     break;
3990                 case ANYOF_SPACE:
3991                     if (LOC)
3992                         ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3993                     else {
3994                         for (value = 0; value < 256; value++)
3995                             if (isSPACE(value))
3996                                 ANYOF_BITMAP_SET(ret, value);
3997                     }
3998                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3999                     break;
4000                 case ANYOF_NSPACE:
4001                     if (LOC)
4002                         ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4003                     else {
4004                         for (value = 0; value < 256; value++)
4005                             if (!isSPACE(value))
4006                                 ANYOF_BITMAP_SET(ret, value);
4007                     }
4008                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4009                     break;
4010                 case ANYOF_UPPER:
4011                     if (LOC)
4012                         ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4013                     else {
4014                         for (value = 0; value < 256; value++)
4015                             if (isUPPER(value))
4016                                 ANYOF_BITMAP_SET(ret, value);
4017                     }
4018                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4019                     break;
4020                 case ANYOF_NUPPER:
4021                     if (LOC)
4022                         ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4023                     else {
4024                         for (value = 0; value < 256; value++)
4025                             if (!isUPPER(value))
4026                                 ANYOF_BITMAP_SET(ret, value);
4027                     }
4028                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4029                     break;
4030                 case ANYOF_XDIGIT:
4031                     if (LOC)
4032                         ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4033                     else {
4034                         for (value = 0; value < 256; value++)
4035                             if (isXDIGIT(value))
4036                                 ANYOF_BITMAP_SET(ret, value);
4037                     }
4038                     Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4039                     break;
4040                 case ANYOF_NXDIGIT:
4041                     if (LOC)
4042                         ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4043                     else {
4044                         for (value = 0; value < 256; value++)
4045                             if (!isXDIGIT(value))
4046                                 ANYOF_BITMAP_SET(ret, value);
4047                     }
4048                     Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4049                     break;
4050                 default:
4051                     vFAIL("Invalid [::] class");
4052                     break;
4053                 }
4054                 if (LOC)
4055                     ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4056                 continue;
4057             }
4058         } /* end of namedclass \blah */
4059
4060         if (range) {
4061             if (prevvalue > (IV)value) /* b-a */ {
4062                 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4063                               RExC_parse - rangebegin,
4064                               RExC_parse - rangebegin,
4065                               rangebegin);
4066                 range = 0; /* not a valid range */
4067             }
4068         }
4069         else {
4070             prevvalue = value; /* save the beginning of the range */
4071             if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4072                 RExC_parse[1] != ']') {
4073                 RExC_parse++;
4074
4075                 /* a bad range like \w-, [:word:]- ? */
4076                 if (namedclass > OOB_NAMEDCLASS) {
4077                     if (ckWARN(WARN_REGEXP))
4078                         vWARN4(RExC_parse,
4079                                "False [] range \"%*.*s\"",
4080                                RExC_parse - rangebegin,
4081                                RExC_parse - rangebegin,
4082                                rangebegin);
4083                     if (!SIZE_ONLY)
4084                         ANYOF_BITMAP_SET(ret, '-');
4085                 } else
4086                     range = 1;  /* yeah, it's a range! */
4087                 continue;       /* but do it the next time */
4088             }
4089         }
4090
4091         /* now is the next time */
4092         if (!SIZE_ONLY) {
4093             IV i;
4094
4095             if (prevvalue < 256) {
4096                 IV ceilvalue = value < 256 ? value : 255;
4097
4098 #ifdef EBCDIC
4099                 /* In EBCDIC [\x89-\x91] should include
4100                  * the \x8e but [i-j] should not. */
4101                 if (literal_endpoint == 2 &&
4102                     ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4103                      (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4104                 {
4105                     if (isLOWER(prevvalue)) {
4106                         for (i = prevvalue; i <= ceilvalue; i++)
4107                             if (isLOWER(i))
4108                                 ANYOF_BITMAP_SET(ret, i);
4109                     } else {
4110                         for (i = prevvalue; i <= ceilvalue; i++)
4111                             if (isUPPER(i))
4112                                 ANYOF_BITMAP_SET(ret, i);
4113                     }
4114                 }
4115                 else
4116 #endif
4117                       for (i = prevvalue; i <= ceilvalue; i++)
4118                           ANYOF_BITMAP_SET(ret, i);
4119           }
4120           if (value > 255 || UTF) {
4121                 UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
4122                 UV natvalue      = NATIVE_TO_UNI(value);
4123
4124                 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4125                 if (prevnatvalue < natvalue) { /* what about > ? */
4126                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4127                                    prevnatvalue, natvalue);
4128                 }
4129                 else if (prevnatvalue == natvalue) {
4130                     Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4131                     if (FOLD) {
4132                          U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4133                          STRLEN foldlen;
4134                          UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4135
4136                          /* If folding and foldable and a single
4137                           * character, insert also the folded version
4138                           * to the charclass. */
4139                          if (f != value) {
4140                               if (foldlen == (STRLEN)UNISKIP(f))
4141                                   Perl_sv_catpvf(aTHX_ listsv,
4142                                                  "%04"UVxf"\n", f);
4143                               else {
4144                                   /* Any multicharacter foldings
4145                                    * require the following transform:
4146                                    * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4147                                    * where E folds into "pq" and F folds
4148                                    * into "rst", all other characters
4149                                    * fold to single characters.  We save
4150                                    * away these multicharacter foldings,
4151                                    * to be later saved as part of the
4152                                    * additional "s" data. */
4153                                   SV *sv;
4154
4155                                   if (!unicode_alternate)
4156                                       unicode_alternate = newAV();
4157                                   sv = newSVpvn((char*)foldbuf, foldlen);
4158                                   SvUTF8_on(sv);
4159                                   av_push(unicode_alternate, sv);
4160                               }
4161                          }
4162
4163                          /* If folding and the value is one of the Greek
4164                           * sigmas insert a few more sigmas to make the
4165                           * folding rules of the sigmas to work right.
4166                           * Note that not all the possible combinations
4167                           * are handled here: some of them are handled
4168                           * by the standard folding rules, and some of
4169                           * them (literal or EXACTF cases) are handled
4170                           * during runtime in regexec.c:S_find_byclass(). */
4171                          if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4172                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4173                                              (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4174                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4175                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4176                          }
4177                          else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4178                               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4179                                              (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4180                     }
4181                 }
4182             }
4183 #ifdef EBCDIC
4184             literal_endpoint = 0;
4185 #endif
4186         }
4187
4188         range = 0; /* this range (if it was one) is done now */
4189     }
4190
4191     if (need_class) {
4192         ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4193         if (SIZE_ONLY)
4194             RExC_size += ANYOF_CLASS_ADD_SKIP;
4195         else
4196             RExC_emit += ANYOF_CLASS_ADD_SKIP;
4197     }
4198
4199     /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4200     if (!SIZE_ONLY &&
4201          /* If the only flag is folding (plus possibly inversion). */
4202         ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4203        ) {
4204         for (value = 0; value < 256; ++value) {
4205             if (ANYOF_BITMAP_TEST(ret, value)) {
4206                 UV fold = PL_fold[value];
4207
4208                 if (fold != value)
4209                     ANYOF_BITMAP_SET(ret, fold);
4210             }
4211         }
4212         ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4213     }
4214
4215     /* optimize inverted simple patterns (e.g. [^a-z]) */
4216     if (!SIZE_ONLY && optimize_invert &&
4217         /* If the only flag is inversion. */
4218         (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4219         for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4220             ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4221         ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4222     }
4223
4224     if (!SIZE_ONLY) {
4225         AV *av = newAV();
4226         SV *rv;
4227
4228         /* The 0th element stores the character class description
4229          * in its textual form: used later (regexec.c:Perl_regclass_swash())
4230          * to initialize the appropriate swash (which gets stored in
4231          * the 1st element), and also useful for dumping the regnode.
4232          * The 2nd element stores the multicharacter foldings,
4233          * used later (regexec.c:S_reginclass()). */
4234         av_store(av, 0, listsv);
4235         av_store(av, 1, NULL);
4236         av_store(av, 2, (SV*)unicode_alternate);
4237         rv = newRV_noinc((SV*)av);
4238         n = add_data(pRExC_state, 1, "s");
4239         RExC_rx->data->data[n] = (void*)rv;
4240         ARG_SET(ret, n);
4241     }
4242
4243     return ret;
4244 }
4245
4246 STATIC char*
4247 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4248 {
4249     char* retval = RExC_parse++;
4250
4251     for (;;) {
4252         if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4253                 RExC_parse[2] == '#') {
4254             while (*RExC_parse && *RExC_parse != ')')
4255                 RExC_parse++;
4256             RExC_parse++;
4257             continue;
4258         }
4259         if (RExC_flags & PMf_EXTENDED) {
4260             if (isSPACE(*RExC_parse)) {
4261                 RExC_parse++;
4262                 continue;
4263             }
4264             else if (*RExC_parse == '#') {
4265                 while (*RExC_parse && *RExC_parse != '\n')
4266                     RExC_parse++;
4267                 RExC_parse++;
4268                 continue;
4269             }
4270         }
4271         return retval;
4272     }
4273 }
4274
4275 /*
4276 - reg_node - emit a node
4277 */
4278 STATIC regnode *                        /* Location. */
4279 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4280 {
4281     register regnode *ret;
4282     register regnode *ptr;
4283
4284     ret = RExC_emit;
4285     if (SIZE_ONLY) {
4286         SIZE_ALIGN(RExC_size);
4287         RExC_size += 1;
4288         return(ret);
4289     }
4290
4291     NODE_ALIGN_FILL(ret);
4292     ptr = ret;
4293     FILL_ADVANCE_NODE(ptr, op);
4294     if (RExC_offsets) {         /* MJD */
4295       MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n", 
4296               "reg_node", __LINE__, 
4297               reg_name[op],
4298               RExC_emit - RExC_emit_start > RExC_offsets[0] 
4299               ? "Overwriting end of array!\n" : "OK",
4300               RExC_emit - RExC_emit_start,
4301               RExC_parse - RExC_start,
4302               RExC_offsets[0])); 
4303       Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4304     }
4305             
4306     RExC_emit = ptr;
4307
4308     return(ret);
4309 }
4310
4311 /*
4312 - reganode - emit a node with an argument
4313 */
4314 STATIC regnode *                        /* Location. */
4315 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4316 {
4317     register regnode *ret;
4318     register regnode *ptr;
4319
4320     ret = RExC_emit;
4321     if (SIZE_ONLY) {
4322         SIZE_ALIGN(RExC_size);
4323         RExC_size += 2;
4324         return(ret);
4325     }
4326
4327     NODE_ALIGN_FILL(ret);
4328     ptr = ret;
4329     FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4330     if (RExC_offsets) {         /* MJD */
4331       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4332               "reganode",
4333               RExC_emit - RExC_emit_start > RExC_offsets[0] ? 
4334               "Overwriting end of array!\n" : "OK",
4335               RExC_emit - RExC_emit_start,
4336               RExC_parse - RExC_start,
4337               RExC_offsets[0])); 
4338       Set_Cur_Node_Offset;
4339     }
4340             
4341     RExC_emit = ptr;
4342
4343     return(ret);
4344 }
4345
4346 /*
4347 - reguni - emit (if appropriate) a Unicode character
4348 */
4349 STATIC void
4350 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4351 {
4352     *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4353 }
4354
4355 /*
4356 - reginsert - insert an operator in front of already-emitted operand
4357 *
4358 * Means relocating the operand.
4359 */
4360 STATIC void
4361 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4362 {
4363     register regnode *src;
4364     register regnode *dst;
4365     register regnode *place;
4366     register int offset = regarglen[(U8)op];
4367
4368 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4369
4370     if (SIZE_ONLY) {
4371         RExC_size += NODE_STEP_REGNODE + offset;
4372         return;
4373     }
4374
4375     src = RExC_emit;
4376     RExC_emit += NODE_STEP_REGNODE + offset;
4377     dst = RExC_emit;
4378     while (src > opnd) {
4379         StructCopy(--src, --dst, regnode);
4380         if (RExC_offsets) {     /* MJD 20010112 */
4381           MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n", 
4382                   "reg_insert",
4383                   dst - RExC_emit_start > RExC_offsets[0] 
4384                   ? "Overwriting end of array!\n" : "OK",
4385                   src - RExC_emit_start,
4386                   dst - RExC_emit_start,
4387                   RExC_offsets[0])); 
4388           Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4389           Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4390         }
4391     }
4392     
4393
4394     place = opnd;               /* Op node, where operand used to be. */
4395     if (RExC_offsets) {         /* MJD */
4396       MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n", 
4397               "reginsert",
4398               place - RExC_emit_start > RExC_offsets[0] 
4399               ? "Overwriting end of array!\n" : "OK",
4400               place - RExC_emit_start,
4401               RExC_parse - RExC_start,
4402               RExC_offsets[0])); 
4403       Set_Node_Offset(place, RExC_parse);
4404     }
4405     src = NEXTOPER(place);
4406     FILL_ADVANCE_NODE(place, op);
4407     Zero(src, offset, regnode);
4408 }
4409
4410 /*
4411 - regtail - set the next-pointer at the end of a node chain of p to val.
4412 */
4413 STATIC void
4414 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4415 {
4416     register regnode *scan;
4417     register regnode *temp;
4418
4419     if (SIZE_ONLY)
4420         return;
4421
4422     /* Find last node. */
4423     scan = p;
4424     for (;;) {
4425         temp = regnext(scan);
4426         if (temp == NULL)
4427             break;
4428         scan = temp;
4429     }
4430
4431     if (reg_off_by_arg[OP(scan)]) {
4432         ARG_SET(scan, val - scan);
4433     }
4434     else {
4435         NEXT_OFF(scan) = val - scan;
4436     }
4437 }
4438
4439 /*
4440 - regoptail - regtail on operand of first argument; nop if operandless
4441 */
4442 STATIC void
4443 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4444 {
4445     /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4446     if (p == NULL || SIZE_ONLY)
4447         return;
4448     if (PL_regkind[(U8)OP(p)] == BRANCH) {
4449         regtail(pRExC_state, NEXTOPER(p), val);
4450     }
4451     else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4452         regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4453     }
4454     else
4455         return;
4456 }
4457
4458 /*
4459  - regcurly - a little FSA that accepts {\d+,?\d*}
4460  */
4461 STATIC I32
4462 S_regcurly(pTHX_ register char *s)
4463 {
4464     if (*s++ != '{')
4465         return FALSE;
4466     if (!isDIGIT(*s))
4467         return FALSE;
4468     while (isDIGIT(*s))
4469         s++;
4470     if (*s == ',')
4471         s++;
4472     while (isDIGIT(*s))
4473         s++;
4474     if (*s != '}')
4475         return FALSE;
4476     return TRUE;
4477 }
4478
4479
4480 #ifdef DEBUGGING
4481
4482 STATIC regnode *
4483 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4484 {
4485     register U8 op = EXACT;     /* Arbitrary non-END op. */
4486     register regnode *next;
4487
4488     while (op != END && (!last || node < last)) {
4489         /* While that wasn't END last time... */
4490
4491         NODE_ALIGN(node);
4492         op = OP(node);
4493         if (op == CLOSE)
4494             l--;        
4495         next = regnext(node);
4496         /* Where, what. */
4497         if (OP(node) == OPTIMIZED)
4498             goto after_print;
4499         regprop(sv, node);
4500         PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4501                       (int)(2*l + 1), "", SvPVX(sv));
4502         if (next == NULL)               /* Next ptr. */
4503             PerlIO_printf(Perl_debug_log, "(0)");
4504         else
4505             PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4506         (void)PerlIO_putc(Perl_debug_log, '\n');
4507       after_print:
4508         if (PL_regkind[(U8)op] == BRANCHJ) {
4509             register regnode *nnode = (OP(next) == LONGJMP
4510                                        ? regnext(next)
4511                                        : next);
4512             if (last && nnode > last)
4513                 nnode = last;
4514             node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4515         }
4516         else if (PL_regkind[(U8)op] == BRANCH) {
4517             node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4518         }
4519         else if ( op == CURLY) {   /* `next' might be very big: optimizer */
4520             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4521                              NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4522         }
4523         else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4524             node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4525                              next, sv, l + 1);
4526         }
4527         else if ( op == PLUS || op == STAR) {
4528             node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4529         }
4530         else if (op == ANYOF) {
4531             /* arglen 1 + class block */
4532             node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4533                     ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4534             node = NEXTOPER(node);
4535         }
4536         else if (PL_regkind[(U8)op] == EXACT) {
4537             /* Literal string, where present. */
4538             node += NODE_SZ_STR(node) - 1;
4539             node = NEXTOPER(node);
4540         }
4541         else {
4542             node = NEXTOPER(node);
4543             node += regarglen[(U8)op];
4544         }
4545         if (op == CURLYX || op == OPEN)
4546             l++;
4547         else if (op == WHILEM)
4548             l--;
4549     }
4550     return node;
4551 }
4552
4553 #endif  /* DEBUGGING */
4554
4555 /*
4556  - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4557  */
4558 void
4559 Perl_regdump(pTHX_ regexp *r)
4560 {
4561 #ifdef DEBUGGING
4562     SV *sv = sv_newmortal();
4563
4564     (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4565
4566     /* Header fields of interest. */
4567     if (r->anchored_substr)
4568         PerlIO_printf(Perl_debug_log,
4569                       "anchored `%s%.*s%s'%s at %"IVdf" ",
4570                       PL_colors[0],
4571                       (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4572                       SvPVX(r->anchored_substr),
4573                       PL_colors[1],
4574                       SvTAIL(r->anchored_substr) ? "$" : "",
4575                       (IV)r->anchored_offset);
4576     else if (r->anchored_utf8)
4577         PerlIO_printf(Perl_debug_log,
4578                       "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4579                       PL_colors[0],
4580                       (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4581                       SvPVX(r->anchored_utf8),
4582                       PL_colors[1],
4583                       SvTAIL(r->anchored_utf8) ? "$" : "",
4584                       (IV)r->anchored_offset);
4585     if (r->float_substr)
4586         PerlIO_printf(Perl_debug_log,
4587                       "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4588                       PL_colors[0],
4589                       (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4590                       SvPVX(r->float_substr),
4591                       PL_colors[1],
4592                       SvTAIL(r->float_substr) ? "$" : "",
4593                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4594     else if (r->float_utf8)
4595         PerlIO_printf(Perl_debug_log,
4596                       "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4597                       PL_colors[0],
4598                       (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4599                       SvPVX(r->float_utf8),
4600                       PL_colors[1],
4601                       SvTAIL(r->float_utf8) ? "$" : "",
4602                       (IV)r->float_min_offset, (UV)r->float_max_offset);
4603     if (r->check_substr || r->check_utf8)
4604         PerlIO_printf(Perl_debug_log,
4605                       r->check_substr == r->float_substr
4606                       && r->check_utf8 == r->float_utf8
4607                       ? "(checking floating" : "(checking anchored");
4608     if (r->reganch & ROPT_NOSCAN)
4609         PerlIO_printf(Perl_debug_log, " noscan");
4610     if (r->reganch & ROPT_CHECK_ALL)
4611         PerlIO_printf(Perl_debug_log, " isall");
4612     if (r->check_substr || r->check_utf8)
4613         PerlIO_printf(Perl_debug_log, ") ");
4614
4615     if (r->regstclass) {
4616         regprop(sv, r->regstclass);
4617         PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4618     }
4619     if (r->reganch & ROPT_ANCH) {
4620         PerlIO_printf(Perl_debug_log, "anchored");
4621         if (r->reganch & ROPT_ANCH_BOL)
4622             PerlIO_printf(Perl_debug_log, "(BOL)");
4623         if (r->reganch & ROPT_ANCH_MBOL)
4624             PerlIO_printf(Perl_debug_log, "(MBOL)");
4625         if (r->reganch & ROPT_ANCH_SBOL)
4626             PerlIO_printf(Perl_debug_log, "(SBOL)");
4627         if (r->reganch & ROPT_ANCH_GPOS)
4628             PerlIO_printf(Perl_debug_log, "(GPOS)");
4629         PerlIO_putc(Perl_debug_log, ' ');
4630     }
4631     if (r->reganch & ROPT_GPOS_SEEN)
4632         PerlIO_printf(Perl_debug_log, "GPOS ");
4633     if (r->reganch & ROPT_SKIP)
4634         PerlIO_printf(Perl_debug_log, "plus ");
4635     if (r->reganch & ROPT_IMPLICIT)
4636         PerlIO_printf(Perl_debug_log, "implicit ");
4637     PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4638     if (r->reganch & ROPT_EVAL_SEEN)
4639         PerlIO_printf(Perl_debug_log, "with eval ");
4640     PerlIO_printf(Perl_debug_log, "\n");
4641     if (r->offsets) {
4642       U32 i;
4643       U32 len = r->offsets[0];
4644       PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4645       for (i = 1; i <= len; i++)
4646         PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ", 
4647                       (UV)r->offsets[i*2-1], 
4648                       (UV)r->offsets[i*2]);
4649       PerlIO_printf(Perl_debug_log, "\n");
4650     }
4651 #endif  /* DEBUGGING */
4652 }
4653
4654 #ifdef DEBUGGING
4655
4656 STATIC void
4657 S_put_byte(pTHX_ SV *sv, int c)
4658 {
4659     if (isCNTRL(c) || c == 255 || !isPRINT(c))
4660         Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4661     else if (c == '-' || c == ']' || c == '\\' || c == '^')
4662         Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4663     else
4664         Perl_sv_catpvf(aTHX_ sv, "%c", c);
4665 }
4666
4667 #endif  /* DEBUGGING */
4668
4669 /*
4670 - regprop - printable representation of opcode
4671 */
4672 void
4673 Perl_regprop(pTHX_ SV *sv, regnode *o)
4674 {
4675 #ifdef DEBUGGING
4676     register int k;
4677
4678     sv_setpvn(sv, "", 0);
4679     if (OP(o) >= reg_num)               /* regnode.type is unsigned */
4680         /* It would be nice to FAIL() here, but this may be called from
4681            regexec.c, and it would be hard to supply pRExC_state. */
4682         Perl_croak(aTHX_ "Corrupted regexp opcode");
4683     sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4684
4685     k = PL_regkind[(U8)OP(o)];
4686
4687     if (k == EXACT) {
4688         SV *dsv = sv_2mortal(newSVpvn("", 0));
4689         /* Using is_utf8_string() is a crude hack but it may
4690          * be the best for now since we have no flag "this EXACTish
4691          * node was UTF-8" --jhi */
4692         bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4693         char *s    = do_utf8 ?
4694           pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4695                          UNI_DISPLAY_REGEX) :
4696           STRING(o);
4697         int len = do_utf8 ?
4698           strlen(s) :
4699           STR_LEN(o);
4700         Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4701                        PL_colors[0],
4702                        len, s,
4703                        PL_colors[1]);
4704     }
4705     else if (k == CURLY) {
4706         if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4707             Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4708         Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4709     }
4710     else if (k == WHILEM && o->flags)                   /* Ordinal/of */
4711         Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4712     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4713         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
4714     else if (k == LOGICAL)
4715         Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);     /* 2: embedded, otherwise 1 */
4716     else if (k == ANYOF) {
4717         int i, rangestart = -1;
4718         U8 flags = ANYOF_FLAGS(o);
4719         const char * const anyofs[] = { /* Should be syncronized with
4720                                          * ANYOF_ #xdefines in regcomp.h */
4721             "\\w",
4722             "\\W",
4723             "\\s",
4724             "\\S",
4725             "\\d",
4726             "\\D",
4727             "[:alnum:]",
4728             "[:^alnum:]",
4729             "[:alpha:]",
4730             "[:^alpha:]",
4731             "[:ascii:]",
4732             "[:^ascii:]",
4733             "[:ctrl:]",
4734             "[:^ctrl:]",
4735             "[:graph:]",
4736             "[:^graph:]",
4737             "[:lower:]",
4738             "[:^lower:]",
4739             "[:print:]",
4740             "[:^print:]",
4741             "[:punct:]",
4742             "[:^punct:]",
4743             "[:upper:]",
4744             "[:^upper:]",
4745             "[:xdigit:]",
4746             "[:^xdigit:]",
4747             "[:space:]",
4748             "[:^space:]",
4749             "[:blank:]",
4750             "[:^blank:]"
4751         };
4752
4753         if (flags & ANYOF_LOCALE)
4754             sv_catpv(sv, "{loc}");
4755         if (flags & ANYOF_FOLD)
4756             sv_catpv(sv, "{i}");
4757         Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4758         if (flags & ANYOF_INVERT)
4759             sv_catpv(sv, "^");
4760         for (i = 0; i <= 256; i++) {
4761             if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4762                 if (rangestart == -1)
4763                     rangestart = i;
4764             } else if (rangestart != -1) {
4765                 if (i <= rangestart + 3)
4766                     for (; rangestart < i; rangestart++)
4767                         put_byte(sv, rangestart);
4768                 else {
4769                     put_byte(sv, rangestart);
4770                     sv_catpv(sv, "-");
4771                     put_byte(sv, i - 1);
4772                 }
4773                 rangestart = -1;
4774             }
4775         }
4776
4777         if (o->flags & ANYOF_CLASS)
4778             for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4779                 if (ANYOF_CLASS_TEST(o,i))
4780                     sv_catpv(sv, anyofs[i]);
4781
4782         if (flags & ANYOF_UNICODE)
4783             sv_catpv(sv, "{unicode}");
4784         else if (flags & ANYOF_UNICODE_ALL)
4785             sv_catpv(sv, "{unicode_all}");
4786
4787         {
4788             SV *lv;
4789             SV *sw = regclass_swash(o, FALSE, &lv, 0);
4790         
4791             if (lv) {
4792                 if (sw) {
4793                     U8 s[UTF8_MAXLEN+1];
4794                 
4795                     for (i = 0; i <= 256; i++) { /* just the first 256 */
4796                         U8 *e = uvchr_to_utf8(s, i);
4797                         
4798                         if (i < 256 && swash_fetch(sw, s, TRUE)) {
4799                             if (rangestart == -1)
4800                                 rangestart = i;
4801                         } else if (rangestart != -1) {
4802                             U8 *p;
4803                         
4804                             if (i <= rangestart + 3)
4805                                 for (; rangestart < i; rangestart++) {
4806                                     for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4807                                         put_byte(sv, *p);
4808                                 }
4809                             else {
4810                                 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4811                                     put_byte(sv, *p);
4812                                 sv_catpv(sv, "-");
4813                                     for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4814                                         put_byte(sv, *p);
4815                                 }
4816                                 rangestart = -1;
4817                             }
4818                         }
4819                         
4820                     sv_catpv(sv, "..."); /* et cetera */
4821                 }
4822
4823                 {
4824                     char *s = savepv(SvPVX(lv));
4825                     char *origs = s;
4826                 
4827                     while(*s && *s != '\n') s++;
4828                 
4829                     if (*s == '\n') {
4830                         char *t = ++s;
4831                         
4832                         while (*s) {
4833                             if (*s == '\n')
4834                                 *s = ' ';
4835                             s++;
4836                         }
4837                         if (s[-1] == ' ')
4838                             s[-1] = 0;
4839                         
4840                         sv_catpv(sv, t);
4841                     }
4842                 
4843                     Safefree(origs);
4844                 }
4845             }
4846         }
4847
4848         Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4849     }
4850     else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4851         Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4852 #endif  /* DEBUGGING */
4853 }
4854
4855 SV *
4856 Perl_re_intuit_string(pTHX_ regexp *prog)
4857 {                               /* Assume that RE_INTUIT is set */
4858     DEBUG_r(
4859         {   STRLEN n_a;
4860             char *s = SvPV(prog->check_substr
4861                       ? prog->check_substr : prog->check_utf8, n_a);
4862
4863             if (!PL_colorset) reginitcolors();
4864             PerlIO_printf(Perl_debug_log,
4865                       "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4866                       PL_colors[4],
4867                       prog->check_substr ? "" : "utf8 ",
4868                       PL_colors[5],PL_colors[0],
4869                       s,
4870                       PL_colors[1],
4871                       (strlen(s) > 60 ? "..." : ""));
4872         } );
4873
4874     return prog->check_substr ? prog->check_substr : prog->check_utf8;
4875 }
4876
4877 void
4878 Perl_pregfree(pTHX_ struct regexp *r)
4879 {
4880 #ifdef DEBUGGING
4881     SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4882 #endif
4883
4884     if (!r || (--r->refcnt > 0))
4885         return;
4886     DEBUG_r({
4887          int len;
4888          char *s;
4889
4890          s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4891                 r->prelen, 60, UNI_DISPLAY_REGEX)
4892             : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4893          len = SvCUR(dsv);
4894          if (!PL_colorset)
4895               reginitcolors();
4896          PerlIO_printf(Perl_debug_log,
4897                        "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4898                        PL_colors[4],PL_colors[5],PL_colors[0],
4899                        len, len, s,
4900                        PL_colors[1],
4901                        len > 60 ? "..." : "");
4902     });
4903
4904     if (r->precomp)
4905         Safefree(r->precomp);
4906     if (r->offsets)             /* 20010421 MJD */
4907         Safefree(r->offsets);
4908     if (RX_MATCH_COPIED(r))
4909         Safefree(r->subbeg);
4910     if (r->substrs) {
4911         if (r->anchored_substr)
4912             SvREFCNT_dec(r->anchored_substr);
4913         if (r->anchored_utf8)
4914             SvREFCNT_dec(r->anchored_utf8);
4915         if (r->float_substr)
4916             SvREFCNT_dec(r->float_substr);
4917         if (r->float_utf8)
4918             SvREFCNT_dec(r->float_utf8);
4919         Safefree(r->substrs);
4920     }
4921     if (r->data) {
4922         int n = r->data->count;
4923         PAD* new_comppad = NULL;
4924         PAD* old_comppad;
4925
4926         while (--n >= 0) {
4927           /* If you add a ->what type here, update the comment in regcomp.h */
4928             switch (r->data->what[n]) {
4929             case 's':
4930                 SvREFCNT_dec((SV*)r->data->data[n]);
4931                 break;
4932             case 'f':
4933                 Safefree(r->data->data[n]);
4934                 break;
4935             case 'p':
4936                 new_comppad = (AV*)r->data->data[n];
4937                 break;
4938             case 'o':
4939                 if (new_comppad == NULL)
4940                     Perl_croak(aTHX_ "panic: pregfree comppad");
4941                 PAD_SAVE_LOCAL(old_comppad,
4942                     /* Watch out for global destruction's random ordering. */
4943                     (SvTYPE(new_comppad) == SVt_PVAV) ?
4944                                 new_comppad : Null(PAD *)
4945                 );
4946                 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4947                     op_free((OP_4tree*)r->data->data[n]);
4948                 }
4949
4950                 PAD_RESTORE_LOCAL(old_comppad);
4951                 SvREFCNT_dec((SV*)new_comppad);
4952                 new_comppad = NULL;
4953                 break;
4954             case 'n':
4955                 break;
4956             default:
4957                 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4958             }
4959         }
4960         Safefree(r->data->what);
4961         Safefree(r->data);
4962     }
4963     Safefree(r->startp);
4964     Safefree(r->endp);
4965     Safefree(r);
4966 }
4967
4968 /*
4969  - regnext - dig the "next" pointer out of a node
4970  *
4971  * [Note, when REGALIGN is defined there are two places in regmatch()
4972  * that bypass this code for speed.]
4973  */
4974 regnode *
4975 Perl_regnext(pTHX_ register regnode *p)
4976 {
4977     register I32 offset;
4978
4979     if (p == &PL_regdummy)
4980         return(NULL);
4981
4982     offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4983     if (offset == 0)
4984         return(NULL);
4985
4986     return(p+offset);
4987 }
4988
4989 STATIC void     
4990 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4991 {
4992     va_list args;
4993     STRLEN l1 = strlen(pat1);
4994     STRLEN l2 = strlen(pat2);
4995     char buf[512];
4996     SV *msv;
4997     char *message;
4998
4999     if (l1 > 510)
5000         l1 = 510;
5001     if (l1 + l2 > 510)
5002         l2 = 510 - l1;
5003     Copy(pat1, buf, l1 , char);
5004     Copy(pat2, buf + l1, l2 , char);
5005     buf[l1 + l2] = '\n';
5006     buf[l1 + l2 + 1] = '\0';
5007 #ifdef I_STDARG
5008     /* ANSI variant takes additional second argument */
5009     va_start(args, pat2);
5010 #else
5011     va_start(args);
5012 #endif
5013     msv = vmess(buf, &args);
5014     va_end(args);
5015     message = SvPV(msv,l1);
5016     if (l1 > 512)
5017         l1 = 512;
5018     Copy(message, buf, l1 , char);
5019     buf[l1] = '\0';                     /* Overwrite \n */
5020     Perl_croak(aTHX_ "%s", buf);
5021 }
5022
5023 /* XXX Here's a total kludge.  But we need to re-enter for swash routines. */
5024
5025 void
5026 Perl_save_re_context(pTHX)
5027 {
5028 #if 0
5029     SAVEPPTR(RExC_precomp);             /* uncompiled string. */
5030     SAVEI32(RExC_npar);         /* () count. */
5031     SAVEI32(RExC_size);         /* Code size. */
5032     SAVEI32(RExC_flags);                /* are we folding, multilining? */
5033     SAVEVPTR(RExC_rx);          /* from regcomp.c */
5034     SAVEI32(RExC_seen);         /* from regcomp.c */
5035     SAVEI32(RExC_sawback);              /* Did we see \1, ...? */
5036     SAVEI32(RExC_naughty);              /* How bad is this pattern? */
5037     SAVEVPTR(RExC_emit);                /* Code-emit pointer; &regdummy = don't */
5038     SAVEPPTR(RExC_end);         /* End of input for compile */
5039     SAVEPPTR(RExC_parse);               /* Input-scan pointer. */
5040 #endif
5041
5042     SAVEI32(PL_reg_flags);              /* from regexec.c */
5043     SAVEPPTR(PL_bostr);
5044     SAVEPPTR(PL_reginput);              /* String-input pointer. */
5045     SAVEPPTR(PL_regbol);                /* Beginning of input, for ^ check. */
5046     SAVEPPTR(PL_regeol);                /* End of input, for $ check. */
5047     SAVEVPTR(PL_regstartp);             /* Pointer to startp array. */
5048     SAVEVPTR(PL_regendp);               /* Ditto for endp. */
5049     SAVEVPTR(PL_reglastparen);          /* Similarly for lastparen. */
5050     SAVEPPTR(PL_regtill);               /* How far we are required to go. */
5051     SAVEGENERICPV(PL_reg_start_tmp);            /* from regexec.c */
5052     PL_reg_start_tmp = 0;
5053     SAVEI32(PL_reg_start_tmpl);         /* from regexec.c */
5054     PL_reg_start_tmpl = 0;
5055     SAVEVPTR(PL_regdata);
5056     SAVEI32(PL_reg_eval_set);           /* from regexec.c */
5057     SAVEI32(PL_regnarrate);             /* from regexec.c */
5058     SAVEVPTR(PL_regprogram);            /* from regexec.c */
5059     SAVEINT(PL_regindent);              /* from regexec.c */
5060     SAVEVPTR(PL_regcc);                 /* from regexec.c */
5061     SAVEVPTR(PL_curcop);
5062     SAVEVPTR(PL_reg_call_cc);           /* from regexec.c */
5063     SAVEVPTR(PL_reg_re);                /* from regexec.c */
5064     SAVEPPTR(PL_reg_ganch);             /* from regexec.c */
5065     SAVESPTR(PL_reg_sv);                /* from regexec.c */
5066     SAVEI8(PL_reg_match_utf8);          /* from regexec.c */
5067     SAVEVPTR(PL_reg_magic);             /* from regexec.c */
5068     SAVEI32(PL_reg_oldpos);                     /* from regexec.c */
5069     SAVEVPTR(PL_reg_oldcurpm);          /* from regexec.c */
5070     SAVEVPTR(PL_reg_curpm);             /* from regexec.c */
5071     SAVEI32(PL_regnpar);                /* () count. */
5072     SAVEI32(PL_regsize);                /* from regexec.c */
5073 #ifdef DEBUGGING
5074     SAVEPPTR(PL_reg_starttry);          /* from regexec.c */
5075 #endif
5076 }
5077
5078 static void
5079 clear_re(pTHX_ void *r)
5080 {
5081     ReREFCNT_dec((regexp *)r);
5082 }
5083