This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to I18N::LangTags::List 0.27, from Sean Burke.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
be3c0a43 72 **** Copyright (c) 1991-2002, Larry Wall
a687059c 73 ****
9ef589d8
LW
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
76
a687059c
LW
77 *
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
81 */
82#include "EXTERN.h"
864dbfa3 83#define PERL_IN_REGCOMP_C
a687059c 84#include "perl.h"
d06ea78c 85
acfe0abc 86#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
87# include "INTERN.h"
88#endif
c277df42
IZ
89
90#define REG_COMP_C
a687059c
LW
91#include "regcomp.h"
92
d4cce5f1 93#ifdef op
11343788 94#undef op
d4cce5f1 95#endif /* op */
11343788 96
fe14fcc3
LW
97#ifdef MSDOS
98# if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103# endif /* BUGGY_MSC6 */
104#endif /* MSDOS */
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
830247a4
IZ
110typedef struct RExC_state_t {
111 U16 flags16; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
113 regexp *rx;
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 119 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
122 U32 seen;
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
1aa99e6b 128 I32 utf8;
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
133} RExC_state_t;
134
135#define RExC_flags16 (pRExC_state->flags16)
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
830247a4 154
a687059c
LW
155#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
a687059c 158
35c8bce7
LW
159#ifdef SPSTART
160#undef SPSTART /* dratted cpp namespace... */
161#endif
a687059c
LW
162/*
163 * Flags to be passed up and down.
164 */
a687059c 165#define WORST 0 /* Worst case. */
821b33a5 166#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
167#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168#define SPSTART 0x4 /* Starts with * or +. */
169#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 170
2c2d71f5
JH
171/* Length of a variant. */
172
173typedef struct scan_data_t {
174 I32 len_min;
175 I32 len_delta;
176 I32 pos_min;
177 I32 pos_delta;
178 SV *last_found;
179 I32 last_end; /* min value, <0 unless valid. */
180 I32 last_start_min;
181 I32 last_start_max;
182 SV **longest; /* Either &l_fixed, or &l_float. */
183 SV *longest_fixed;
184 I32 offset_fixed;
185 SV *longest_float;
186 I32 offset_float_min;
187 I32 offset_float_max;
188 I32 flags;
189 I32 whilem_c;
cb434fcc 190 I32 *last_closep;
653099ff 191 struct regnode_charclass_class *start_class;
2c2d71f5
JH
192} scan_data_t;
193
a687059c 194/*
e50aee73 195 * Forward declarations for pregcomp()'s friends.
a687059c 196 */
a0d0e21e 197
b81d288d 198static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 199 0, 0, 0, 0, 0, 0};
c277df42
IZ
200
201#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202#define SF_BEFORE_SEOL 0x1
203#define SF_BEFORE_MEOL 0x2
204#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
206
09b7f37c
CB
207#ifdef NO_UNARY_PLUS
208# define SF_FIX_SHIFT_EOL (0+2)
209# define SF_FL_SHIFT_EOL (0+4)
210#else
211# define SF_FIX_SHIFT_EOL (+2)
212# define SF_FL_SHIFT_EOL (+4)
213#endif
c277df42
IZ
214
215#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
217
218#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220#define SF_IS_INF 0x40
221#define SF_HAS_PAR 0x80
222#define SF_IN_PAR 0x100
223#define SF_HAS_EVAL 0x200
4bfe0158 224#define SCF_DO_SUBSTR 0x400
653099ff
GS
225#define SCF_DO_STCLASS_AND 0x0800
226#define SCF_DO_STCLASS_OR 0x1000
227#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 228#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 229
1aa99e6b 230#define UTF RExC_utf8
830247a4
IZ
231#define LOC (RExC_flags16 & PMf_LOCALE)
232#define FOLD (RExC_flags16 & PMf_FOLD)
a0ed51b3 233
ffc61ed2 234#define OOB_UNICODE 12345678
93733859 235#define OOB_NAMEDCLASS -1
b8c5462f 236
a0ed51b3
LW
237#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
239
8615cb43 240
b45f050a
JF
241/* length of regex to show in messages that don't mark a position within */
242#define RegexLengthToShowInErrorMessages 127
243
244/*
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
248 */
7253e4e3
RK
249#define MARKER1 "<-- HERE" /* marker as it appears in the description */
250#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 251
7253e4e3 252#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
253
254/*
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
257 * "...".
258 */
a4eb266f 259#define FAIL(msg) \
8615cb43 260 STMT_START { \
a4eb266f 261 char *ellipses = ""; \
7a799e20 262 IV len = RExC_end - RExC_precomp; \
b45f050a 263 \
8615cb43 264 if (!SIZE_ONLY) \
830247a4 265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
266 \
267 if (len > RegexLengthToShowInErrorMessages) { \
268 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
269 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 270 ellipses = "..."; \
b45f050a
JF
271 } \
272 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
830247a4 273 msg, (int)len, RExC_precomp, ellipses); \
8615cb43
JF
274 } STMT_END
275
b45f050a
JF
276/*
277 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
278 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 * "...".
280 */
a4eb266f 281#define FAIL2(pat,msg) \
8615cb43 282 STMT_START { \
a4eb266f 283 char *ellipses = ""; \
7a799e20 284 IV len = RExC_end - RExC_precomp; \
b45f050a 285 \
8615cb43 286 if (!SIZE_ONLY) \
830247a4 287 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
8615cb43 288 \
b45f050a
JF
289 if (len > RegexLengthToShowInErrorMessages) { \
290 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
291 len = RegexLengthToShowInErrorMessages - 10; \
a4eb266f 292 ellipses = "..."; \
b45f050a
JF
293 } \
294 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
830247a4 295 msg, (int)len, RExC_precomp, ellipses); \
b45f050a
JF
296 } STMT_END
297
298
299/*
300 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
301 */
302#define Simple_vFAIL(m) \
303 STMT_START { \
7a799e20 304 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
305 \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
830247a4 307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
308 } STMT_END
309
310/*
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
312 */
313#define vFAIL(m) \
314 STMT_START { \
315 if (!SIZE_ONLY) \
830247a4 316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
317 Simple_vFAIL(m); \
318 } STMT_END
319
320/*
321 * Like Simple_vFAIL(), but accepts two arguments.
322 */
323#define Simple_vFAIL2(m,a1) \
324 STMT_START { \
7a799e20 325 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
326 \
327 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
830247a4 328 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
329 } STMT_END
330
331/*
332 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
333 */
334#define vFAIL2(m,a1) \
335 STMT_START { \
336 if (!SIZE_ONLY) \
830247a4 337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
338 Simple_vFAIL2(m, a1); \
339 } STMT_END
340
341
342/*
343 * Like Simple_vFAIL(), but accepts three arguments.
344 */
345#define Simple_vFAIL3(m, a1, a2) \
346 STMT_START { \
7a799e20 347 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
348 \
349 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
830247a4 350 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
351 } STMT_END
352
353/*
354 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 */
356#define vFAIL3(m,a1,a2) \
357 STMT_START { \
358 if (!SIZE_ONLY) \
830247a4 359 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
b45f050a
JF
360 Simple_vFAIL3(m, a1, a2); \
361 } STMT_END
362
363/*
364 * Like Simple_vFAIL(), but accepts four arguments.
365 */
366#define Simple_vFAIL4(m, a1, a2, a3) \
367 STMT_START { \
7a799e20 368 IV offset = RExC_parse - RExC_precomp; \
b45f050a
JF
369 \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
830247a4 371 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
372 } STMT_END
373
374/*
375 * Like Simple_vFAIL(), but accepts five arguments.
376 */
377#define Simple_vFAIL5(m, a1, a2, a3, a4) \
378 STMT_START { \
7a799e20 379 IV offset = RExC_parse - RExC_precomp; \
b45f050a 380 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
830247a4 381 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
382 } STMT_END
383
384
385#define vWARN(loc,m) \
386 STMT_START { \
7a799e20 387 IV offset = loc - RExC_precomp; \
b45f050a 388 Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
830247a4 389 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
390 } STMT_END \
391
f9373011
PM
392#define vWARNdep(loc,m) \
393 STMT_START { \
7a799e20 394 IV offset = loc - RExC_precomp; \
f9373011
PM
395 int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \
396 Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
397 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
398 } STMT_END \
399
b45f050a
JF
400
401#define vWARN2(loc, m, a1) \
402 STMT_START { \
7a799e20 403 IV offset = loc - RExC_precomp; \
b45f050a
JF
404 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
405 a1, \
830247a4 406 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
407 } STMT_END
408
409#define vWARN3(loc, m, a1, a2) \
410 STMT_START { \
7a799e20 411 IV offset = loc - RExC_precomp; \
b45f050a
JF
412 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
413 a1, a2, \
830247a4 414 (int)offset, RExC_precomp, RExC_precomp + offset); \
b45f050a
JF
415 } STMT_END
416
417#define vWARN4(loc, m, a1, a2, a3) \
418 STMT_START { \
7a799e20 419 IV offset = loc - RExC_precomp; \
b45f050a
JF
420 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
421 a1, a2, a3, \
830247a4 422 (int)offset, RExC_precomp, RExC_precomp + offset); \
8615cb43
JF
423 } STMT_END
424
9d1d55b5
JP
425/* used for the parse_flags section for (?c) -- japhy */
426#define vWARN5(loc, m, a1, a2, a3, a4) \
427 STMT_START { \
7a799e20 428 IV offset = loc - RExC_precomp; \
9d1d55b5
JP
429 Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
430 a1, a2, a3, a4, \
431 (int)offset, RExC_precomp, RExC_precomp + offset); \
432 } STMT_END
433
8615cb43 434
cd439c50 435/* Allow for side effects in s */
fd3f0ae2 436#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
cd439c50 437
fac92740
MJD
438/* Macros for recording node offsets. 20001227 mjd@plover.com
439 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
440 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
441 * Element 0 holds the number n.
442 */
443
444#define MJD_OFFSET_DEBUG(x)
445/* #define MJD_OFFSET_DEBUG(x) fprintf x */
446
447
448# define Set_Node_Offset_To_R(node,byte) \
449 STMT_START { \
450 if (! SIZE_ONLY) { \
451 if((node) < 0) { \
452 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
453 } else { \
454 RExC_offsets[2*(node)-1] = (byte); \
455 } \
456 } \
457 } STMT_END
458
459# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
460# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
461
462# define Set_Node_Length_To_R(node,len) \
463 STMT_START { \
464 if (! SIZE_ONLY) { \
465 MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
466 if((node) < 0) { \
467 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
468 } else { \
469 RExC_offsets[2*(node)] = (len); \
470 } \
471 } \
472 } STMT_END
473
474# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
475# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
476# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
477
478/* Get offsets and lengths */
479#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
480#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
481
acfe0abc 482static void clear_re(pTHX_ void *r);
4327152a 483
653099ff
GS
484/* Mark that we cannot extend a found fixed substring at this point.
485 Updata the longest found anchored substring and the longest found
486 floating substrings if needed. */
487
4327152a 488STATIC void
830247a4 489S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 490{
a0ed51b3
LW
491 STRLEN l = CHR_SVLEN(data->last_found);
492 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 493
c277df42
IZ
494 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
495 sv_setsv(*data->longest, data->last_found);
496 if (*data->longest == data->longest_fixed) {
497 data->offset_fixed = l ? data->last_start_min : data->pos_min;
498 if (data->flags & SF_BEFORE_EOL)
b81d288d 499 data->flags
c277df42
IZ
500 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
501 else
502 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
503 }
504 else {
c277df42 505 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
506 data->offset_float_max = (l
507 ? data->last_start_max
c277df42
IZ
508 : data->pos_min + data->pos_delta);
509 if (data->flags & SF_BEFORE_EOL)
b81d288d 510 data->flags
c277df42
IZ
511 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
512 else
513 data->flags &= ~SF_FL_BEFORE_EOL;
514 }
515 }
516 SvCUR_set(data->last_found, 0);
517 data->last_end = -1;
518 data->flags &= ~SF_BEFORE_EOL;
519}
520
653099ff
GS
521/* Can match anything (initialization) */
522STATIC void
830247a4 523S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 524{
653099ff 525 ANYOF_CLASS_ZERO(cl);
f8bef550 526 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 527 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
528 if (LOC)
529 cl->flags |= ANYOF_LOCALE;
530}
531
532/* Can match anything (initialization) */
533STATIC int
534S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
535{
536 int value;
537
aaa51d5e 538 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
539 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 return 1;
1aa99e6b
IH
541 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 return 0;
f8bef550
NC
543 if (!ANYOF_BITMAP_TESTALLSET(cl))
544 return 0;
653099ff
GS
545 return 1;
546}
547
548/* Can match anything (initialization) */
549STATIC void
830247a4 550S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 551{
8ecf7187 552 Zero(cl, 1, struct regnode_charclass_class);
653099ff 553 cl->type = ANYOF;
830247a4 554 cl_anything(pRExC_state, cl);
653099ff
GS
555}
556
557STATIC void
830247a4 558S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 559{
8ecf7187 560 Zero(cl, 1, struct regnode_charclass_class);
653099ff 561 cl->type = ANYOF;
830247a4 562 cl_anything(pRExC_state, cl);
653099ff
GS
563 if (LOC)
564 cl->flags |= ANYOF_LOCALE;
565}
566
567/* 'And' a given class with another one. Can create false positives */
568/* We assume that cl is not inverted */
569STATIC void
570S_cl_and(pTHX_ struct regnode_charclass_class *cl,
571 struct regnode_charclass_class *and_with)
572{
653099ff
GS
573 if (!(and_with->flags & ANYOF_CLASS)
574 && !(cl->flags & ANYOF_CLASS)
575 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
576 && !(and_with->flags & ANYOF_FOLD)
577 && !(cl->flags & ANYOF_FOLD)) {
578 int i;
579
580 if (and_with->flags & ANYOF_INVERT)
581 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
582 cl->bitmap[i] &= ~and_with->bitmap[i];
583 else
584 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
585 cl->bitmap[i] &= and_with->bitmap[i];
586 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
587 if (!(and_with->flags & ANYOF_EOS))
588 cl->flags &= ~ANYOF_EOS;
1aa99e6b
IH
589
590 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
595 if (!(and_with->flags & ANYOF_UNICODE_ALL))
596 cl->flags &= ~ANYOF_UNICODE_ALL;
597 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
598 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
599}
600
601/* 'OR' a given class with another one. Can create false positives */
602/* We assume that cl is not inverted */
603STATIC void
830247a4 604S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 605{
653099ff
GS
606 if (or_with->flags & ANYOF_INVERT) {
607 /* We do not use
608 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
609 * <= (B1 | !B2) | (CL1 | !CL2)
610 * which is wasteful if CL2 is small, but we ignore CL2:
611 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
612 * XXXX Can we handle case-fold? Unclear:
613 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
614 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
615 */
616 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
617 && !(or_with->flags & ANYOF_FOLD)
618 && !(cl->flags & ANYOF_FOLD) ) {
619 int i;
620
621 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
622 cl->bitmap[i] |= ~or_with->bitmap[i];
623 } /* XXXX: logic is complicated otherwise */
624 else {
830247a4 625 cl_anything(pRExC_state, cl);
653099ff
GS
626 }
627 } else {
628 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
629 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 630 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
631 || (cl->flags & ANYOF_FOLD)) ) {
632 int i;
633
634 /* OR char bitmap and class bitmap separately */
635 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
636 cl->bitmap[i] |= or_with->bitmap[i];
637 if (or_with->flags & ANYOF_CLASS) {
638 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
639 cl->classflags[i] |= or_with->classflags[i];
640 cl->flags |= ANYOF_CLASS;
641 }
642 }
643 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 644 cl_anything(pRExC_state, cl);
653099ff
GS
645 }
646 }
647 if (or_with->flags & ANYOF_EOS)
648 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
649
650 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
651 ARG(cl) != ARG(or_with)) {
652 cl->flags |= ANYOF_UNICODE_ALL;
653 cl->flags &= ~ANYOF_UNICODE;
654 }
655 if (or_with->flags & ANYOF_UNICODE_ALL) {
656 cl->flags |= ANYOF_UNICODE_ALL;
657 cl->flags &= ~ANYOF_UNICODE;
658 }
653099ff
GS
659}
660
5d1c421c
JH
661/*
662 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
663 * These need to be revisited when a newer toolchain becomes available.
664 */
665#if defined(__sparc64__) && defined(__GNUC__)
666# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
667# undef SPARC64_GCC_WORKAROUND
668# define SPARC64_GCC_WORKAROUND 1
669# endif
670#endif
671
653099ff
GS
672/* REx optimizer. Converts nodes into quickier variants "in place".
673 Finds fixed substrings. */
674
c277df42
IZ
675/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
676 to the position after last scanned or to NULL. */
677
76e3520e 678STATIC I32
830247a4 679S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
680 /* scanp: Start here (read-write). */
681 /* deltap: Write maxlen-minlen here. */
682 /* last: Stop before this one. */
683{
684 I32 min = 0, pars = 0, code;
685 regnode *scan = *scanp, *next;
686 I32 delta = 0;
687 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 688 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
689 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
690 scan_data_t data_fake;
653099ff 691 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 692
c277df42
IZ
693 while (scan && OP(scan) != END && scan < last) {
694 /* Peephole optimizer: */
695
22c35a8c 696 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 697 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
698 regnode *n = regnext(scan);
699 U32 stringok = 1;
700#ifdef DEBUGGING
701 regnode *stop = scan;
b81d288d 702#endif
c277df42 703
cd439c50 704 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
705 /* Skip NOTHING, merge EXACT*. */
706 while (n &&
b81d288d 707 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
708 (stringok && (OP(n) == OP(scan))))
709 && NEXT_OFF(n)
710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
711 if (OP(n) == TAIL || n > next)
712 stringok = 0;
22c35a8c 713 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
714 NEXT_OFF(scan) += NEXT_OFF(n);
715 next = n + NODE_STEP_REGNODE;
716#ifdef DEBUGGING
717 if (stringok)
718 stop = n;
b81d288d 719#endif
c277df42 720 n = regnext(n);
a0ed51b3 721 }
f49d4d0f 722 else if (stringok) {
cd439c50 723 int oldl = STR_LEN(scan);
c277df42 724 regnode *nnext = regnext(n);
f49d4d0f 725
b81d288d 726 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
727 break;
728 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
729 STR_LEN(scan) += STR_LEN(n);
730 next = n + NODE_SZ_STR(n);
c277df42 731 /* Now we can overwrite *n : */
f49d4d0f 732 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 733#ifdef DEBUGGING
f49d4d0f 734 stop = next - 1;
b81d288d 735#endif
c277df42
IZ
736 n = nnext;
737 }
738 }
61a36c01 739
d65e4eab 740 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
741/*
742 Two problematic code points in Unicode casefolding of EXACT nodes:
743
744 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
745 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
746
747 which casefold to
748
749 Unicode UTF-8
750
751 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
752 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
753
754 This means that in case-insensitive matching (or "loose matching",
755 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
756 length of the above casefolded versions) can match a target string
757 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
758 This would rather mess up the minimum length computation.
759
760 What we'll do is to look for the tail four bytes, and then peek
761 at the preceding two bytes to see whether we need to decrease
762 the minimum length by four (six minus two).
763
764 Thanks to the design of UTF-8, there cannot be false matches:
765 A sequence of valid UTF-8 bytes cannot be a subsequence of
766 another valid sequence of UTF-8 bytes.
767
768*/
769 char *s0 = STRING(scan), *s, *t;
770 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
771 char *t0 = "\xcc\x88\xcc\x81";
772 char *t1 = t0 + 3;
773
774 for (s = s0 + 2;
775 s < s2 && (t = ninstr(s, s1, t0, t1));
776 s = t + 4) {
777 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
778 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
779 min -= 4;
780 }
781 }
782
c277df42
IZ
783#ifdef DEBUGGING
784 /* Allow dumping */
cd439c50 785 n = scan + NODE_SZ_STR(scan);
c277df42 786 while (n <= stop) {
22c35a8c 787 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
788 OP(n) = OPTIMIZED;
789 NEXT_OFF(n) = 0;
790 }
791 n++;
792 }
653099ff 793#endif
c277df42 794 }
653099ff
GS
795 /* Follow the next-chain of the current node and optimize
796 away all the NOTHINGs from it. */
c277df42 797 if (OP(scan) != CURLYX) {
048cfca1
GS
798 int max = (reg_off_by_arg[OP(scan)]
799 ? I32_MAX
800 /* I32 may be smaller than U16 on CRAYs! */
801 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
802 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803 int noff;
804 regnode *n = scan;
b81d288d 805
c277df42
IZ
806 /* Skip NOTHING and LONGJMP. */
807 while ((n = regnext(n))
22c35a8c 808 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
809 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
810 && off + noff < max)
811 off += noff;
812 if (reg_off_by_arg[OP(scan)])
813 ARG(scan) = off;
b81d288d 814 else
c277df42
IZ
815 NEXT_OFF(scan) = off;
816 }
653099ff
GS
817 /* The principal pseudo-switch. Cannot be a switch, since we
818 look into several different things. */
b81d288d 819 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
820 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
821 next = regnext(scan);
822 code = OP(scan);
b81d288d
AB
823
824 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 825 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 826 struct regnode_charclass_class accum;
c277df42 827
653099ff 828 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 829 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 830 if (flags & SCF_DO_STCLASS)
830247a4 831 cl_init_zero(pRExC_state, &accum);
c277df42 832 while (OP(scan) == code) {
830247a4 833 I32 deltanext, minnext, f = 0, fake;
653099ff 834 struct regnode_charclass_class this_class;
c277df42
IZ
835
836 num++;
837 data_fake.flags = 0;
b81d288d 838 if (data) {
2c2d71f5 839 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
840 data_fake.last_closep = data->last_closep;
841 }
842 else
843 data_fake.last_closep = &fake;
c277df42
IZ
844 next = regnext(scan);
845 scan = NEXTOPER(scan);
846 if (code != BRANCH)
847 scan = NEXTOPER(scan);
653099ff 848 if (flags & SCF_DO_STCLASS) {
830247a4 849 cl_init(pRExC_state, &this_class);
653099ff
GS
850 data_fake.start_class = &this_class;
851 f = SCF_DO_STCLASS_AND;
b81d288d 852 }
e1901655
IZ
853 if (flags & SCF_WHILEM_VISITED_POS)
854 f |= SCF_WHILEM_VISITED_POS;
653099ff 855 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
856 minnext = study_chunk(pRExC_state, &scan, &deltanext,
857 next, &data_fake, f);
b81d288d 858 if (min1 > minnext)
c277df42
IZ
859 min1 = minnext;
860 if (max1 < minnext + deltanext)
861 max1 = minnext + deltanext;
862 if (deltanext == I32_MAX)
aca2d497 863 is_inf = is_inf_internal = 1;
c277df42
IZ
864 scan = next;
865 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
866 pars++;
405ff068 867 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 868 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
869 if (data)
870 data->whilem_c = data_fake.whilem_c;
653099ff 871 if (flags & SCF_DO_STCLASS)
830247a4 872 cl_or(pRExC_state, &accum, &this_class);
b81d288d 873 if (code == SUSPEND)
c277df42
IZ
874 break;
875 }
876 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
877 min1 = 0;
878 if (flags & SCF_DO_SUBSTR) {
879 data->pos_min += min1;
880 data->pos_delta += max1 - min1;
881 if (max1 != min1 || is_inf)
882 data->longest = &(data->longest_float);
883 }
884 min += min1;
885 delta += max1 - min1;
653099ff 886 if (flags & SCF_DO_STCLASS_OR) {
830247a4 887 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
888 if (min1) {
889 cl_and(data->start_class, &and_with);
890 flags &= ~SCF_DO_STCLASS;
891 }
892 }
893 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
894 if (min1) {
895 cl_and(data->start_class, &accum);
653099ff 896 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
897 }
898 else {
b81d288d 899 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
900 * data->start_class */
901 StructCopy(data->start_class, &and_with,
902 struct regnode_charclass_class);
903 flags &= ~SCF_DO_STCLASS_AND;
904 StructCopy(&accum, data->start_class,
905 struct regnode_charclass_class);
906 flags |= SCF_DO_STCLASS_OR;
907 data->start_class->flags |= ANYOF_EOS;
908 }
653099ff 909 }
a0ed51b3
LW
910 }
911 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
912 scan = NEXTOPER(NEXTOPER(scan));
913 else /* single branch is optimized. */
914 scan = NEXTOPER(scan);
915 continue;
a0ed51b3
LW
916 }
917 else if (OP(scan) == EXACT) {
cd439c50 918 I32 l = STR_LEN(scan);
1aa99e6b 919 UV uc = *((U8*)STRING(scan));
a0ed51b3 920 if (UTF) {
1aa99e6b
IH
921 U8 *s = (U8*)STRING(scan);
922 l = utf8_length(s, s + l);
9041c2e3 923 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
924 }
925 min += l;
c277df42 926 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
927 /* The code below prefers earlier match for fixed
928 offset, later match for variable offset. */
929 if (data->last_end == -1) { /* Update the start info. */
930 data->last_start_min = data->pos_min;
931 data->last_start_max = is_inf
b81d288d 932 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 933 }
cd439c50 934 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
c277df42
IZ
935 data->last_end = data->pos_min + l;
936 data->pos_min += l; /* As in the first entry. */
937 data->flags &= ~SF_BEFORE_EOL;
938 }
653099ff
GS
939 if (flags & SCF_DO_STCLASS_AND) {
940 /* Check whether it is compatible with what we know already! */
941 int compat = 1;
942
1aa99e6b 943 if (uc >= 0x100 ||
516a5887 944 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 945 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 946 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 947 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 948 )
653099ff
GS
949 compat = 0;
950 ANYOF_CLASS_ZERO(data->start_class);
951 ANYOF_BITMAP_ZERO(data->start_class);
952 if (compat)
1aa99e6b 953 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 954 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
955 if (uc < 0x100)
956 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
957 }
958 else if (flags & SCF_DO_STCLASS_OR) {
959 /* false positive possible if the class is case-folded */
1aa99e6b 960 if (uc < 0x100)
9b877dbb
IH
961 ANYOF_BITMAP_SET(data->start_class, uc);
962 else
963 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
964 data->start_class->flags &= ~ANYOF_EOS;
965 cl_and(data->start_class, &and_with);
966 }
967 flags &= ~SCF_DO_STCLASS;
a0ed51b3 968 }
653099ff 969 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 970 I32 l = STR_LEN(scan);
1aa99e6b 971 UV uc = *((U8*)STRING(scan));
653099ff
GS
972
973 /* Search for fixed substrings supports EXACT only. */
b81d288d 974 if (flags & SCF_DO_SUBSTR)
830247a4 975 scan_commit(pRExC_state, data);
a0ed51b3 976 if (UTF) {
1aa99e6b
IH
977 U8 *s = (U8 *)STRING(scan);
978 l = utf8_length(s, s + l);
9041c2e3 979 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
980 }
981 min += l;
c277df42 982 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 983 data->pos_min += l;
653099ff
GS
984 if (flags & SCF_DO_STCLASS_AND) {
985 /* Check whether it is compatible with what we know already! */
986 int compat = 1;
987
1aa99e6b 988 if (uc >= 0x100 ||
516a5887 989 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 990 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 991 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
992 compat = 0;
993 ANYOF_CLASS_ZERO(data->start_class);
994 ANYOF_BITMAP_ZERO(data->start_class);
995 if (compat) {
1aa99e6b 996 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
997 data->start_class->flags &= ~ANYOF_EOS;
998 data->start_class->flags |= ANYOF_FOLD;
999 if (OP(scan) == EXACTFL)
1000 data->start_class->flags |= ANYOF_LOCALE;
1001 }
1002 }
1003 else if (flags & SCF_DO_STCLASS_OR) {
1004 if (data->start_class->flags & ANYOF_FOLD) {
1005 /* false positive possible if the class is case-folded.
1006 Assume that the locale settings are the same... */
1aa99e6b
IH
1007 if (uc < 0x100)
1008 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 }
1011 cl_and(data->start_class, &and_with);
1012 }
1013 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1014 }
4d61ec05 1015 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1016 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1017 I32 f = flags, pos_before = 0;
c277df42 1018 regnode *oscan = scan;
653099ff
GS
1019 struct regnode_charclass_class this_class;
1020 struct regnode_charclass_class *oclass = NULL;
727f22e3 1021 I32 next_is_eval = 0;
653099ff 1022
22c35a8c 1023 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1024 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1025 scan = NEXTOPER(scan);
1026 goto finish;
1027 case PLUS:
653099ff 1028 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1029 next = NEXTOPER(scan);
653099ff 1030 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1031 mincount = 1;
1032 maxcount = REG_INFTY;
c277df42
IZ
1033 next = regnext(scan);
1034 scan = NEXTOPER(scan);
1035 goto do_curly;
1036 }
1037 }
1038 if (flags & SCF_DO_SUBSTR)
1039 data->pos_min++;
1040 min++;
1041 /* Fall through. */
1042 case STAR:
653099ff
GS
1043 if (flags & SCF_DO_STCLASS) {
1044 mincount = 0;
b81d288d 1045 maxcount = REG_INFTY;
653099ff
GS
1046 next = regnext(scan);
1047 scan = NEXTOPER(scan);
1048 goto do_curly;
1049 }
b81d288d 1050 is_inf = is_inf_internal = 1;
c277df42
IZ
1051 scan = regnext(scan);
1052 if (flags & SCF_DO_SUBSTR) {
830247a4 1053 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1054 data->longest = &(data->longest_float);
1055 }
1056 goto optimize_curly_tail;
1057 case CURLY:
b81d288d 1058 mincount = ARG1(scan);
c277df42
IZ
1059 maxcount = ARG2(scan);
1060 next = regnext(scan);
cb434fcc
IZ
1061 if (OP(scan) == CURLYX) {
1062 I32 lp = (data ? *(data->last_closep) : 0);
1063
1064 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1065 }
c277df42 1066 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1067 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1068 do_curly:
1069 if (flags & SCF_DO_SUBSTR) {
830247a4 1070 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1071 pos_before = data->pos_min;
1072 }
1073 if (data) {
1074 fl = data->flags;
1075 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1076 if (is_inf)
1077 data->flags |= SF_IS_INF;
1078 }
653099ff 1079 if (flags & SCF_DO_STCLASS) {
830247a4 1080 cl_init(pRExC_state, &this_class);
653099ff
GS
1081 oclass = data->start_class;
1082 data->start_class = &this_class;
1083 f |= SCF_DO_STCLASS_AND;
1084 f &= ~SCF_DO_STCLASS_OR;
1085 }
e1901655
IZ
1086 /* These are the cases when once a subexpression
1087 fails at a particular position, it cannot succeed
1088 even after backtracking at the enclosing scope.
b81d288d 1089
e1901655
IZ
1090 XXXX what if minimal match and we are at the
1091 initial run of {n,m}? */
1092 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1093 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1094
c277df42 1095 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1096 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1097 mincount == 0
653099ff
GS
1098 ? (f & ~SCF_DO_SUBSTR) : f);
1099
1100 if (flags & SCF_DO_STCLASS)
1101 data->start_class = oclass;
1102 if (mincount == 0 || minnext == 0) {
1103 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1104 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1105 }
1106 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1107 /* Switch to OR mode: cache the old value of
653099ff
GS
1108 * data->start_class */
1109 StructCopy(data->start_class, &and_with,
1110 struct regnode_charclass_class);
1111 flags &= ~SCF_DO_STCLASS_AND;
1112 StructCopy(&this_class, data->start_class,
1113 struct regnode_charclass_class);
1114 flags |= SCF_DO_STCLASS_OR;
1115 data->start_class->flags |= ANYOF_EOS;
1116 }
1117 } else { /* Non-zero len */
1118 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1119 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1120 cl_and(data->start_class, &and_with);
1121 }
1122 else if (flags & SCF_DO_STCLASS_AND)
1123 cl_and(data->start_class, &this_class);
1124 flags &= ~SCF_DO_STCLASS;
1125 }
c277df42
IZ
1126 if (!scan) /* It was not CURLYX, but CURLY. */
1127 scan = next;
84037bb0 1128 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1129 /* ? quantifier ok, except for (?{ ... }) */
1130 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1131 && (minnext == 0) && (deltanext == 0)
99799961 1132 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1133 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1134 {
830247a4 1135 vWARN(RExC_parse,
b45f050a
JF
1136 "Quantifier unexpected on zero-length expression");
1137 }
1138
c277df42 1139 min += minnext * mincount;
b81d288d 1140 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1141 && (minnext + deltanext) > 0)
1142 || deltanext == I32_MAX);
aca2d497 1143 is_inf |= is_inf_internal;
c277df42
IZ
1144 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1145
1146 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1147 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1148 && data->flags & SF_IN_PAR
1149 && !(data->flags & SF_HAS_EVAL)
1150 && !deltanext && minnext == 1 ) {
1151 /* Try to optimize to CURLYN. */
1152 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1153 regnode *nxt1 = nxt;
1154#ifdef DEBUGGING
1155 regnode *nxt2;
1156#endif
c277df42
IZ
1157
1158 /* Skip open. */
1159 nxt = regnext(nxt);
4d61ec05 1160 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1161 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1162 && STR_LEN(nxt) == 1))
c277df42 1163 goto nogo;
497b47a8 1164#ifdef DEBUGGING
c277df42 1165 nxt2 = nxt;
497b47a8 1166#endif
c277df42 1167 nxt = regnext(nxt);
b81d288d 1168 if (OP(nxt) != CLOSE)
c277df42
IZ
1169 goto nogo;
1170 /* Now we know that nxt2 is the only contents: */
1171 oscan->flags = ARG(nxt);
1172 OP(oscan) = CURLYN;
1173 OP(nxt1) = NOTHING; /* was OPEN. */
1174#ifdef DEBUGGING
1175 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1176 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1177 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1178 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1179 OP(nxt + 1) = OPTIMIZED; /* was count. */
1180 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1181#endif
c277df42 1182 }
c277df42
IZ
1183 nogo:
1184
1185 /* Try optimization CURLYX => CURLYM. */
b81d288d 1186 if ( OP(oscan) == CURLYX && data
c277df42 1187 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1188 && !(data->flags & SF_HAS_EVAL)
1189 && !deltanext ) {
1190 /* XXXX How to optimize if data == 0? */
1191 /* Optimize to a simpler form. */
1192 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1193 regnode *nxt2;
1194
1195 OP(oscan) = CURLYM;
1196 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1197 && (OP(nxt2) != WHILEM))
c277df42
IZ
1198 nxt = nxt2;
1199 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1200 /* Need to optimize away parenths. */
1201 if (data->flags & SF_IN_PAR) {
1202 /* Set the parenth number. */
1203 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1204
b81d288d 1205 if (OP(nxt) != CLOSE)
b45f050a 1206 FAIL("Panic opt close");
c277df42
IZ
1207 oscan->flags = ARG(nxt);
1208 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1209 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1210#ifdef DEBUGGING
1211 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1212 OP(nxt + 1) = OPTIMIZED; /* was count. */
1213 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1214 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1215#endif
c277df42
IZ
1216#if 0
1217 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1218 regnode *nnxt = regnext(nxt1);
b81d288d 1219
c277df42
IZ
1220 if (nnxt == nxt) {
1221 if (reg_off_by_arg[OP(nxt1)])
1222 ARG_SET(nxt1, nxt2 - nxt1);
1223 else if (nxt2 - nxt1 < U16_MAX)
1224 NEXT_OFF(nxt1) = nxt2 - nxt1;
1225 else
1226 OP(nxt) = NOTHING; /* Cannot beautify */
1227 }
1228 nxt1 = nnxt;
1229 }
1230#endif
1231 /* Optimize again: */
b81d288d 1232 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1233 NULL, 0);
a0ed51b3
LW
1234 }
1235 else
c277df42 1236 oscan->flags = 0;
c277df42 1237 }
e1901655
IZ
1238 else if ((OP(oscan) == CURLYX)
1239 && (flags & SCF_WHILEM_VISITED_POS)
1240 /* See the comment on a similar expression above.
1241 However, this time it not a subexpression
1242 we care about, but the expression itself. */
1243 && (maxcount == REG_INFTY)
1244 && data && ++data->whilem_c < 16) {
1245 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1246 /* Find WHILEM (as in regexec.c) */
1247 regnode *nxt = oscan + NEXT_OFF(oscan);
1248
1249 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1250 nxt += ARG(nxt);
1251 PREVOPER(nxt)->flags = data->whilem_c
830247a4 1252 | (RExC_whilem_seen << 4); /* On WHILEM */
2c2d71f5 1253 }
b81d288d 1254 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1255 pars++;
1256 if (flags & SCF_DO_SUBSTR) {
1257 SV *last_str = Nullsv;
1258 int counted = mincount != 0;
1259
1260 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1261#if defined(SPARC64_GCC_WORKAROUND)
1262 I32 b = 0;
1263 STRLEN l = 0;
1264 char *s = NULL;
1265 I32 old = 0;
1266
1267 if (pos_before >= data->last_start_min)
1268 b = pos_before;
1269 else
1270 b = data->last_start_min;
1271
1272 l = 0;
1273 s = SvPV(data->last_found, l);
1274 old = b - data->last_start_min;
1275
1276#else
b81d288d 1277 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1278 ? pos_before : data->last_start_min;
1279 STRLEN l;
1280 char *s = SvPV(data->last_found, l);
a0ed51b3 1281 I32 old = b - data->last_start_min;
5d1c421c 1282#endif
a0ed51b3
LW
1283
1284 if (UTF)
1285 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1286
a0ed51b3 1287 l -= old;
c277df42 1288 /* Get the added string: */
79cb57f6 1289 last_str = newSVpvn(s + old, l);
c277df42
IZ
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1292 if (mincount > 1) {
1293 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1294 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
b81d288d 1298 SvCUR_set(data->last_found,
c277df42
IZ
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1301 data->last_end += l * (mincount - 1);
1302 }
2a8d9689
HS
1303 } else {
1304 /* start offset must point into the last copy */
1305 data->last_start_min += minnext * (mincount - 1);
4b2cff9a
HS
1306 data->last_start_max += is_inf ? 0 : (maxcount - 1)
1307 * (minnext + data->pos_delta);
c277df42
IZ
1308 }
1309 }
1310 /* It is counted once already... */
1311 data->pos_min += minnext * (mincount - counted);
1312 data->pos_delta += - counted * deltanext +
1313 (minnext + deltanext) * maxcount - minnext * mincount;
1314 if (mincount != maxcount) {
653099ff
GS
1315 /* Cannot extend fixed substrings found inside
1316 the group. */
830247a4 1317 scan_commit(pRExC_state,data);
c277df42
IZ
1318 if (mincount && last_str) {
1319 sv_setsv(data->last_found, last_str);
1320 data->last_end = data->pos_min;
b81d288d 1321 data->last_start_min =
a0ed51b3 1322 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1323 data->last_start_max = is_inf
1324 ? I32_MAX
c277df42 1325 : data->pos_min + data->pos_delta
a0ed51b3 1326 - CHR_SVLEN(last_str);
c277df42
IZ
1327 }
1328 data->longest = &(data->longest_float);
1329 }
aca2d497 1330 SvREFCNT_dec(last_str);
c277df42 1331 }
405ff068 1332 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1333 data->flags |= SF_HAS_EVAL;
1334 optimize_curly_tail:
c277df42 1335 if (OP(oscan) != CURLYX) {
22c35a8c 1336 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1337 && NEXT_OFF(next))
1338 NEXT_OFF(oscan) += NEXT_OFF(next);
1339 }
c277df42 1340 continue;
653099ff 1341 default: /* REF and CLUMP only? */
c277df42 1342 if (flags & SCF_DO_SUBSTR) {
830247a4 1343 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1344 data->longest = &(data->longest_float);
1345 }
aca2d497 1346 is_inf = is_inf_internal = 1;
653099ff 1347 if (flags & SCF_DO_STCLASS_OR)
830247a4 1348 cl_anything(pRExC_state, data->start_class);
653099ff 1349 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1350 break;
1351 }
a0ed51b3 1352 }
ffc61ed2 1353 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1354 int value = 0;
653099ff 1355
c277df42 1356 if (flags & SCF_DO_SUBSTR) {
830247a4 1357 scan_commit(pRExC_state,data);
c277df42
IZ
1358 data->pos_min++;
1359 }
1360 min++;
653099ff
GS
1361 if (flags & SCF_DO_STCLASS) {
1362 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1363
1364 /* Some of the logic below assumes that switching
1365 locale on will only add false positives. */
1366 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1367 case SANY:
653099ff
GS
1368 default:
1369 do_default:
1370 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1371 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1372 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1373 break;
1374 case REG_ANY:
1375 if (OP(scan) == SANY)
1376 goto do_default;
1377 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1378 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1379 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1380 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1381 }
1382 if (flags & SCF_DO_STCLASS_AND || !value)
1383 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1384 break;
1385 case ANYOF:
1386 if (flags & SCF_DO_STCLASS_AND)
1387 cl_and(data->start_class,
1388 (struct regnode_charclass_class*)scan);
1389 else
830247a4 1390 cl_or(pRExC_state, data->start_class,
653099ff
GS
1391 (struct regnode_charclass_class*)scan);
1392 break;
1393 case ALNUM:
1394 if (flags & SCF_DO_STCLASS_AND) {
1395 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1396 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1397 for (value = 0; value < 256; value++)
1398 if (!isALNUM(value))
1399 ANYOF_BITMAP_CLEAR(data->start_class, value);
1400 }
1401 }
1402 else {
1403 if (data->start_class->flags & ANYOF_LOCALE)
1404 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1405 else {
1406 for (value = 0; value < 256; value++)
1407 if (isALNUM(value))
b81d288d 1408 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1409 }
1410 }
1411 break;
1412 case ALNUML:
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (data->start_class->flags & ANYOF_LOCALE)
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 }
1417 else {
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 data->start_class->flags |= ANYOF_LOCALE;
1420 }
1421 break;
1422 case NALNUM:
1423 if (flags & SCF_DO_STCLASS_AND) {
1424 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1425 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1426 for (value = 0; value < 256; value++)
1427 if (isALNUM(value))
1428 ANYOF_BITMAP_CLEAR(data->start_class, value);
1429 }
1430 }
1431 else {
1432 if (data->start_class->flags & ANYOF_LOCALE)
1433 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1434 else {
1435 for (value = 0; value < 256; value++)
1436 if (!isALNUM(value))
b81d288d 1437 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1438 }
1439 }
1440 break;
1441 case NALNUML:
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (data->start_class->flags & ANYOF_LOCALE)
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 }
1446 else {
1447 data->start_class->flags |= ANYOF_LOCALE;
1448 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1449 }
1450 break;
1451 case SPACE:
1452 if (flags & SCF_DO_STCLASS_AND) {
1453 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1454 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1455 for (value = 0; value < 256; value++)
1456 if (!isSPACE(value))
1457 ANYOF_BITMAP_CLEAR(data->start_class, value);
1458 }
1459 }
1460 else {
1461 if (data->start_class->flags & ANYOF_LOCALE)
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1463 else {
1464 for (value = 0; value < 256; value++)
1465 if (isSPACE(value))
b81d288d 1466 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1467 }
1468 }
1469 break;
1470 case SPACEL:
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (data->start_class->flags & ANYOF_LOCALE)
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 }
1475 else {
1476 data->start_class->flags |= ANYOF_LOCALE;
1477 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1478 }
1479 break;
1480 case NSPACE:
1481 if (flags & SCF_DO_STCLASS_AND) {
1482 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1483 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1484 for (value = 0; value < 256; value++)
1485 if (isSPACE(value))
1486 ANYOF_BITMAP_CLEAR(data->start_class, value);
1487 }
1488 }
1489 else {
1490 if (data->start_class->flags & ANYOF_LOCALE)
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1492 else {
1493 for (value = 0; value < 256; value++)
1494 if (!isSPACE(value))
b81d288d 1495 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1496 }
1497 }
1498 break;
1499 case NSPACEL:
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (data->start_class->flags & ANYOF_LOCALE) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1504 if (!isSPACE(value))
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1506 }
1507 }
1508 else {
1509 data->start_class->flags |= ANYOF_LOCALE;
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1511 }
1512 break;
1513 case DIGIT:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1516 for (value = 0; value < 256; value++)
1517 if (!isDIGIT(value))
1518 ANYOF_BITMAP_CLEAR(data->start_class, value);
1519 }
1520 else {
1521 if (data->start_class->flags & ANYOF_LOCALE)
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1523 else {
1524 for (value = 0; value < 256; value++)
1525 if (isDIGIT(value))
b81d288d 1526 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1527 }
1528 }
1529 break;
1530 case NDIGIT:
1531 if (flags & SCF_DO_STCLASS_AND) {
1532 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1533 for (value = 0; value < 256; value++)
1534 if (isDIGIT(value))
1535 ANYOF_BITMAP_CLEAR(data->start_class, value);
1536 }
1537 else {
1538 if (data->start_class->flags & ANYOF_LOCALE)
1539 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1540 else {
1541 for (value = 0; value < 256; value++)
1542 if (!isDIGIT(value))
b81d288d 1543 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1544 }
1545 }
1546 break;
1547 }
1548 if (flags & SCF_DO_STCLASS_OR)
1549 cl_and(data->start_class, &and_with);
1550 flags &= ~SCF_DO_STCLASS;
1551 }
a0ed51b3 1552 }
22c35a8c 1553 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1554 data->flags |= (OP(scan) == MEOL
1555 ? SF_BEFORE_MEOL
1556 : SF_BEFORE_SEOL);
a0ed51b3 1557 }
653099ff
GS
1558 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1559 /* Lookbehind, or need to calculate parens/evals/stclass: */
1560 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1561 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1562 /* Lookahead/lookbehind */
cb434fcc 1563 I32 deltanext, minnext, fake = 0;
c277df42 1564 regnode *nscan;
653099ff
GS
1565 struct regnode_charclass_class intrnl;
1566 int f = 0;
c277df42
IZ
1567
1568 data_fake.flags = 0;
b81d288d 1569 if (data) {
2c2d71f5 1570 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1571 data_fake.last_closep = data->last_closep;
1572 }
1573 else
1574 data_fake.last_closep = &fake;
653099ff
GS
1575 if ( flags & SCF_DO_STCLASS && !scan->flags
1576 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1577 cl_init(pRExC_state, &intrnl);
653099ff 1578 data_fake.start_class = &intrnl;
e1901655 1579 f |= SCF_DO_STCLASS_AND;
653099ff 1580 }
e1901655
IZ
1581 if (flags & SCF_WHILEM_VISITED_POS)
1582 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1583 next = regnext(scan);
1584 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1585 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1586 if (scan->flags) {
1587 if (deltanext) {
9baa0206 1588 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1589 }
1590 else if (minnext > U8_MAX) {
9baa0206 1591 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42
IZ
1592 }
1593 scan->flags = minnext;
1594 }
1595 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1596 pars++;
405ff068 1597 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1598 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1599 if (data)
1600 data->whilem_c = data_fake.whilem_c;
e1901655 1601 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1602 int was = (data->start_class->flags & ANYOF_EOS);
1603
1604 cl_and(data->start_class, &intrnl);
1605 if (was)
1606 data->start_class->flags |= ANYOF_EOS;
1607 }
a0ed51b3
LW
1608 }
1609 else if (OP(scan) == OPEN) {
c277df42 1610 pars++;
a0ed51b3 1611 }
cb434fcc
IZ
1612 else if (OP(scan) == CLOSE) {
1613 if (ARG(scan) == is_par) {
1614 next = regnext(scan);
c277df42 1615
cb434fcc
IZ
1616 if ( next && (OP(next) != WHILEM) && next < last)
1617 is_par = 0; /* Disable optimization */
1618 }
1619 if (data)
1620 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1621 }
1622 else if (OP(scan) == EVAL) {
c277df42
IZ
1623 if (data)
1624 data->flags |= SF_HAS_EVAL;
1625 }
96776eda 1626 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1627 if (flags & SCF_DO_SUBSTR) {
830247a4 1628 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1629 data->longest = &(data->longest_float);
1630 }
1631 is_inf = is_inf_internal = 1;
653099ff 1632 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1633 cl_anything(pRExC_state, data->start_class);
96776eda 1634 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1635 }
c277df42
IZ
1636 /* Else: zero-length, ignore. */
1637 scan = regnext(scan);
1638 }
1639
1640 finish:
1641 *scanp = scan;
aca2d497 1642 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1643 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1644 data->pos_delta = I32_MAX - data->pos_min;
1645 if (is_par > U8_MAX)
1646 is_par = 0;
1647 if (is_par && pars==1 && data) {
1648 data->flags |= SF_IN_PAR;
1649 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1650 }
1651 else if (pars && data) {
c277df42
IZ
1652 data->flags |= SF_HAS_PAR;
1653 data->flags &= ~SF_IN_PAR;
1654 }
653099ff
GS
1655 if (flags & SCF_DO_STCLASS_OR)
1656 cl_and(data->start_class, &and_with);
c277df42
IZ
1657 return min;
1658}
1659
76e3520e 1660STATIC I32
830247a4 1661S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1662{
830247a4 1663 if (RExC_rx->data) {
b81d288d
AB
1664 Renewc(RExC_rx->data,
1665 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1666 char, struct reg_data);
830247a4
IZ
1667 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1668 RExC_rx->data->count += n;
a0ed51b3
LW
1669 }
1670 else {
830247a4 1671 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1672 char, struct reg_data);
830247a4
IZ
1673 New(1208, RExC_rx->data->what, n, U8);
1674 RExC_rx->data->count = n;
c277df42 1675 }
830247a4
IZ
1676 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1677 return RExC_rx->data->count - n;
c277df42
IZ
1678}
1679
d88dccdf 1680void
864dbfa3 1681Perl_reginitcolors(pTHX)
d88dccdf 1682{
d88dccdf
IZ
1683 int i = 0;
1684 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1685
d88dccdf
IZ
1686 if (s) {
1687 PL_colors[0] = s = savepv(s);
1688 while (++i < 6) {
1689 s = strchr(s, '\t');
1690 if (s) {
1691 *s = '\0';
1692 PL_colors[i] = ++s;
1693 }
1694 else
c712d376 1695 PL_colors[i] = s = "";
d88dccdf
IZ
1696 }
1697 } else {
b81d288d 1698 while (i < 6)
d88dccdf
IZ
1699 PL_colors[i++] = "";
1700 }
1701 PL_colorset = 1;
1702}
1703
8615cb43 1704
a687059c 1705/*
e50aee73 1706 - pregcomp - compile a regular expression into internal code
a687059c
LW
1707 *
1708 * We can't allocate space until we know how big the compiled form will be,
1709 * but we can't compile it (and thus know how big it is) until we've got a
1710 * place to put the code. So we cheat: we compile it twice, once with code
1711 * generation turned off and size counting turned on, and once "for real".
1712 * This also means that we don't allocate space until we are sure that the
1713 * thing really will compile successfully, and we never have to move the
1714 * code and thus invalidate pointers into it. (Note that it has to be in
1715 * one piece because free() must be able to free it all.) [NB: not true in perl]
1716 *
1717 * Beware that the optimization-preparation code in here knows about some
1718 * of the structure of the compiled regexp. [I'll say.]
1719 */
1720regexp *
864dbfa3 1721Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1722{
a0d0e21e 1723 register regexp *r;
c277df42 1724 regnode *scan;
c277df42 1725 regnode *first;
a0d0e21e 1726 I32 flags;
a0d0e21e
LW
1727 I32 minlen = 0;
1728 I32 sawplus = 0;
1729 I32 sawopen = 0;
2c2d71f5 1730 scan_data_t data;
830247a4
IZ
1731 RExC_state_t RExC_state;
1732 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1733
1734 if (exp == NULL)
c277df42 1735 FAIL("NULL regexp argument");
a0d0e21e 1736
a5961de5 1737 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1738
5cfc7842 1739 RExC_precomp = exp;
a5961de5
JH
1740 DEBUG_r({
1741 if (!PL_colorset) reginitcolors();
1742 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1743 PL_colors[4],PL_colors[5],PL_colors[0],
1744 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1745 });
830247a4
IZ
1746 RExC_flags16 = pm->op_pmflags;
1747 RExC_sawback = 0;
bbce6d69 1748
830247a4
IZ
1749 RExC_seen = 0;
1750 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1751 RExC_seen_evals = 0;
1752 RExC_extralen = 0;
c277df42 1753
bbce6d69 1754 /* First pass: determine size, legality. */
830247a4 1755 RExC_parse = exp;
fac92740 1756 RExC_start = exp;
830247a4
IZ
1757 RExC_end = xend;
1758 RExC_naughty = 0;
1759 RExC_npar = 1;
1760 RExC_size = 0L;
1761 RExC_emit = &PL_regdummy;
1762 RExC_whilem_seen = 0;
85ddcde9
JH
1763#if 0 /* REGC() is (currently) a NOP at the first pass.
1764 * Clever compilers notice this and complain. --jhi */
830247a4 1765 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1766#endif
830247a4 1767 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1768 RExC_precomp = Nullch;
a0d0e21e
LW
1769 return(NULL);
1770 }
830247a4 1771 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1772
c277df42
IZ
1773 /* Small enough for pointer-storage convention?
1774 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1775 if (RExC_size >= 0x10000L && RExC_extralen)
1776 RExC_size += RExC_extralen;
c277df42 1777 else
830247a4
IZ
1778 RExC_extralen = 0;
1779 if (RExC_whilem_seen > 15)
1780 RExC_whilem_seen = 15;
a0d0e21e 1781
bbce6d69 1782 /* Allocate space and initialize. */
830247a4 1783 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1784 char, regexp);
a0d0e21e 1785 if (r == NULL)
b45f050a
JF
1786 FAIL("Regexp out of space");
1787
0f79a09d
GS
1788#ifdef DEBUGGING
1789 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1790 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1791#endif
c277df42 1792 r->refcnt = 1;
bbce6d69 1793 r->prelen = xend - exp;
5cfc7842 1794 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d
IZ
1795 r->subbeg = NULL;
1796 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1797 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1798
1799 r->substrs = 0; /* Useful during FAIL. */
1800 r->startp = 0; /* Useful during FAIL. */
1801 r->endp = 0; /* Useful during FAIL. */
1802
fac92740
MJD
1803 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1804 if (r->offsets) {
1805 r->offsets[0] = RExC_size;
1806 }
1807 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1808 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1809 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1810 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1811
830247a4 1812 RExC_rx = r;
bbce6d69 1813
1814 /* Second pass: emit code. */
055bb491 1815 RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1816 RExC_parse = exp;
1817 RExC_end = xend;
1818 RExC_naughty = 0;
1819 RExC_npar = 1;
fac92740 1820 RExC_emit_start = r->program;
830247a4 1821 RExC_emit = r->program;
2cd61cdb 1822 /* Store the count of eval-groups for security checks: */
830247a4
IZ
1823 RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1824 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1825 r->data = 0;
830247a4 1826 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1827 return(NULL);
1828
1829 /* Dig out information for optimizations. */
cf93c79d 1830 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
830247a4 1831 pm->op_pmflags = RExC_flags16;
a0ed51b3 1832 if (UTF)
5ff6fc6d 1833 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1834 r->regstclass = NULL;
830247a4 1835 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1836 r->reganch |= ROPT_NAUGHTY;
c277df42 1837 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1838
1839 /* XXXX To minimize changes to RE engine we always allocate
1840 3-units-long substrs field. */
1841 Newz(1004, r->substrs, 1, struct reg_substr_data);
1842
2c2d71f5 1843 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1844 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1845 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1846 I32 fake;
c5254dd6 1847 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1848 struct regnode_charclass_class ch_class;
1849 int stclass_flag;
cb434fcc 1850 I32 last_close = 0;
a0d0e21e
LW
1851
1852 first = scan;
c277df42 1853 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1854 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1855 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1856 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1857 (OP(first) == PLUS) ||
1858 (OP(first) == MINMOD) ||
653099ff 1859 /* An {n,m} with n>0 */
22c35a8c 1860 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1861 if (OP(first) == PLUS)
1862 sawplus = 1;
1863 else
1864 first += regarglen[(U8)OP(first)];
1865 first = NEXTOPER(first);
a687059c
LW
1866 }
1867
a0d0e21e
LW
1868 /* Starting-point info. */
1869 again:
653099ff 1870 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1871 if (OP(first) == EXACT)
1872 ; /* Empty, get anchored substr later. */
1873 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1874 r->regstclass = first;
1875 }
653099ff 1876 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1877 r->regstclass = first;
22c35a8c
GS
1878 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1879 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1880 r->regstclass = first;
22c35a8c 1881 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1882 r->reganch |= (OP(first) == MBOL
1883 ? ROPT_ANCH_MBOL
1884 : (OP(first) == SBOL
1885 ? ROPT_ANCH_SBOL
1886 : ROPT_ANCH_BOL));
a0d0e21e 1887 first = NEXTOPER(first);
774d564b 1888 goto again;
1889 }
1890 else if (OP(first) == GPOS) {
1891 r->reganch |= ROPT_ANCH_GPOS;
1892 first = NEXTOPER(first);
1893 goto again;
a0d0e21e 1894 }
e09294f4 1895 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1896 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1897 !(r->reganch & ROPT_ANCH) )
1898 {
1899 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1900 int type = OP(NEXTOPER(first));
1901
ffc61ed2 1902 if (type == REG_ANY)
cad2e5aa
JH
1903 type = ROPT_ANCH_MBOL;
1904 else
1905 type = ROPT_ANCH_SBOL;
1906
1907 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1908 first = NEXTOPER(first);
774d564b 1909 goto again;
a0d0e21e 1910 }
b81d288d 1911 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1912 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1913 /* x+ must match at the 1st pos of run of x's */
1914 r->reganch |= ROPT_SKIP;
a0d0e21e 1915
c277df42 1916 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1917 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1918 (IV)(first - scan + 1)));
a0d0e21e
LW
1919 /*
1920 * If there's something expensive in the r.e., find the
1921 * longest literal string that must appear and make it the
1922 * regmust. Resolve ties in favor of later strings, since
1923 * the regstart check works with the beginning of the r.e.
1924 * and avoiding duplication strengthens checking. Not a
1925 * strong reason, but sufficient in the absence of others.
1926 * [Now we resolve ties in favor of the earlier string if
c277df42 1927 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1928 * earlier string may buy us something the later one won't.]
1929 */
a0d0e21e 1930 minlen = 0;
a687059c 1931
79cb57f6
GS
1932 data.longest_fixed = newSVpvn("",0);
1933 data.longest_float = newSVpvn("",0);
1934 data.last_found = newSVpvn("",0);
c277df42
IZ
1935 data.longest = &(data.longest_fixed);
1936 first = scan;
653099ff 1937 if (!r->regstclass) {
830247a4 1938 cl_init(pRExC_state, &ch_class);
653099ff
GS
1939 data.start_class = &ch_class;
1940 stclass_flag = SCF_DO_STCLASS_AND;
1941 } else /* XXXX Check for BOUND? */
1942 stclass_flag = 0;
cb434fcc 1943 data.last_closep = &last_close;
653099ff 1944
830247a4 1945 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1946 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1947 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1948 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1949 && !RExC_seen_zerolen
1950 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1951 r->reganch |= ROPT_CHECK_ALL;
830247a4 1952 scan_commit(pRExC_state, &data);
c277df42
IZ
1953 SvREFCNT_dec(data.last_found);
1954
a0ed51b3 1955 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1956 if (longest_float_length
c277df42
IZ
1957 || (data.flags & SF_FL_BEFORE_EOL
1958 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1959 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1960 int t;
1961
a0ed51b3 1962 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1963 && data.offset_fixed == data.offset_float_min
1964 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1965 goto remove_float; /* As in (a)+. */
1966
c277df42
IZ
1967 r->float_substr = data.longest_float;
1968 r->float_min_offset = data.offset_float_min;
1969 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1970 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1971 && (!(data.flags & SF_FL_BEFORE_MEOL)
830247a4 1972 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1973 fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1974 }
1975 else {
aca2d497 1976 remove_float:
c277df42
IZ
1977 r->float_substr = Nullsv;
1978 SvREFCNT_dec(data.longest_float);
c5254dd6 1979 longest_float_length = 0;
a0d0e21e 1980 }
c277df42 1981
a0ed51b3 1982 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 1983 if (longest_fixed_length
c277df42
IZ
1984 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1985 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1986 || (RExC_flags16 & PMf_MULTILINE)))) {
cf93c79d
IZ
1987 int t;
1988
c277df42
IZ
1989 r->anchored_substr = data.longest_fixed;
1990 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
1991 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FIX_BEFORE_MEOL)
830247a4 1993 || (RExC_flags16 & PMf_MULTILINE)));
cf93c79d 1994 fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1995 }
1996 else {
c277df42
IZ
1997 r->anchored_substr = Nullsv;
1998 SvREFCNT_dec(data.longest_fixed);
c5254dd6 1999 longest_fixed_length = 0;
a0d0e21e 2000 }
b81d288d 2001 if (r->regstclass
ffc61ed2 2002 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff
GS
2003 r->regstclass = NULL;
2004 if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
2005 && !(data.start_class->flags & ANYOF_EOS)
2006 && !cl_is_anything(data.start_class)) {
830247a4 2007 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2008
b81d288d 2009 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2010 struct regnode_charclass_class);
2011 StructCopy(data.start_class,
830247a4 2012 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2013 struct regnode_charclass_class);
830247a4 2014 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2015 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2016 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2017 DEBUG_r({ SV *sv = sv_newmortal();
2018 regprop(sv, (regnode*)data.start_class);
2019 PerlIO_printf(Perl_debug_log,
2020 "synthetic stclass `%s'.\n",
2021 SvPVX(sv));});
653099ff 2022 }
c277df42
IZ
2023
2024 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2025 if (longest_fixed_length > longest_float_length) {
c277df42
IZ
2026 r->check_substr = r->anchored_substr;
2027 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2028 if (r->reganch & ROPT_ANCH_SINGLE)
2029 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2030 }
2031 else {
c277df42
IZ
2032 r->check_substr = r->float_substr;
2033 r->check_offset_min = data.offset_float_min;
2034 r->check_offset_max = data.offset_float_max;
a0d0e21e 2035 }
30382c73
IZ
2036 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2037 This should be changed ASAP! */
2038 if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa
JH
2039 r->reganch |= RE_USE_INTUIT;
2040 if (SvTAIL(r->check_substr))
2041 r->reganch |= RE_INTUIT_TAIL;
2042 }
a0ed51b3
LW
2043 }
2044 else {
c277df42
IZ
2045 /* Several toplevels. Best we can is to set minlen. */
2046 I32 fake;
653099ff 2047 struct regnode_charclass_class ch_class;
cb434fcc 2048 I32 last_close = 0;
c277df42
IZ
2049
2050 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2051 scan = r->program + 1;
830247a4 2052 cl_init(pRExC_state, &ch_class);
653099ff 2053 data.start_class = &ch_class;
cb434fcc 2054 data.last_closep = &last_close;
e1901655 2055 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
c277df42 2056 r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
653099ff
GS
2057 if (!(data.start_class->flags & ANYOF_EOS)
2058 && !cl_is_anything(data.start_class)) {
830247a4 2059 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2060
b81d288d 2061 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2062 struct regnode_charclass_class);
2063 StructCopy(data.start_class,
830247a4 2064 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2065 struct regnode_charclass_class);
830247a4 2066 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2067 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2068 DEBUG_r({ SV* sv = sv_newmortal();
2069 regprop(sv, (regnode*)data.start_class);
2070 PerlIO_printf(Perl_debug_log,
2071 "synthetic stclass `%s'.\n",
2072 SvPVX(sv));});
653099ff 2073 }
a0d0e21e
LW
2074 }
2075
a0d0e21e 2076 r->minlen = minlen;
b81d288d 2077 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2078 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2079 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2080 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2081 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2082 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2083 if (RExC_seen & REG_SEEN_CANY)
2084 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2085 Newz(1002, r->startp, RExC_npar, I32);
2086 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2087 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2088 DEBUG_r(regdump(r));
2089 return(r);
a687059c
LW
2090}
2091
2092/*
2093 - reg - regular expression, i.e. main body or parenthesized thing
2094 *
2095 * Caller must absorb opening parenthesis.
2096 *
2097 * Combining parenthesis handling with the base level of regular expression
2098 * is a trifle forced, but the need to tie the tails of the branches to what
2099 * follows makes it hard to avoid.
2100 */
76e3520e 2101STATIC regnode *
830247a4 2102S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2103 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2104{
c277df42
IZ
2105 register regnode *ret; /* Will be the head of the group. */
2106 register regnode *br;
2107 register regnode *lastbr;
2108 register regnode *ender = 0;
a0d0e21e 2109 register I32 parno = 0;
830247a4 2110 I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0;
9d1d55b5
JP
2111
2112 /* for (?g), (?gc), and (?o) warnings; warning
2113 about (?c) will warn about (?g) -- japhy */
2114
2115 I32 wastedflags = 0x00,
2116 wasted_o = 0x01,
2117 wasted_g = 0x02,
2118 wasted_gc = 0x02 | 0x04,
2119 wasted_c = 0x04;
2120
fac92740 2121 char * parse_start = RExC_parse; /* MJD */
830247a4 2122 char *oregcomp_parse = RExC_parse;
c277df42 2123 char c;
a0d0e21e 2124
821b33a5 2125 *flagp = 0; /* Tentatively. */
a0d0e21e 2126
9d1d55b5 2127
a0d0e21e
LW
2128 /* Make an OPEN node, if parenthesized. */
2129 if (paren) {
fac92740 2130 if (*RExC_parse == '?') { /* (?...) */
ca9dfc88
IZ
2131 U16 posflags = 0, negflags = 0;
2132 U16 *flagsp = &posflags;
0f5d15d6 2133 int logical = 0;
830247a4 2134 char *seqstart = RExC_parse;
ca9dfc88 2135
830247a4
IZ
2136 RExC_parse++;
2137 paren = *RExC_parse++;
c277df42 2138 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2139 switch (paren) {
fac92740 2140 case '<': /* (?<...) */
830247a4 2141 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2142 if (*RExC_parse == '!')
c277df42 2143 paren = ',';
b81d288d 2144 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2145 goto unknown;
830247a4 2146 RExC_parse++;
fac92740
MJD
2147 case '=': /* (?=...) */
2148 case '!': /* (?!...) */
830247a4 2149 RExC_seen_zerolen++;
fac92740
MJD
2150 case ':': /* (?:...) */
2151 case '>': /* (?>...) */
a0d0e21e 2152 break;
fac92740
MJD
2153 case '$': /* (?$...) */
2154 case '@': /* (?@...) */
8615cb43 2155 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2156 break;
fac92740 2157 case '#': /* (?#...) */
830247a4
IZ
2158 while (*RExC_parse && *RExC_parse != ')')
2159 RExC_parse++;
2160 if (*RExC_parse != ')')
c277df42 2161 FAIL("Sequence (?#... not terminated");
830247a4 2162 nextchar(pRExC_state);
a0d0e21e
LW
2163 *flagp = TRYAGAIN;
2164 return NULL;
fac92740 2165 case 'p': /* (?p...) */
f9373011
PM
2166 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2167 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2168 /* FALL THROUGH*/
fac92740 2169 case '?': /* (??...) */
0f5d15d6 2170 logical = 1;
830247a4 2171 paren = *RExC_parse++;
0f5d15d6 2172 /* FALL THROUGH */
fac92740 2173 case '{': /* (?{...}) */
c277df42 2174 {
c277df42
IZ
2175 I32 count = 1, n = 0;
2176 char c;
830247a4 2177 char *s = RExC_parse;
c277df42
IZ
2178 SV *sv;
2179 OP_4tree *sop, *rop;
2180
830247a4
IZ
2181 RExC_seen_zerolen++;
2182 RExC_seen |= REG_SEEN_EVAL;
2183 while (count && (c = *RExC_parse)) {
2184 if (c == '\\' && RExC_parse[1])
2185 RExC_parse++;
b81d288d 2186 else if (c == '{')
c277df42 2187 count++;
b81d288d 2188 else if (c == '}')
c277df42 2189 count--;
830247a4 2190 RExC_parse++;
c277df42 2191 }
830247a4 2192 if (*RExC_parse != ')')
b45f050a 2193 {
b81d288d 2194 RExC_parse = s;
b45f050a
JF
2195 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2196 }
c277df42
IZ
2197 if (!SIZE_ONLY) {
2198 AV *av;
b81d288d
AB
2199
2200 if (RExC_parse - 1 - s)
830247a4 2201 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2202 else
79cb57f6 2203 sv = newSVpvn("", 0);
c277df42 2204
569233ed
SB
2205 ENTER;
2206 Perl_save_re_context(aTHX);
c277df42 2207 rop = sv_compile_2op(sv, &sop, "re", &av);
9b978d73
DM
2208 sop->op_private |= OPpREFCOUNTED;
2209 /* re_dup will OpREFCNT_inc */
2210 OpREFCNT_set(sop, 1);
569233ed 2211 LEAVE;
c277df42 2212
830247a4
IZ
2213 n = add_data(pRExC_state, 3, "nop");
2214 RExC_rx->data->data[n] = (void*)rop;
2215 RExC_rx->data->data[n+1] = (void*)sop;
2216 RExC_rx->data->data[n+2] = (void*)av;
c277df42 2217 SvREFCNT_dec(sv);
a0ed51b3 2218 }
e24b16f9 2219 else { /* First pass */
830247a4 2220 if (PL_reginterp_cnt < ++RExC_seen_evals
e24b16f9 2221 && PL_curcop != &PL_compiling)
2cd61cdb
IZ
2222 /* No compiled RE interpolated, has runtime
2223 components ===> unsafe. */
2224 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2225 if (PL_tainting && PL_tainted)
cc6b7395 2226 FAIL("Eval-group in insecure regular expression");
c277df42
IZ
2227 }
2228
830247a4 2229 nextchar(pRExC_state);
0f5d15d6 2230 if (logical) {
830247a4 2231 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2232 if (!SIZE_ONLY)
2233 ret->flags = 2;
830247a4 2234 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2235 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2236 return ret;
2237 }
830247a4 2238 return reganode(pRExC_state, EVAL, n);
c277df42 2239 }
fac92740 2240 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2241 {
fac92740 2242 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2243 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2244 || RExC_parse[1] == '<'
830247a4 2245 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2246 I32 flag;
2247
830247a4 2248 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2249 if (!SIZE_ONLY)
2250 ret->flags = 1;
830247a4 2251 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2252 goto insert_if;
b81d288d 2253 }
a0ed51b3 2254 }
830247a4 2255 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2256 /* (?(1)...) */
830247a4 2257 parno = atoi(RExC_parse++);
c277df42 2258
830247a4
IZ
2259 while (isDIGIT(*RExC_parse))
2260 RExC_parse++;
fac92740
MJD
2261 ret = reganode(pRExC_state, GROUPP, parno);
2262
830247a4 2263 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2264 vFAIL("Switch condition not recognized");
c277df42 2265 insert_if:
830247a4
IZ
2266 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2267 br = regbranch(pRExC_state, &flags, 1);
c277df42 2268 if (br == NULL)
830247a4 2269 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2270 else
830247a4
IZ
2271 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2272 c = *nextchar(pRExC_state);
d1b80229
IZ
2273 if (flags&HASWIDTH)
2274 *flagp |= HASWIDTH;
c277df42 2275 if (c == '|') {
830247a4
IZ
2276 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2277 regbranch(pRExC_state, &flags, 1);
2278 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2279 if (flags&HASWIDTH)
2280 *flagp |= HASWIDTH;
830247a4 2281 c = *nextchar(pRExC_state);
a0ed51b3
LW
2282 }
2283 else
c277df42
IZ
2284 lastbr = NULL;
2285 if (c != ')')
8615cb43 2286 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2287 ender = reg_node(pRExC_state, TAIL);
2288 regtail(pRExC_state, br, ender);
c277df42 2289 if (lastbr) {
830247a4
IZ
2290 regtail(pRExC_state, lastbr, ender);
2291 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2292 }
2293 else
830247a4 2294 regtail(pRExC_state, ret, ender);
c277df42 2295 return ret;
a0ed51b3
LW
2296 }
2297 else {
830247a4 2298 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2299 }
2300 }
1b1626e4 2301 case 0:
830247a4 2302 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2303 vFAIL("Sequence (? incomplete");
1b1626e4 2304 break;
a0d0e21e 2305 default:
830247a4 2306 --RExC_parse;
fac92740 2307 parse_flags: /* (?i) */
830247a4 2308 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2309 /* (?g), (?gc) and (?o) are useless here
2310 and must be globally applied -- japhy */
2311
2312 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2313 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2314 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2315 if (! (wastedflags & wflagbit) ) {
2316 wastedflags |= wflagbit;
2317 vWARN5(
2318 RExC_parse + 1,
2319 "Useless (%s%c) - %suse /%c modifier",
2320 flagsp == &negflags ? "?-" : "?",
2321 *RExC_parse,
2322 flagsp == &negflags ? "don't " : "",
2323 *RExC_parse
2324 );
2325 }
2326 }
2327 }
2328 else if (*RExC_parse == 'c') {
2329 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2330 if (! (wastedflags & wasted_c) ) {
2331 wastedflags |= wasted_gc;
2332 vWARN3(
2333 RExC_parse + 1,
2334 "Useless (%sc) - %suse /gc modifier",
2335 flagsp == &negflags ? "?-" : "?",
2336 flagsp == &negflags ? "don't " : ""
2337 );
2338 }
2339 }
2340 }
2341 else { pmflag(flagsp, *RExC_parse); }
2342
830247a4 2343 ++RExC_parse;
ca9dfc88 2344 }
830247a4 2345 if (*RExC_parse == '-') {
ca9dfc88 2346 flagsp = &negflags;
9d1d55b5 2347 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2348 ++RExC_parse;
ca9dfc88 2349 goto parse_flags;
48c036b1 2350 }
830247a4
IZ
2351 RExC_flags16 |= posflags;
2352 RExC_flags16 &= ~negflags;
2353 if (*RExC_parse == ':') {
2354 RExC_parse++;
ca9dfc88
IZ
2355 paren = ':';
2356 break;
2357 }
c277df42 2358 unknown:
830247a4
IZ
2359 if (*RExC_parse != ')') {
2360 RExC_parse++;
2361 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2362 }
830247a4 2363 nextchar(pRExC_state);
a0d0e21e
LW
2364 *flagp = TRYAGAIN;
2365 return NULL;
2366 }
2367 }
fac92740 2368 else { /* (...) */
830247a4
IZ
2369 parno = RExC_npar;
2370 RExC_npar++;
2371 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2372 Set_Node_Length(ret, 1); /* MJD */
2373 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2374 open = 1;
a0d0e21e 2375 }
a0ed51b3 2376 }
fac92740 2377 else /* ! paren */
a0d0e21e
LW
2378 ret = NULL;
2379
2380 /* Pick up the branches, linking them together. */
fac92740 2381 parse_start = RExC_parse; /* MJD */
830247a4 2382 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2383 /* branch_len = (paren != 0); */
2384
a0d0e21e
LW
2385 if (br == NULL)
2386 return(NULL);
830247a4
IZ
2387 if (*RExC_parse == '|') {
2388 if (!SIZE_ONLY && RExC_extralen) {
2389 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2390 }
fac92740 2391 else { /* MJD */
830247a4 2392 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2393 Set_Node_Length(br, paren != 0);
2394 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2395 }
c277df42
IZ
2396 have_branch = 1;
2397 if (SIZE_ONLY)
830247a4 2398 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2399 }
2400 else if (paren == ':') {
c277df42
IZ
2401 *flagp |= flags&SIMPLE;
2402 }
2403 if (open) { /* Starts with OPEN. */
830247a4 2404 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2405 }
2406 else if (paren != '?') /* Not Conditional */
a0d0e21e 2407 ret = br;
32a0ca98 2408 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2409 lastbr = br;
830247a4
IZ
2410 while (*RExC_parse == '|') {
2411 if (!SIZE_ONLY && RExC_extralen) {
2412 ender = reganode(pRExC_state, LONGJMP,0);
2413 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2414 }
2415 if (SIZE_ONLY)
830247a4
IZ
2416 RExC_extralen += 2; /* Account for LONGJMP. */
2417 nextchar(pRExC_state);
2418 br = regbranch(pRExC_state, &flags, 0);
fac92740 2419
a687059c 2420 if (br == NULL)
a0d0e21e 2421 return(NULL);
830247a4 2422 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2423 lastbr = br;
821b33a5
IZ
2424 if (flags&HASWIDTH)
2425 *flagp |= HASWIDTH;
a687059c 2426 *flagp |= flags&SPSTART;
a0d0e21e
LW
2427 }
2428
c277df42
IZ
2429 if (have_branch || paren != ':') {
2430 /* Make a closing node, and hook it on the end. */
2431 switch (paren) {
2432 case ':':
830247a4 2433 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2434 break;
2435 case 1:
830247a4 2436 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2437 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2438 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2439 break;
2440 case '<':
c277df42
IZ
2441 case ',':
2442 case '=':
2443 case '!':
c277df42 2444 *flagp &= ~HASWIDTH;
821b33a5
IZ
2445 /* FALL THROUGH */
2446 case '>':
830247a4 2447 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2448 break;
2449 case 0:
830247a4 2450 ender = reg_node(pRExC_state, END);
c277df42
IZ
2451 break;
2452 }
830247a4 2453 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2454
c277df42
IZ
2455 if (have_branch) {
2456 /* Hook the tails of the branches to the closing node. */
2457 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2458 regoptail(pRExC_state, br, ender);
c277df42
IZ
2459 }
2460 }
a0d0e21e 2461 }
c277df42
IZ
2462
2463 {
2464 char *p;
2465 static char parens[] = "=!<,>";
2466
2467 if (paren && (p = strchr(parens, paren))) {
2468 int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2469 int flag = (p - parens) > 1;
2470
2471 if (paren == '>')
2472 node = SUSPEND, flag = 0;
830247a4 2473 reginsert(pRExC_state, node,ret);
c277df42 2474 ret->flags = flag;
830247a4 2475 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2476 }
a0d0e21e
LW
2477 }
2478
2479 /* Check for proper termination. */
ce3e6498 2480 if (paren) {
830247a4
IZ
2481 RExC_flags16 = oregflags;
2482 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2483 RExC_parse = oregcomp_parse;
380a0633 2484 vFAIL("Unmatched (");
ce3e6498 2485 }
a0ed51b3 2486 }
830247a4
IZ
2487 else if (!paren && RExC_parse < RExC_end) {
2488 if (*RExC_parse == ')') {
2489 RExC_parse++;
380a0633 2490 vFAIL("Unmatched )");
a0ed51b3
LW
2491 }
2492 else
b45f050a 2493 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2494 /* NOTREACHED */
2495 }
a687059c 2496
a0d0e21e 2497 return(ret);
a687059c
LW
2498}
2499
2500/*
2501 - regbranch - one alternative of an | operator
2502 *
2503 * Implements the concatenation operator.
2504 */
76e3520e 2505STATIC regnode *
830247a4 2506S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2507{
c277df42
IZ
2508 register regnode *ret;
2509 register regnode *chain = NULL;
2510 register regnode *latest;
2511 I32 flags = 0, c = 0;
a0d0e21e 2512
b81d288d 2513 if (first)
c277df42
IZ
2514 ret = NULL;
2515 else {
b81d288d 2516 if (!SIZE_ONLY && RExC_extralen)
830247a4 2517 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2518 else {
830247a4 2519 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2520 Set_Node_Length(ret, 1);
2521 }
c277df42
IZ
2522 }
2523
b81d288d 2524 if (!first && SIZE_ONLY)
830247a4 2525 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2526
c277df42 2527 *flagp = WORST; /* Tentatively. */
a0d0e21e 2528
830247a4
IZ
2529 RExC_parse--;
2530 nextchar(pRExC_state);
2531 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2532 flags &= ~TRYAGAIN;
830247a4 2533 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2534 if (latest == NULL) {
2535 if (flags & TRYAGAIN)
2536 continue;
2537 return(NULL);
a0ed51b3
LW
2538 }
2539 else if (ret == NULL)
c277df42 2540 ret = latest;
a0d0e21e 2541 *flagp |= flags&HASWIDTH;
c277df42 2542 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2543 *flagp |= flags&SPSTART;
2544 else {
830247a4
IZ
2545 RExC_naughty++;
2546 regtail(pRExC_state, chain, latest);
a687059c 2547 }
a0d0e21e 2548 chain = latest;
c277df42
IZ
2549 c++;
2550 }
2551 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2552 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2553 if (ret == NULL)
2554 ret = chain;
2555 }
2556 if (c == 1) {
2557 *flagp |= flags&SIMPLE;
a0d0e21e 2558 }
a687059c 2559
a0d0e21e 2560 return(ret);
a687059c
LW
2561}
2562
2563/*
2564 - regpiece - something followed by possible [*+?]
2565 *
2566 * Note that the branching code sequences used for ? and the general cases
2567 * of * and + are somewhat optimized: they use the same NOTHING node as
2568 * both the endmarker for their branch list and the body of the last branch.
2569 * It might seem that this node could be dispensed with entirely, but the
2570 * endmarker role is not redundant.
2571 */
76e3520e 2572STATIC regnode *
830247a4 2573S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2574{
c277df42 2575 register regnode *ret;
a0d0e21e
LW
2576 register char op;
2577 register char *next;
2578 I32 flags;
830247a4 2579 char *origparse = RExC_parse;
a0d0e21e
LW
2580 char *maxpos;
2581 I32 min;
c277df42 2582 I32 max = REG_INFTY;
fac92740 2583 char *parse_start;
a0d0e21e 2584
830247a4 2585 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2586 if (ret == NULL) {
2587 if (flags & TRYAGAIN)
2588 *flagp |= TRYAGAIN;
2589 return(NULL);
2590 }
2591
830247a4 2592 op = *RExC_parse;
a0d0e21e 2593
830247a4 2594 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2595 parse_start = RExC_parse; /* MJD */
830247a4 2596 next = RExC_parse + 1;
a0d0e21e
LW
2597 maxpos = Nullch;
2598 while (isDIGIT(*next) || *next == ',') {
2599 if (*next == ',') {
2600 if (maxpos)
2601 break;
2602 else
2603 maxpos = next;
a687059c 2604 }
a0d0e21e
LW
2605 next++;
2606 }
2607 if (*next == '}') { /* got one */
2608 if (!maxpos)
2609 maxpos = next;
830247a4
IZ
2610 RExC_parse++;
2611 min = atoi(RExC_parse);
a0d0e21e
LW
2612 if (*maxpos == ',')
2613 maxpos++;
2614 else
830247a4 2615 maxpos = RExC_parse;
a0d0e21e
LW
2616 max = atoi(maxpos);
2617 if (!max && *maxpos != '0')
c277df42
IZ
2618 max = REG_INFTY; /* meaning "infinity" */
2619 else if (max >= REG_INFTY)
8615cb43 2620 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2621 RExC_parse = next;
2622 nextchar(pRExC_state);
a0d0e21e
LW
2623
2624 do_curly:
2625 if ((flags&SIMPLE)) {
830247a4
IZ
2626 RExC_naughty += 2 + RExC_naughty / 2;
2627 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2628 Set_Node_Offset(ret, parse_start+1); /* MJD */
2629 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2630 }
2631 else {
830247a4 2632 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2633
2634 w->flags = 0;
830247a4
IZ
2635 regtail(pRExC_state, ret, w);
2636 if (!SIZE_ONLY && RExC_extralen) {
2637 reginsert(pRExC_state, LONGJMP,ret);
2638 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2639 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2640 }
830247a4 2641 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2642 /* MJD hk */
2643 Set_Node_Offset(ret, parse_start+1);
2644 Set_Node_Length(ret,
2645 op == '{' ? (RExC_parse - parse_start) : 1);
2646
830247a4 2647 if (!SIZE_ONLY && RExC_extralen)
c277df42 2648 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2649 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2650 if (SIZE_ONLY)
830247a4
IZ
2651 RExC_whilem_seen++, RExC_extralen += 3;
2652 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2653 }
c277df42 2654 ret->flags = 0;
a0d0e21e
LW
2655
2656 if (min > 0)
821b33a5
IZ
2657 *flagp = WORST;
2658 if (max > 0)
2659 *flagp |= HASWIDTH;
a0d0e21e 2660 if (max && max < min)
8615cb43 2661 vFAIL("Can't do {n,m} with n > m");
c277df42
IZ
2662 if (!SIZE_ONLY) {
2663 ARG1_SET(ret, min);
2664 ARG2_SET(ret, max);
a687059c 2665 }
a687059c 2666
a0d0e21e 2667 goto nest_check;
a687059c 2668 }
a0d0e21e 2669 }
a687059c 2670
a0d0e21e
LW
2671 if (!ISMULT1(op)) {
2672 *flagp = flags;
a687059c 2673 return(ret);
a0d0e21e 2674 }
bb20fd44 2675
c277df42 2676#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2677
2678 /* if this is reinstated, don't forget to put this back into perldiag:
2679
2680 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2681
2682 (F) The part of the regexp subject to either the * or + quantifier
2683 could match an empty string. The {#} shows in the regular
2684 expression about where the problem was discovered.
2685
2686 */
2687
bb20fd44 2688 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2689 vFAIL("Regexp *+ operand could be empty");
b81d288d 2690#endif
bb20fd44 2691
fac92740 2692 parse_start = RExC_parse;
830247a4 2693 nextchar(pRExC_state);
a0d0e21e 2694
821b33a5 2695 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2696
2697 if (op == '*' && (flags&SIMPLE)) {
830247a4 2698 reginsert(pRExC_state, STAR, ret);
c277df42 2699 ret->flags = 0;
830247a4 2700 RExC_naughty += 4;
a0d0e21e
LW
2701 }
2702 else if (op == '*') {
2703 min = 0;
2704 goto do_curly;
a0ed51b3
LW
2705 }
2706 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2707 reginsert(pRExC_state, PLUS, ret);
c277df42 2708 ret->flags = 0;
830247a4 2709 RExC_naughty += 3;
a0d0e21e
LW
2710 }
2711 else if (op == '+') {
2712 min = 1;
2713 goto do_curly;
a0ed51b3
LW
2714 }
2715 else if (op == '?') {
a0d0e21e
LW
2716 min = 0; max = 1;
2717 goto do_curly;
2718 }
2719 nest_check:
e476b1b5 2720 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2721 vWARN3(RExC_parse,
b45f050a 2722 "%.*s matches null string many times",
830247a4 2723 RExC_parse - origparse,
b45f050a 2724 origparse);
a0d0e21e
LW
2725 }
2726
830247a4
IZ
2727 if (*RExC_parse == '?') {
2728 nextchar(pRExC_state);
2729 reginsert(pRExC_state, MINMOD, ret);
2730 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2731 }
830247a4
IZ
2732 if (ISMULT2(RExC_parse)) {
2733 RExC_parse++;
b45f050a
JF
2734 vFAIL("Nested quantifiers");
2735 }
a0d0e21e
LW
2736
2737 return(ret);
a687059c
LW
2738}
2739
2740/*
2741 - regatom - the lowest level
2742 *
2743 * Optimization: gobbles an entire sequence of ordinary characters so that
2744 * it can turn them into a single node, which is smaller to store and
2745 * faster to run. Backslashed characters are exceptions, each becoming a
2746 * separate node; the code is simpler that way and it's not worth fixing.
2747 *
b45f050a 2748 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2749STATIC regnode *
830247a4 2750S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2751{
c277df42 2752 register regnode *ret = 0;
a0d0e21e 2753 I32 flags;
f06dbbb7 2754 char *parse_start = 0;
a0d0e21e
LW
2755
2756 *flagp = WORST; /* Tentatively. */
2757
2758tryagain:
830247a4 2759 switch (*RExC_parse) {
a0d0e21e 2760 case '^':
830247a4
IZ
2761 RExC_seen_zerolen++;
2762 nextchar(pRExC_state);
2763 if (RExC_flags16 & PMf_MULTILINE)
2764 ret = reg_node(pRExC_state, MBOL);
2765 else if (RExC_flags16 & PMf_SINGLELINE)
2766 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2767 else
830247a4 2768 ret = reg_node(pRExC_state, BOL);
fac92740 2769 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2770 break;
2771 case '$':
830247a4 2772 nextchar(pRExC_state);
b81d288d 2773 if (*RExC_parse)
830247a4
IZ
2774 RExC_seen_zerolen++;
2775 if (RExC_flags16 & PMf_MULTILINE)
2776 ret = reg_node(pRExC_state, MEOL);
2777 else if (RExC_flags16 & PMf_SINGLELINE)
2778 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2779 else
830247a4 2780 ret = reg_node(pRExC_state, EOL);
fac92740 2781 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2782 break;
2783 case '.':
830247a4 2784 nextchar(pRExC_state);
cce850e4 2785 if (RExC_flags16 & PMf_SINGLELINE)
ffc61ed2
JH
2786 ret = reg_node(pRExC_state, SANY);
2787 else
2788 ret = reg_node(pRExC_state, REG_ANY);
2789 *flagp |= HASWIDTH|SIMPLE;
830247a4 2790 RExC_naughty++;
fac92740 2791 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2792 break;
2793 case '[':
b45f050a 2794 {
830247a4 2795 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2796 ret = regclass(pRExC_state);
830247a4
IZ
2797 if (*RExC_parse != ']') {
2798 RExC_parse = oregcomp_parse;
b45f050a
JF
2799 vFAIL("Unmatched [");
2800 }
830247a4 2801 nextchar(pRExC_state);
a0d0e21e 2802 *flagp |= HASWIDTH|SIMPLE;
fac92740 2803 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2804 break;
b45f050a 2805 }
a0d0e21e 2806 case '(':
830247a4
IZ
2807 nextchar(pRExC_state);
2808 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2809 if (ret == NULL) {
bf93d4cc 2810 if (flags & TRYAGAIN) {
830247a4 2811 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2812 /* Make parent create an empty node if needed. */
2813 *flagp |= TRYAGAIN;
2814 return(NULL);
2815 }
a0d0e21e 2816 goto tryagain;
bf93d4cc 2817 }
a0d0e21e
LW
2818 return(NULL);
2819 }
c277df42 2820 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2821 break;
2822 case '|':
2823 case ')':
2824 if (flags & TRYAGAIN) {
2825 *flagp |= TRYAGAIN;
2826 return NULL;
2827 }
b45f050a 2828 vFAIL("Internal urp");
a0d0e21e
LW
2829 /* Supposed to be caught earlier. */
2830 break;
85afd4ae 2831 case '{':
830247a4
IZ
2832 if (!regcurly(RExC_parse)) {
2833 RExC_parse++;
85afd4ae
CS
2834 goto defchar;
2835 }
2836 /* FALL THROUGH */
a0d0e21e
LW
2837 case '?':
2838 case '+':
2839 case '*':
830247a4 2840 RExC_parse++;
b45f050a 2841 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2842 break;
2843 case '\\':
830247a4 2844 switch (*++RExC_parse) {
a0d0e21e 2845 case 'A':
830247a4
IZ
2846 RExC_seen_zerolen++;
2847 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2848 *flagp |= SIMPLE;
830247a4 2849 nextchar(pRExC_state);
fac92740 2850 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2851 break;
2852 case 'G':
830247a4
IZ
2853 ret = reg_node(pRExC_state, GPOS);
2854 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2855 *flagp |= SIMPLE;
830247a4 2856 nextchar(pRExC_state);
fac92740 2857 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2858 break;
2859 case 'Z':
830247a4 2860 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2861 *flagp |= SIMPLE;
a1917ab9 2862 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2863 nextchar(pRExC_state);
a0d0e21e 2864 break;
b85d18e9 2865 case 'z':
830247a4 2866 ret = reg_node(pRExC_state, EOS);
b85d18e9 2867 *flagp |= SIMPLE;
830247a4
IZ
2868 RExC_seen_zerolen++; /* Do not optimize RE away */
2869 nextchar(pRExC_state);
fac92740 2870 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2871 break;
4a2d328f 2872 case 'C':
f33976b4
DB
2873 ret = reg_node(pRExC_state, CANY);
2874 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2875 *flagp |= HASWIDTH|SIMPLE;
830247a4 2876 nextchar(pRExC_state);
fac92740 2877 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2878 break;
2879 case 'X':
830247a4 2880 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2881 *flagp |= HASWIDTH;
830247a4 2882 nextchar(pRExC_state);
fac92740 2883 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2884 break;
a0d0e21e 2885 case 'w':
ffc61ed2 2886 ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
a0d0e21e 2887 *flagp |= HASWIDTH|SIMPLE;
830247a4 2888 nextchar(pRExC_state);
fac92740 2889 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2890 break;
2891 case 'W':
ffc61ed2 2892 ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
a0d0e21e 2893 *flagp |= HASWIDTH|SIMPLE;
830247a4 2894 nextchar(pRExC_state);
fac92740 2895 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2896 break;
2897 case 'b':
830247a4
IZ
2898 RExC_seen_zerolen++;
2899 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2900 ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
a0d0e21e 2901 *flagp |= SIMPLE;
830247a4 2902 nextchar(pRExC_state);
fac92740 2903 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2904 break;
2905 case 'B':
830247a4
IZ
2906 RExC_seen_zerolen++;
2907 RExC_seen |= REG_SEEN_LOOKBEHIND;
ffc61ed2 2908 ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
a0d0e21e 2909 *flagp |= SIMPLE;
830247a4 2910 nextchar(pRExC_state);
fac92740 2911 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2912 break;
2913 case 's':
ffc61ed2 2914 ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
a0d0e21e 2915 *flagp |= HASWIDTH|SIMPLE;
830247a4 2916 nextchar(pRExC_state);
fac92740 2917 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2918 break;
2919 case 'S':
ffc61ed2 2920 ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
a0d0e21e 2921 *flagp |= HASWIDTH|SIMPLE;
830247a4 2922 nextchar(pRExC_state);
fac92740 2923 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2924 break;
2925 case 'd':
ffc61ed2 2926 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2927 *flagp |= HASWIDTH|SIMPLE;
830247a4 2928 nextchar(pRExC_state);
fac92740 2929 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2930 break;
2931 case 'D':
ffc61ed2 2932 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2933 *flagp |= HASWIDTH|SIMPLE;
830247a4 2934 nextchar(pRExC_state);
fac92740 2935 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2936 break;
a14b48bc
LW
2937 case 'p':
2938 case 'P':
3568d838 2939 {
830247a4 2940 char* oldregxend = RExC_end;
fac92740 2941 char* parse_start = RExC_parse;
a14b48bc 2942
830247a4 2943 if (RExC_parse[1] == '{') {
3568d838 2944 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2945 RExC_end = strchr(RExC_parse, '}');
2946 if (!RExC_end) {
0da60cf5 2947 U8 c = (U8)*RExC_parse;
830247a4
IZ
2948 RExC_parse += 2;
2949 RExC_end = oldregxend;
0da60cf5 2950 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2951 }
830247a4 2952 RExC_end++;
a14b48bc
LW
2953 }
2954 else
830247a4
IZ
2955 RExC_end = RExC_parse + 2;
2956 RExC_parse--;
a14b48bc 2957
ffc61ed2 2958 ret = regclass(pRExC_state);
a14b48bc 2959
830247a4
IZ
2960 RExC_end = oldregxend;
2961 RExC_parse--;
fac92740 2962 Set_Node_Cur_Length(ret); /* MJD */
830247a4 2963 nextchar(pRExC_state);
a14b48bc
LW
2964 *flagp |= HASWIDTH|SIMPLE;
2965 }
2966 break;
a0d0e21e
LW
2967 case 'n':
2968 case 'r':
2969 case 't':
2970 case 'f':
2971 case 'e':
2972 case 'a':
2973 case 'x':
2974 case 'c':
2975 case '0':
2976 goto defchar;
2977 case '1': case '2': case '3': case '4':
2978 case '5': case '6': case '7': case '8': case '9':
2979 {
830247a4 2980 I32 num = atoi(RExC_parse);
a0d0e21e 2981
830247a4 2982 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
2983 goto defchar;
2984 else {
fac92740 2985 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
2986 while (isDIGIT(*RExC_parse))
2987 RExC_parse++;
b45f050a 2988
830247a4 2989 if (!SIZE_ONLY && num > RExC_rx->nparens)
9baa0206 2990 vFAIL("Reference to nonexistent group");
830247a4
IZ
2991 RExC_sawback = 1;
2992 ret = reganode(pRExC_state, FOLD
a0ed51b3 2993 ? (LOC ? REFFL : REFF)
c8756f30 2994 : REF, num);
a0d0e21e 2995 *flagp |= HASWIDTH;
fac92740
MJD
2996
2997 /* override incorrect value set in reganode MJD */
2998 Set_Node_Offset(ret, parse_start+1);
2999 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3000 RExC_parse--;
3001 nextchar(pRExC_state);
a0d0e21e
LW
3002 }
3003 }
3004 break;
3005 case '\0':
830247a4 3006 if (RExC_parse >= RExC_end)
b45f050a 3007 FAIL("Trailing \\");
a0d0e21e
LW
3008 /* FALL THROUGH */
3009 default:
c9f97d15
IZ
3010 /* Do not generate `unrecognized' warnings here, we fall
3011 back into the quick-grab loop below */
a0d0e21e
LW
3012 goto defchar;
3013 }
3014 break;
4633a7c4
LW
3015
3016 case '#':
830247a4
IZ
3017 if (RExC_flags16 & PMf_EXTENDED) {
3018 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3019 if (RExC_parse < RExC_end)
4633a7c4
LW
3020 goto tryagain;
3021 }
3022 /* FALL THROUGH */
3023
a0d0e21e 3024 default: {
ba210ebe 3025 register STRLEN len;
a0ed51b3 3026 register UV ender;
a0d0e21e 3027 register char *p;
c277df42 3028 char *oldp, *s;
ba210ebe 3029 STRLEN numlen;
80aecb99 3030 STRLEN foldlen;
60a8b682 3031 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3032
3033 parse_start = RExC_parse - 1;
a0d0e21e 3034
830247a4 3035 RExC_parse++;
a0d0e21e
LW
3036
3037 defchar:
830247a4 3038 ret = reg_node(pRExC_state, FOLD
a0ed51b3 3039 ? (LOC ? EXACTFL : EXACTF)
bbce6d69 3040 : EXACT);
cd439c50 3041 s = STRING(ret);
830247a4
IZ
3042 for (len = 0, p = RExC_parse - 1;
3043 len < 127 && p < RExC_end;
a0d0e21e
LW
3044 len++)
3045 {
3046 oldp = p;
5b5a24f7 3047
830247a4
IZ
3048 if (RExC_flags16 & PMf_EXTENDED)
3049 p = regwhite(p, RExC_end);
a0d0e21e
LW
3050 switch (*p) {
3051 case '^':
3052 case '$':
3053 case '.':
3054 case '[':
3055 case '(':
3056 case ')':
3057 case '|':
3058 goto loopdone;
3059 case '\\':
3060 switch (*++p) {
3061 case 'A':
1ed8eac0
JF
3062 case 'C':
3063 case 'X':
a0d0e21e
LW
3064 case 'G':
3065 case 'Z':
b85d18e9 3066 case 'z':
a0d0e21e
LW
3067 case 'w':
3068 case 'W':
3069 case 'b':
3070 case 'B':
3071 case 's':
3072 case 'S':
3073 case 'd':
3074 case 'D':
a14b48bc
LW
3075 case 'p':
3076 case 'P':
a0d0e21e
LW
3077 --p;
3078 goto loopdone;
3079 case 'n':
3080 ender = '\n';
3081 p++;
a687059c 3082 break;
a0d0e21e
LW
3083 case 'r':
3084 ender = '\r';
3085 p++;
a687059c 3086 break;
a0d0e21e
LW
3087 case 't':
3088 ender = '\t';
3089 p++;
a687059c 3090 break;
a0d0e21e
LW
3091 case 'f':
3092 ender = '\f';
3093 p++;
a687059c 3094 break;
a0d0e21e 3095 case 'e':
c7f1f016 3096 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3097 p++;
a687059c 3098 break;
a0d0e21e 3099 case 'a':
c7f1f016 3100 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3101 p++;
a687059c 3102 break;
a0d0e21e 3103 case 'x':
a0ed51b3
LW
3104 if (*++p == '{') {
3105 char* e = strchr(p, '}');
b81d288d 3106
b45f050a 3107 if (!e) {
830247a4 3108 RExC_parse = p + 1;
b45f050a
JF
3109 vFAIL("Missing right brace on \\x{}");
3110 }
de5f0749 3111 else {
a4c04bdc
NC
3112 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3113 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3114 numlen = e - p - 1;
3115 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3116 if (ender > 0xff)
3117 RExC_utf8 = 1;
b21ed0a9
GS
3118 /* numlen is generous */
3119 if (numlen + len >= 127) {
a0ed51b3
LW
3120 p--;
3121 goto loopdone;
3122 }
3123 p = e + 1;
3124 }
a0ed51b3
LW
3125 }
3126 else {
a4c04bdc 3127 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3128 numlen = 2;
3129 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3130 p += numlen;
3131 }
a687059c 3132 break;
a0d0e21e
LW
3133 case 'c':
3134 p++;
bbce6d69 3135 ender = UCHARAT(p++);
3136 ender = toCTRL(ender);
a687059c 3137 break;
a0d0e21e
LW
3138 case '0': case '1': case '2': case '3':case '4':
3139 case '5': case '6': case '7': case '8':case '9':
3140 if (*p == '0' ||
830247a4 3141 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3142 I32 flags = 0;
3143 numlen = 3;
3144 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3145 p += numlen;
3146 }
3147 else {
3148 --p;
3149 goto loopdone;
a687059c
LW
3150 }
3151 break;
a0d0e21e 3152 case '\0':
830247a4 3153 if (p >= RExC_end)
b45f050a 3154 FAIL("Trailing \\");
a687059c 3155 /* FALL THROUGH */
a0d0e21e 3156 default:
e476b1b5 3157 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3158 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3159 goto normal_default;
a0d0e21e
LW
3160 }
3161 break;
a687059c 3162 default:
a0ed51b3 3163 normal_default:
fd400ab9 3164 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3165 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3166 &numlen, 0);
a0ed51b3
LW
3167 p += numlen;
3168 }
3169 else
3170 ender = *p++;
a0d0e21e 3171 break;
a687059c 3172 }
830247a4
IZ
3173 if (RExC_flags16 & PMf_EXTENDED)
3174 p = regwhite(p, RExC_end);
60a8b682
JH
3175 if (UTF && FOLD) {
3176 /* Prime the casefolded buffer. */
ac7e0132 3177 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3178 }
a0d0e21e
LW
3179 if (ISMULT2(p)) { /* Back off on ?+*. */
3180 if (len)
3181 p = oldp;
16ea2a2e 3182 else if (UTF) {
80aecb99 3183 if (FOLD) {
60a8b682 3184 /* Emit all the Unicode characters. */
80aecb99
JH
3185 for (foldbuf = tmpbuf;
3186 foldlen;
3187 foldlen -= numlen) {
3188 ender = utf8_to_uvchr(foldbuf, &numlen);
3189 reguni(pRExC_state, ender, s, &numlen);
3190 s += numlen;
3191 len += numlen;
3192 foldbuf += numlen;
3193 }
3194 }
3195 else {
3196 reguni(pRExC_state, ender, s, &numlen);
3197 s += numlen;
3198 len += numlen;
3199 }
a0ed51b3 3200 }
a0d0e21e
LW
3201 else {
3202 len++;
cd439c50 3203 REGC(ender, s++);
a0d0e21e
LW
3204 }
3205 break;
a687059c 3206 }
16ea2a2e 3207 if (UTF) {
80aecb99 3208 if (FOLD) {
60a8b682 3209 /* Emit all the Unicode characters. */
80aecb99
JH
3210 for (foldbuf = tmpbuf;
3211 foldlen;
3212 foldlen -= numlen) {
3213 ender = utf8_to_uvchr(foldbuf, &numlen);
3214 reguni(pRExC_state, ender, s, &numlen);
3215 s += numlen;
3216 len += numlen;
3217 foldbuf += numlen;
3218 }
3219 }
3220 else {
3221 reguni(pRExC_state, ender, s, &numlen);
3222 s += numlen;
3223 len += numlen;
3224 }
3225 len--;
a0ed51b3
LW
3226 }
3227 else
cd439c50 3228 REGC(ender, s++);
a0d0e21e
LW
3229 }
3230 loopdone:
830247a4 3231 RExC_parse = p - 1;
fac92740 3232 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3233 nextchar(pRExC_state);
793db0cb
JH
3234 {
3235 /* len is STRLEN which is unsigned, need to copy to signed */
3236 IV iv = len;
3237 if (iv < 0)
3238 vFAIL("Internal disaster");
3239 }
a0d0e21e
LW
3240 if (len > 0)
3241 *flagp |= HASWIDTH;
3242 if (len == 1)
3243 *flagp |= SIMPLE;
c277df42 3244 if (!SIZE_ONLY)
cd439c50
IZ
3245 STR_LEN(ret) = len;
3246 if (SIZE_ONLY)
830247a4 3247 RExC_size += STR_SZ(len);
cd439c50 3248 else
830247a4 3249 RExC_emit += STR_SZ(len);
a687059c 3250 }
a0d0e21e
LW
3251 break;
3252 }
a687059c 3253
60a8b682
JH
3254 /* If the encoding pragma is in effect recode the text of
3255 * any EXACT-kind nodes. */
22c54be3 3256 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
a72c7584
JH
3257 STRLEN oldlen = STR_LEN(ret);
3258 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
22c54be3
JH
3259
3260 if (RExC_utf8)
3261 SvUTF8_on(sv);
3262 if (sv_utf8_downgrade(sv, TRUE)) {
3263 char *s = Perl_sv_recode_to_utf8(aTHX_ sv, PL_encoding);
3264 STRLEN newlen = SvCUR(sv);
3265
3266 if (!SIZE_ONLY) {
3267 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3268 (int)oldlen, STRING(ret),
3269 (int)newlen, s));
3270 Copy(s, STRING(ret), newlen, char);
3271 STR_LEN(ret) += newlen - oldlen;
3272 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3273 } else
3274 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3275 }
a72c7584
JH
3276 }
3277
a0d0e21e 3278 return(ret);
a687059c
LW
3279}
3280
873ef191 3281STATIC char *
cea2e8a9 3282S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3283{
3284 while (p < e) {
3285 if (isSPACE(*p))
3286 ++p;
3287 else if (*p == '#') {
3288 do {
3289 p++;
3290 } while (p < e && *p != '\n');
3291 }
3292 else
3293 break;
3294 }
3295 return p;
3296}
3297
b8c5462f
JH
3298/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3299 Character classes ([:foo:]) can also be negated ([:^foo:]).
3300 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3301 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3302 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3303
3304#define POSIXCC_DONE(c) ((c) == ':')
3305#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3306#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3307
b8c5462f 3308STATIC I32
830247a4 3309S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3310{
3311 char *posixcc = 0;
936ed897 3312 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3313
830247a4 3314 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3315 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3316 POSIXCC(UCHARAT(RExC_parse))) {
3317 char c = UCHARAT(RExC_parse);
830247a4 3318 char* s = RExC_parse++;
b81d288d 3319
9a86a77b 3320 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3321 RExC_parse++;
3322 if (RExC_parse == RExC_end)
620e46c5 3323 /* Grandfather lone [:, [=, [. */
830247a4 3324 RExC_parse = s;
620e46c5 3325 else {
830247a4 3326 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3327
9a86a77b 3328 if (UCHARAT(RExC_parse) == ']') {
830247a4 3329 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3330 posixcc = s + 1;
3331 if (*s == ':') {
3332 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3333 I32 skip = 5; /* the most common skip */
3334
3335 switch (*posixcc) {
3336 case 'a':
3337 if (strnEQ(posixcc, "alnum", 5))
3338 namedclass =
3339 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3340 else if (strnEQ(posixcc, "alpha", 5))
3341 namedclass =
3342 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3343 else if (strnEQ(posixcc, "ascii", 5))
3344 namedclass =
3345 complement ? ANYOF_NASCII : ANYOF_ASCII;
3346 break;
aaa51d5e
JF
3347 case 'b':
3348 if (strnEQ(posixcc, "blank", 5))
3349 namedclass =
3350 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3351 break;
b8c5462f
JH
3352 case 'c':
3353 if (strnEQ(posixcc, "cntrl", 5))
3354 namedclass =
3355 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3356 break;
3357 case 'd':
3358 if (strnEQ(posixcc, "digit", 5))
3359 namedclass =
3360 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3361 break;
3362 case 'g':
3363 if (strnEQ(posixcc, "graph", 5))
3364 namedclass =
3365 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3366 break;
3367 case 'l':
3368 if (strnEQ(posixcc, "lower", 5))
3369 namedclass =
3370 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3371 break;
3372 case 'p':
3373 if (strnEQ(posixcc, "print", 5))
3374 namedclass =
3375 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3376 else if (strnEQ(posixcc, "punct", 5))
3377 namedclass =
3378 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3379 break;
3380 case 's':
3381 if (strnEQ(posixcc, "space", 5))
3382 namedclass =
aaa51d5e 3383 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3384 break;
b8c5462f
JH
3385 case 'u':
3386 if (strnEQ(posixcc, "upper", 5))
3387 namedclass =
3388 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3389 break;
3390 case 'w': /* this is not POSIX, this is the Perl \w */
3391 if (strnEQ(posixcc, "word", 4)) {
3392 namedclass =
3393 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3394 skip = 4;
3395 }
3396 break;
3397 case 'x':
3398 if (strnEQ(posixcc, "xdigit", 6)) {
3399 namedclass =
3400 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3401 skip = 6;
3402 }
3403 break;
3404 }
ac561586
JH
3405 if (namedclass == OOB_NAMEDCLASS ||
3406 posixcc[skip] != ':' ||
3407 posixcc[skip+1] != ']')
b45f050a
JF
3408 {
3409 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3410 t - s - 1, s + 1);
3411 }
3412 } else if (!SIZE_ONLY) {
b8c5462f 3413 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3414
830247a4 3415 /* adjust RExC_parse so the warning shows after
b45f050a 3416 the class closes */
9a86a77b 3417 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3418 RExC_parse++;
b45f050a
JF
3419 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3420 }
b8c5462f
JH
3421 } else {
3422 /* Maternal grandfather:
3423 * "[:" ending in ":" but not in ":]" */
830247a4 3424 RExC_parse = s;
767d463e 3425 }
620e46c5
JH
3426 }
3427 }
3428
b8c5462f
JH
3429 return namedclass;
3430}
3431
3432STATIC void
830247a4 3433S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3434{
e476b1b5 3435 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
9a86a77b 3436 POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3437 char *s = RExC_parse;
93733859 3438 char c = *s++;
b8c5462f
JH
3439
3440 while(*s && isALNUM(*s))
3441 s++;
3442 if (*s && c == *s && s[1] == ']') {
b45f050a
JF
3443 vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
3444
3445 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3446 if (POSIXCC_NOTYET(c)) {
830247a4 3447 /* adjust RExC_parse so the error shows after
b45f050a 3448 the class closes */
9a86a77b 3449 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3450 ;
3451 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3452 }
b8c5462f
JH
3453 }
3454 }
620e46c5
JH
3455}
3456
76e3520e 3457STATIC regnode *
830247a4 3458S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3459{
ffc61ed2 3460 register UV value;
9a86a77b 3461 register UV nextvalue;
3568d838 3462 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3463 register IV range = 0;
c277df42 3464 register regnode *ret;
ba210ebe 3465 STRLEN numlen;
ffc61ed2 3466 IV namedclass;
9c5ffd7c 3467 char *rangebegin = 0;
936ed897 3468 bool need_class = 0;
9c5ffd7c 3469 SV *listsv = Nullsv;
ffc61ed2
JH
3470 register char *e;
3471 UV n;
9e55ce06
JH
3472 bool optimize_invert = TRUE;
3473 AV* unicode_alternate = 0;
ffc61ed2
JH
3474
3475 ret = reganode(pRExC_state, ANYOF, 0);
3476
3477 if (!SIZE_ONLY)
3478 ANYOF_FLAGS(ret) = 0;
3479
9a86a77b 3480 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3481 RExC_naughty++;
3482 RExC_parse++;
3483 if (!SIZE_ONLY)
3484 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3485 }
a0d0e21e 3486
936ed897 3487 if (SIZE_ONLY)
830247a4 3488 RExC_size += ANYOF_SKIP;
936ed897 3489 else {
830247a4 3490 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3491 if (FOLD)
3492 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3493 if (LOC)
3494 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3495 ANYOF_BITMAP_ZERO(ret);
3496 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3497 }
b8c5462f 3498
9a86a77b
JH
3499 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3500
3501 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && POSIXCC(nextvalue))
830247a4 3502 checkposixcc(pRExC_state);
b8c5462f 3503
f064b6ad
HS
3504 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3505 if (UCHARAT(RExC_parse) == ']')
3506 goto charclassloop;
ffc61ed2 3507
9a86a77b 3508 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3509
3510 charclassloop:
3511
3512 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3513
73b437c8 3514 if (!range)
830247a4 3515 rangebegin = RExC_parse;
ffc61ed2 3516 if (UTF) {
5e12f4fb 3517 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3518 RExC_end - RExC_parse,
3519 &numlen, 0);
ffc61ed2
JH
3520 RExC_parse += numlen;
3521 }
3522 else
3523 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3524 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3525 if (value == '[' && POSIXCC(nextvalue))
830247a4 3526 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3527 else if (value == '\\') {
ffc61ed2 3528 if (UTF) {
5e12f4fb 3529 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3530 RExC_end - RExC_parse,
3531 &numlen, 0);
3532 RExC_parse += numlen;
3533 }
3534 else
3535 value = UCHARAT(RExC_parse++);
470c3474 3536 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3537 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3538 * be a problem later if we want switch on Unicode.
3539 * A similar issue a little bit later when switching on
3540 * namedclass. --jhi */
ffc61ed2 3541 switch ((I32)value) {
b8c5462f
JH
3542 case 'w': namedclass = ANYOF_ALNUM; break;
3543 case 'W': namedclass = ANYOF_NALNUM; break;
3544 case 's': namedclass = ANYOF_SPACE; break;
3545 case 'S': namedclass = ANYOF_NSPACE; break;
3546 case 'd': namedclass = ANYOF_DIGIT; break;
3547 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3548 case 'p':
3549 case 'P':
3550 if (*RExC_parse == '{') {
0da60cf5 3551 U8 c = (U8)value;
ffc61ed2
JH
3552 e = strchr(RExC_parse++, '}');
3553 if (!e)
0da60cf5 3554 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3555 while (isSPACE(UCHARAT(RExC_parse)))
3556 RExC_parse++;
3557 if (e == RExC_parse)
0da60cf5 3558 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3559 n = e - RExC_parse;
ab13f0c7
JH
3560 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3561 n--;
ffc61ed2
JH
3562 }
3563 else {
3564 e = RExC_parse;
3565 n = 1;
3566 }
3567 if (!SIZE_ONLY) {
ab13f0c7
JH
3568 if (UCHARAT(RExC_parse) == '^') {
3569 RExC_parse++;
3570 n--;
3571 value = value == 'p' ? 'P' : 'p'; /* toggle */
3572 while (isSPACE(UCHARAT(RExC_parse))) {
3573 RExC_parse++;
3574 n--;
3575 }
3576 }
ffc61ed2 3577 if (value == 'p')
ab13f0c7
JH
3578 Perl_sv_catpvf(aTHX_ listsv,
3579 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3580 else
ab13f0c7
JH
3581 Perl_sv_catpvf(aTHX_ listsv,
3582 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3583 }
3584 RExC_parse = e + 1;
3585 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3586 continue;
b8c5462f
JH
3587 case 'n': value = '\n'; break;
3588 case 'r': value = '\r'; break;
3589 case 't': value = '\t'; break;
3590 case 'f': value = '\f'; break;
3591 case 'b': value = '\b'; break;
c7f1f016
NIS
3592 case 'e': value = ASCII_TO_NATIVE('\033');break;
3593 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3594 case 'x':
ffc61ed2 3595 if (*RExC_parse == '{') {
a4c04bdc
NC
3596 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3597 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3598 e = strchr(RExC_parse++, '}');
b81d288d 3599 if (!e)
ffc61ed2 3600 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3601
3602 numlen = e - RExC_parse;
3603 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3604 RExC_parse = e + 1;
3605 }
3606 else {
a4c04bdc 3607 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3608 numlen = 2;
3609 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3610 RExC_parse += numlen;
3611 }
b8c5462f
JH
3612 break;
3613 case 'c':
830247a4 3614 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3615 value = toCTRL(value);
3616 break;
3617 case '0': case '1': case '2': case '3': case '4':
3618 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3619 {
3620 I32 flags = 0;
3621 numlen = 3;
3622 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3623 RExC_parse += numlen;
b8c5462f 3624 break;
53305cf1 3625 }
1028017a 3626 default:
e476b1b5 3627 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3628 vWARN2(RExC_parse,
3629 "Unrecognized escape \\%c in character class passed through",
3630 (int)value);
1028017a 3631 break;
b8c5462f 3632 }
ffc61ed2
JH
3633 } /* end of \blah */
3634
3635 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3636
3637 if (!SIZE_ONLY && !need_class)
936ed897 3638 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3639
936ed897 3640 need_class = 1;
ffc61ed2
JH
3641
3642 /* a bad range like a-\d, a-[:digit:] ? */
3643 if (range) {
73b437c8 3644 if (!SIZE_ONLY) {
e476b1b5 3645 if (ckWARN(WARN_REGEXP))
830247a4 3646 vWARN4(RExC_parse,
b45f050a 3647 "False [] range \"%*.*s\"",
830247a4
IZ
3648 RExC_parse - rangebegin,
3649 RExC_parse - rangebegin,
b45f050a 3650 rangebegin);
3568d838
JH
3651 if (prevvalue < 256) {
3652 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3653 ANYOF_BITMAP_SET(ret, '-');
3654 }
3655 else {
3656 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3657 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3658 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3659 }
b8c5462f 3660 }
ffc61ed2
JH
3661
3662 range = 0; /* this was not a true range */
73b437c8 3663 }
ffc61ed2 3664
73b437c8 3665 if (!SIZE_ONLY) {
3568d838
JH
3666 if (namedclass > OOB_NAMEDCLASS)
3667 optimize_invert = FALSE;
e2962f66
JH
3668 /* Possible truncation here but in some 64-bit environments
3669 * the compiler gets heartburn about switch on 64-bit values.
3670 * A similar issue a little earlier when switching on value.
98f323fa 3671 * --jhi */
e2962f66 3672 switch ((I32)namedclass) {
73b437c8
JH
3673 case ANYOF_ALNUM:
3674 if (LOC)
936ed897 3675 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3676 else {
3677 for (value = 0; value < 256; value++)
3678 if (isALNUM(value))
936ed897 3679 ANYOF_BITMAP_SET(ret, value);
73b437c8 3680 }
ffc61ed2 3681 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3682 break;
3683 case ANYOF_NALNUM:
3684 if (LOC)
936ed897 3685 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3686 else {
3687 for (value = 0; value < 256; value++)
3688 if (!isALNUM(value))
936ed897 3689 ANYOF_BITMAP_SET(ret, value);
73b437c8 3690 }
ffc61ed2 3691 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3692 break;
ffc61ed2 3693 case ANYOF_ALNUMC:
73b437c8 3694 if (LOC)
ffc61ed2 3695 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3696 else {
3697 for (value = 0; value < 256; value++)
ffc61ed2 3698 if (isALNUMC(value))
936ed897 3699 ANYOF_BITMAP_SET(ret, value);
73b437c8 3700 }
ffc61ed2 3701 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3702 break;
3703 case ANYOF_NALNUMC:
3704 if (LOC)
936ed897 3705 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3706 else {
3707 for (value = 0; value < 256; value++)
3708 if (!isALNUMC(value))
936ed897 3709 ANYOF_BITMAP_SET(ret, value);
73b437c8 3710 }
ffc61ed2 3711 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3712 break;
3713 case ANYOF_ALPHA:
3714 if (LOC)
936ed897 3715 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3716 else {
3717 for (value = 0; value < 256; value++)
3718 if (isALPHA(value))
936ed897 3719 ANYOF_BITMAP_SET(ret, value);
73b437c8 3720 }
ffc61ed2 3721 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3722 break;
3723 case ANYOF_NALPHA:
3724 if (LOC)
936ed897 3725 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3726 else {
3727 for (value = 0; value < 256; value++)
3728 if (!isALPHA(value))
936ed897 3729 ANYOF_BITMAP_SET(ret, value);
73b437c8 3730 }
ffc61ed2 3731 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3732 break;
3733 case ANYOF_ASCII:
3734 if (LOC)
936ed897 3735 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3736 else {
c7f1f016 3737#ifndef EBCDIC
1ba5c669
JH
3738 for (value = 0; value < 128; value++)
3739 ANYOF_BITMAP_SET(ret, value);
3740#else /* EBCDIC */
ffbc6a93 3741 for (value = 0; value < 256; value++) {
3a3c4447
JH
3742 if (isASCII(value))
3743 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3744 }
1ba5c669 3745#endif /* EBCDIC */
73b437c8 3746 }
ffc61ed2 3747 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3748 break;
3749 case ANYOF_NASCII:
3750 if (LOC)
936ed897 3751 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3752 else {
c7f1f016 3753#ifndef EBCDIC
1ba5c669
JH
3754 for (value = 128; value < 256; value++)
3755 ANYOF_BITMAP_SET(ret, value);
3756#else /* EBCDIC */
ffbc6a93 3757 for (value = 0; value < 256; value++) {
3a3c4447
JH
3758 if (!isASCII(value))
3759 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3760 }
1ba5c669 3761#endif /* EBCDIC */
73b437c8 3762 }
ffc61ed2 3763 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3764 break;
aaa51d5e
JF
3765 case ANYOF_BLANK:
3766 if (LOC)
3767 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3768 else {
3769 for (value = 0; value < 256; value++)
3770 if (isBLANK(value))
3771 ANYOF_BITMAP_SET(ret, value);
3772 }
ffc61ed2 3773 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3774 break;
3775 case ANYOF_NBLANK:
3776 if (LOC)
3777 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3778 else {
3779 for (value = 0; value < 256; value++)
3780 if (!isBLANK(value))
3781 ANYOF_BITMAP_SET(ret, value);
3782 }
ffc61ed2 3783 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3784 break;
73b437c8
JH
3785 case ANYOF_CNTRL:
3786 if (LOC)
936ed897 3787 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3788 else {
3789 for (value = 0; value < 256; value++)
3790 if (isCNTRL(value))
936ed897 3791 ANYOF_BITMAP_SET(ret, value);
73b437c8 3792 }
ffc61ed2 3793 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3794 break;
3795 case ANYOF_NCNTRL:
3796 if (LOC)
936ed897 3797 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3798 else {
3799 for (value = 0; value < 256; value++)
3800 if (!isCNTRL(value))
936ed897 3801 ANYOF_BITMAP_SET(ret, value);
73b437c8 3802 }
ffc61ed2
JH
3803 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3804 break;
3805 case ANYOF_DIGIT:
3806 if (LOC)
3807 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3808 else {
3809 /* consecutive digits assumed */
3810 for (value = '0'; value <= '9'; value++)
3811 ANYOF_BITMAP_SET(ret, value);
3812 }
3813 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3814 break;
3815 case ANYOF_NDIGIT:
3816 if (LOC)
3817 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3818 else {
3819 /* consecutive digits assumed */
3820 for (value = 0; value < '0'; value++)
3821 ANYOF_BITMAP_SET(ret, value);
3822 for (value = '9' + 1; value < 256; value++)
3823 ANYOF_BITMAP_SET(ret, value);
3824 }
3825 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3826 break;
3827 case ANYOF_GRAPH:
3828 if (LOC)
936ed897 3829 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3830 else {
3831 for (value = 0; value < 256; value++)
3832 if (isGRAPH(value))
936ed897 3833 ANYOF_BITMAP_SET(ret, value);
73b437c8 3834 }
ffc61ed2 3835 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3836 break;
3837 case ANYOF_NGRAPH:
3838 if (LOC)
936ed897 3839 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3840 else {
3841 for (value = 0; value < 256; value++)
3842 if (!isGRAPH(value))
936ed897 3843 ANYOF_BITMAP_SET(ret, value);
73b437c8 3844 }
ffc61ed2 3845 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3846 break;
3847 case ANYOF_LOWER:
3848 if (LOC)
936ed897 3849 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3850 else {
3851 for (value = 0; value < 256; value++)
3852 if (isLOWER(value))
936ed897 3853 ANYOF_BITMAP_SET(ret, value);
73b437c8 3854 }
ffc61ed2 3855 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3856 break;
3857 case ANYOF_NLOWER:
3858 if (LOC)
936ed897 3859 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3860 else {
3861 for (value = 0; value < 256; value++)
3862 if (!isLOWER(value))
936ed897 3863 ANYOF_BITMAP_SET(ret, value);
73b437c8 3864 }
ffc61ed2 3865 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3866 break;
3867 case ANYOF_PRINT:
3868 if (LOC)
936ed897 3869 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3870 else {
3871 for (value = 0; value < 256; value++)
3872 if (isPRINT(value))
936ed897 3873 ANYOF_BITMAP_SET(ret, value);
73b437c8 3874 }
ffc61ed2 3875 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3876 break;
3877 case ANYOF_NPRINT:
3878 if (LOC)
936ed897 3879 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3880 else {
3881 for (value = 0; value < 256; value++)
3882 if (!isPRINT(value))
936ed897 3883 ANYOF_BITMAP_SET(ret, value);
73b437c8 3884 }
ffc61ed2 3885 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3886 break;
aaa51d5e
JF
3887 case ANYOF_PSXSPC:
3888 if (LOC)
3889 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3890 else {
3891 for (value = 0; value < 256; value++)
3892 if (isPSXSPC(value))
3893 ANYOF_BITMAP_SET(ret, value);
3894 }
ffc61ed2 3895 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3896 break;
3897 case ANYOF_NPSXSPC:
3898 if (LOC)
3899 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3900 else {
3901 for (value = 0; value < 256; value++)
3902 if (!isPSXSPC(value))
3903 ANYOF_BITMAP_SET(ret, value);
3904 }
ffc61ed2 3905 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3906 break;
73b437c8
JH
3907 case ANYOF_PUNCT:
3908 if (LOC)
936ed897 3909 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3910 else {
3911 for (value = 0; value < 256; value++)
3912 if (isPUNCT(value))
936ed897 3913 ANYOF_BITMAP_SET(ret, value);
73b437c8 3914 }
ffc61ed2 3915 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3916 break;
3917 case ANYOF_NPUNCT:
3918 if (LOC)
936ed897 3919 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
3920 else {
3921 for (value = 0; value < 256; value++)
3922 if (!isPUNCT(value))
936ed897 3923 ANYOF_BITMAP_SET(ret, value);
73b437c8 3924 }
ffc61ed2
JH
3925 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
3926 break;
3927 case ANYOF_SPACE:
3928 if (LOC)
3929 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
3930 else {
3931 for (value = 0; value < 256; value++)
3932 if (isSPACE(value))
3933 ANYOF_BITMAP_SET(ret, value);
3934 }
3935 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
3936 break;
3937 case ANYOF_NSPACE:
3938 if (LOC)
3939 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
3940 else {
3941 for (value = 0; value < 256; value++)
3942 if (!isSPACE(value))
3943 ANYOF_BITMAP_SET(ret, value);
3944 }
3945 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
3946 break;
3947 case ANYOF_UPPER:
3948 if (LOC)
936ed897 3949 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
3950 else {
3951 for (value = 0; value < 256; value++)
3952 if (isUPPER(value))
936ed897 3953 ANYOF_BITMAP_SET(ret, value);
73b437c8 3954 }
ffc61ed2 3955 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
3956 break;
3957 case ANYOF_NUPPER:
3958 if (LOC)
936ed897 3959 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
3960 else {
3961 for (value = 0; value < 256; value++)
3962 if (!isUPPER(value))
936ed897 3963 ANYOF_BITMAP_SET(ret, value);
73b437c8 3964 }
ffc61ed2 3965 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
3966 break;
3967 case ANYOF_XDIGIT:
3968 if (LOC)
936ed897 3969 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
3970 else {
3971 for (value = 0; value < 256; value++)
3972 if (isXDIGIT(value))
936ed897 3973 ANYOF_BITMAP_SET(ret, value);
73b437c8 3974 }
ffc61ed2 3975 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
3976 break;
3977 case ANYOF_NXDIGIT:
3978 if (LOC)
936ed897 3979 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
3980 else {
3981 for (value = 0; value < 256; value++)
3982 if (!isXDIGIT(value))
936ed897 3983 ANYOF_BITMAP_SET(ret, value);
73b437c8 3984 }
ffc61ed2 3985 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
3986 break;
3987 default:
b45f050a 3988 vFAIL("Invalid [::] class");
73b437c8 3989 break;
b8c5462f 3990 }
b8c5462f 3991 if (LOC)
936ed897 3992 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 3993 continue;
a0d0e21e 3994 }
ffc61ed2
JH
3995 } /* end of namedclass \blah */
3996
a0d0e21e 3997 if (range) {
3a3c4447 3998 if (prevvalue > value) /* b-a */ {
b45f050a 3999 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4000 RExC_parse - rangebegin,
4001 RExC_parse - rangebegin,
b45f050a 4002 rangebegin);
3568d838 4003 range = 0; /* not a valid range */
73b437c8 4004 }
a0d0e21e
LW
4005 }
4006 else {
3568d838 4007 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4008 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4009 RExC_parse[1] != ']') {
4010 RExC_parse++;
ffc61ed2
JH
4011
4012 /* a bad range like \w-, [:word:]- ? */
4013 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4014 if (ckWARN(WARN_REGEXP))
830247a4 4015 vWARN4(RExC_parse,
b45f050a 4016 "False [] range \"%*.*s\"",
830247a4
IZ
4017 RExC_parse - rangebegin,
4018 RExC_parse - rangebegin,
b45f050a 4019 rangebegin);
73b437c8 4020 if (!SIZE_ONLY)
936ed897 4021 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4022 } else
ffc61ed2
JH
4023 range = 1; /* yeah, it's a range! */
4024 continue; /* but do it the next time */
a0d0e21e 4025 }
a687059c 4026 }
ffc61ed2 4027
93733859 4028 /* now is the next time */
ae5c130c 4029 if (!SIZE_ONLY) {
3568d838
JH
4030 IV i;
4031
4032 if (prevvalue < 256) {
4033 IV ceilvalue = value < 256 ? value : 255;
4034
4035#ifdef EBCDIC
3a3c4447
JH
4036 if ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4037 (isUPPER(prevvalue) && isUPPER(ceilvalue)))
ffc61ed2 4038 {
3568d838
JH
4039 if (isLOWER(prevvalue)) {
4040 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4041 if (isLOWER(i))
4042 ANYOF_BITMAP_SET(ret, i);
4043 } else {
3568d838 4044 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4045 if (isUPPER(i))
4046 ANYOF_BITMAP_SET(ret, i);
4047 }
8ada0baa 4048 }
ffc61ed2 4049 else
8ada0baa 4050#endif
a5961de5
JH
4051 for (i = prevvalue; i <= ceilvalue; i++)
4052 ANYOF_BITMAP_SET(ret, i);
3568d838 4053 }
a5961de5 4054 if (value > 255 || UTF) {
ffc61ed2 4055 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3568d838 4056 if (prevvalue < value)
ffc61ed2 4057 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
3568d838 4058 (UV)prevvalue, (UV)value);
09091399 4059 else if (prevvalue == value) {
ffc61ed2
JH
4060 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4061 (UV)value);
09091399 4062 if (FOLD) {
254ba52a
JH
4063 U8 tmpbuf [UTF8_MAXLEN+1];
4064 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4065 STRLEN foldlen;
4066 UV f;
4067
4068 uvchr_to_utf8(tmpbuf, value);
2ddfca77
JH
4069 to_utf8_fold(tmpbuf, foldbuf, &foldlen);
4070 f = utf8_to_uvchr(foldbuf, 0);
254ba52a 4071
c840d2a2
JH
4072 /* If folding and foldable and a single
4073 * character, insert also the folded version
4074 * to the charclass. */
9e55ce06
JH
4075 if (f != value) {
4076 if (foldlen == UNISKIP(f))
4077 Perl_sv_catpvf(aTHX_ listsv,
4078 "%04"UVxf"\n", f);
4079 else {
4080 /* Any multicharacter foldings
4081 * require the following transform:
4082 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4083 * where E folds into "pq" and F folds
4084 * into "rst", all other characters
4085 * fold to single characters. We save
4086 * away these multicharacter foldings,
4087 * to be later saved as part of the
4088 * additional "s" data. */
4089 SV *sv;
4090
4091 if (!unicode_alternate)
4092 unicode_alternate = newAV();
4093 sv = newSVpvn((char*)foldbuf, foldlen);
4094 SvUTF8_on(sv);
4095 av_push(unicode_alternate, sv);
4096 }
4097 }
254ba52a 4098
60a8b682
JH
4099 /* If folding and the value is one of the Greek
4100 * sigmas insert a few more sigmas to make the
4101 * folding rules of the sigmas to work right.
4102 * Note that not all the possible combinations
4103 * are handled here: some of them are handled
9e55ce06
JH
4104 * by the standard folding rules, and some of
4105 * them (literal or EXACTF cases) are handled
4106 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4107 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4108 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4109 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4110 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4111 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4112 }
4113 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4114 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4115 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4116 }
4117 }
ffc61ed2 4118 }
8ada0baa 4119 }
ffc61ed2
JH
4120
4121 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4122 }
ffc61ed2 4123
936ed897 4124 if (need_class) {
4f66b38d 4125 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4126 if (SIZE_ONLY)
830247a4 4127 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4128 else
830247a4 4129 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4130 }
ffc61ed2 4131
ae5c130c 4132 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4133 if (!SIZE_ONLY &&
ffc61ed2 4134 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4135 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4136 ) {
a0ed51b3 4137 for (value = 0; value < 256; ++value) {
936ed897 4138 if (ANYOF_BITMAP_TEST(ret, value)) {
ffc61ed2
JH
4139 IV fold = PL_fold[value];
4140
4141 if (fold != value)
4142 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4143 }
4144 }
936ed897 4145 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4146 }
ffc61ed2 4147
ae5c130c 4148 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4149 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4150 /* If the only flag is inversion. */
4151 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4152 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4153 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4154 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4155 }
a0d0e21e 4156
b81d288d 4157 if (!SIZE_ONLY) {
fde631ed 4158 AV *av = newAV();
ffc61ed2
JH
4159 SV *rv;
4160
9e55ce06
JH
4161 /* The 0th element stores the character class description
4162 * in its textual form: used later (regexec.c:Perl_regclass_swatch())
4163 * to initialize the appropriate swash (which gets stored in
4164 * the 1st element), and also useful for dumping the regnode.
4165 * The 2nd element stores the multicharacter foldings,
4166 * used later (regexec.c:s_reginclasslen()). */
ffc61ed2
JH
4167 av_store(av, 0, listsv);
4168 av_store(av, 1, NULL);
9e55ce06 4169 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4170 rv = newRV_noinc((SV*)av);
19860706 4171 n = add_data(pRExC_state, 1, "s");
830247a4 4172 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4173 ARG_SET(ret, n);
a0ed51b3
LW
4174 }
4175
4176 return ret;
4177}
4178
76e3520e 4179STATIC char*
830247a4 4180S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4181{
830247a4 4182 char* retval = RExC_parse++;
a0d0e21e 4183
4633a7c4 4184 for (;;) {
830247a4
IZ
4185 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4186 RExC_parse[2] == '#') {
4187 while (*RExC_parse && *RExC_parse != ')')
4188 RExC_parse++;
4189 RExC_parse++;
4633a7c4
LW
4190 continue;
4191 }
830247a4
IZ
4192 if (RExC_flags16 & PMf_EXTENDED) {
4193 if (isSPACE(*RExC_parse)) {
4194 RExC_parse++;
748a9306
LW
4195 continue;
4196 }
830247a4
IZ
4197 else if (*RExC_parse == '#') {
4198 while (*RExC_parse && *RExC_parse != '\n')
4199 RExC_parse++;
4200 RExC_parse++;
748a9306
LW
4201 continue;
4202 }
748a9306 4203 }
4633a7c4 4204 return retval;
a0d0e21e 4205 }
a687059c
LW
4206}
4207
4208/*
c277df42 4209- reg_node - emit a node
a0d0e21e 4210*/
76e3520e 4211STATIC regnode * /* Location. */
830247a4 4212S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4213{
c277df42
IZ
4214 register regnode *ret;
4215 register regnode *ptr;
a687059c 4216
830247a4 4217 ret = RExC_emit;
c277df42 4218 if (SIZE_ONLY) {
830247a4
IZ
4219 SIZE_ALIGN(RExC_size);
4220 RExC_size += 1;
a0d0e21e
LW
4221 return(ret);
4222 }
a687059c 4223
c277df42 4224 NODE_ALIGN_FILL(ret);
a0d0e21e 4225 ptr = ret;
c277df42 4226 FILL_ADVANCE_NODE(ptr, op);
fac92740
MJD
4227 if (RExC_offsets) { /* MJD */
4228 MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4229 "reg_node", __LINE__,
4230 reg_name[op],
4231 RExC_emit - RExC_emit_start > RExC_offsets[0]
4232 ? "Overwriting end of array!\n" : "OK",
4233 RExC_emit - RExC_emit_start,
4234 RExC_parse - RExC_start,
4235 RExC_offsets[0]));
4236 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4237 }
4238
830247a4 4239 RExC_emit = ptr;
a687059c 4240
a0d0e21e 4241 return(ret);
a687059c
LW
4242}
4243
4244/*
a0d0e21e
LW
4245- reganode - emit a node with an argument
4246*/
76e3520e 4247STATIC regnode * /* Location. */
830247a4 4248S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4249{
c277df42
IZ
4250 register regnode *ret;
4251 register regnode *ptr;
fe14fcc3 4252
830247a4 4253 ret = RExC_emit;
c277df42 4254 if (SIZE_ONLY) {
830247a4
IZ
4255 SIZE_ALIGN(RExC_size);
4256 RExC_size += 2;
a0d0e21e
LW
4257 return(ret);
4258 }
fe14fcc3 4259
c277df42 4260 NODE_ALIGN_FILL(ret);
a0d0e21e 4261 ptr = ret;
c277df42 4262 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740
MJD
4263 if (RExC_offsets) { /* MJD */
4264 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4265 "reganode",
4266 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4267 "Overwriting end of array!\n" : "OK",
4268 RExC_emit - RExC_emit_start,
4269 RExC_parse - RExC_start,
4270 RExC_offsets[0]));
4271 Set_Cur_Node_Offset;
4272 }
4273
830247a4 4274 RExC_emit = ptr;
fe14fcc3 4275
a0d0e21e 4276 return(ret);
fe14fcc3
LW
4277}
4278
4279/*
cd439c50 4280- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4281*/
4282STATIC void
830247a4 4283S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4284{
5e12f4fb 4285 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4286}
4287
4288/*
a0d0e21e
LW
4289- reginsert - insert an operator in front of already-emitted operand
4290*
4291* Means relocating the operand.
4292*/
76e3520e 4293STATIC void
830247a4 4294S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4295{
c277df42
IZ
4296 register regnode *src;
4297 register regnode *dst;
4298 register regnode *place;
4299 register int offset = regarglen[(U8)op];
b81d288d 4300
22c35a8c 4301/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4302
4303 if (SIZE_ONLY) {
830247a4 4304 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4305 return;
4306 }
a687059c 4307
830247a4
IZ
4308 src = RExC_emit;
4309 RExC_emit += NODE_STEP_REGNODE + offset;
4310 dst = RExC_emit;
fac92740 4311 while (src > opnd) {
c277df42 4312 StructCopy(--src, --dst, regnode);
fac92740
MJD
4313 if (RExC_offsets) { /* MJD 20010112 */
4314 MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
4315 "reg_insert",
4316 dst - RExC_emit_start > RExC_offsets[0]
4317 ? "Overwriting end of array!\n" : "OK",
4318 src - RExC_emit_start,
4319 dst - RExC_emit_start,
4320 RExC_offsets[0]));
4321 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4322 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4323 }
4324 }
4325
a0d0e21e
LW
4326
4327 place = opnd; /* Op node, where operand used to be. */
fac92740
MJD
4328 if (RExC_offsets) { /* MJD */
4329 MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
4330 "reginsert",
4331 place - RExC_emit_start > RExC_offsets[0]
4332 ? "Overwriting end of array!\n" : "OK",
4333 place - RExC_emit_start,
4334 RExC_parse - RExC_start,
4335 RExC_offsets[0]));
4336 Set_Node_Offset(place, RExC_parse);
4337 }
c277df42
IZ
4338 src = NEXTOPER(place);
4339 FILL_ADVANCE_NODE(place, op);
4340 Zero(src, offset, regnode);
a687059c
LW
4341}
4342
4343/*
c277df42 4344- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4345*/
76e3520e 4346STATIC void
830247a4 4347S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4348{
c277df42
IZ
4349 register regnode *scan;
4350 register regnode *temp;
a0d0e21e 4351
c277df42 4352 if (SIZE_ONLY)
a0d0e21e
LW
4353 return;
4354
4355 /* Find last node. */
4356 scan = p;
4357 for (;;) {
4358 temp = regnext(scan);
4359 if (temp == NULL)
4360 break;
4361 scan = temp;
4362 }
a687059c 4363
c277df42
IZ
4364 if (reg_off_by_arg[OP(scan)]) {
4365 ARG_SET(scan, val - scan);
a0ed51b3
LW
4366 }
4367 else {
c277df42
IZ
4368 NEXT_OFF(scan) = val - scan;
4369 }
a687059c
LW
4370}
4371
4372/*
a0d0e21e
LW
4373- regoptail - regtail on operand of first argument; nop if operandless
4374*/
76e3520e 4375STATIC void
830247a4 4376S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4377{
a0d0e21e 4378 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4379 if (p == NULL || SIZE_ONLY)
4380 return;
22c35a8c 4381 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4382 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4383 }
22c35a8c 4384 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4385 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4386 }
4387 else
a0d0e21e 4388 return;
a687059c
LW
4389}
4390
4391/*
4392 - regcurly - a little FSA that accepts {\d+,?\d*}
4393 */
79072805 4394STATIC I32
cea2e8a9 4395S_regcurly(pTHX_ register char *s)
a687059c
LW
4396{
4397 if (*s++ != '{')
4398 return FALSE;
f0fcb552 4399 if (!isDIGIT(*s))
a687059c 4400 return FALSE;
f0fcb552 4401 while (isDIGIT(*s))
a687059c
LW
4402 s++;
4403 if (*s == ',')
4404 s++;
f0fcb552 4405 while (isDIGIT(*s))
a687059c
LW
4406 s++;
4407 if (*s != '}')
4408 return FALSE;
4409 return TRUE;
4410}
4411
a687059c 4412
8fa7f367
JH
4413#ifdef DEBUGGING
4414
76e3520e 4415STATIC regnode *
cea2e8a9 4416S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4417{
f248d071 4418 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4419 register regnode *next;
c277df42
IZ
4420
4421 while (op != END && (!last || node < last)) {
4422 /* While that wasn't END last time... */
4423
4424 NODE_ALIGN(node);
4425 op = OP(node);
4426 if (op == CLOSE)
4427 l--;
4428 next = regnext(node);
4429 /* Where, what. */
4430 if (OP(node) == OPTIMIZED)
4431 goto after_print;
4432 regprop(sv, node);
b900a521 4433 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4434 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4435 if (next == NULL) /* Next ptr. */
4436 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4437 else
b900a521 4438 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4439 (void)PerlIO_putc(Perl_debug_log, '\n');
4440 after_print:
22c35a8c 4441 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4442 register regnode *nnode = (OP(next) == LONGJMP
4443 ? regnext(next)
c277df42
IZ
4444 : next);
4445 if (last && nnode > last)
4446 nnode = last;
4447 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4448 }
22c35a8c 4449 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4450 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4451 }
4452 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4453 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4454 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4455 }
22c35a8c 4456 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4457 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4458 next, sv, l + 1);
a0ed51b3
LW
4459 }
4460 else if ( op == PLUS || op == STAR) {
c277df42 4461 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4462 }
4463 else if (op == ANYOF) {
4f66b38d
HS
4464 /* arglen 1 + class block */
4465 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4466 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4467 node = NEXTOPER(node);
a0ed51b3 4468 }
22c35a8c 4469 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4470 /* Literal string, where present. */
cd439c50 4471 node += NODE_SZ_STR(node) - 1;
c277df42 4472 node = NEXTOPER(node);
a0ed51b3
LW
4473 }
4474 else {
c277df42
IZ
4475 node = NEXTOPER(node);
4476 node += regarglen[(U8)op];
4477 }
4478 if (op == CURLYX || op == OPEN)
4479 l++;
4480 else if (op == WHILEM)
4481 l--;
4482 }
4483 return node;
4484}
4485
8fa7f367
JH
4486#endif /* DEBUGGING */
4487
a687059c 4488/*
fd181c75 4489 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4490 */
4491void
864dbfa3 4492Perl_regdump(pTHX_ regexp *r)
a687059c 4493{
35ff7856 4494#ifdef DEBUGGING
46fc3d4c 4495 SV *sv = sv_newmortal();
a687059c 4496
c277df42 4497 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4498
4499 /* Header fields of interest. */
c277df42 4500 if (r->anchored_substr)
7b0972df 4501 PerlIO_printf(Perl_debug_log,
b81d288d 4502 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4503 PL_colors[0],
7b0972df 4504 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4505 SvPVX(r->anchored_substr),
3280af22 4506 PL_colors[1],
c277df42 4507 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4508 (IV)r->anchored_offset);
c277df42 4509 if (r->float_substr)
7b0972df 4510 PerlIO_printf(Perl_debug_log,
b81d288d 4511 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4512 PL_colors[0],
b81d288d 4513 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4514 SvPVX(r->float_substr),
3280af22 4515 PL_colors[1],
c277df42 4516 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4517 (IV)r->float_min_offset, (UV)r->float_max_offset);
c277df42 4518 if (r->check_substr)
b81d288d
AB
4519 PerlIO_printf(Perl_debug_log,
4520 r->check_substr == r->float_substr
c277df42
IZ
4521 ? "(checking floating" : "(checking anchored");
4522 if (r->reganch & ROPT_NOSCAN)
4523 PerlIO_printf(Perl_debug_log, " noscan");
4524 if (r->reganch & ROPT_CHECK_ALL)
4525 PerlIO_printf(Perl_debug_log, " isall");
4526 if (r->check_substr)
4527 PerlIO_printf(Perl_debug_log, ") ");
4528
46fc3d4c 4529 if (r->regstclass) {
4530 regprop(sv, r->regstclass);
4531 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4532 }
774d564b 4533 if (r->reganch & ROPT_ANCH) {
4534 PerlIO_printf(Perl_debug_log, "anchored");
4535 if (r->reganch & ROPT_ANCH_BOL)
4536 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4537 if (r->reganch & ROPT_ANCH_MBOL)
4538 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4539 if (r->reganch & ROPT_ANCH_SBOL)
4540 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4541 if (r->reganch & ROPT_ANCH_GPOS)
4542 PerlIO_printf(Perl_debug_log, "(GPOS)");
4543 PerlIO_putc(Perl_debug_log, ' ');
4544 }
c277df42
IZ
4545 if (r->reganch & ROPT_GPOS_SEEN)
4546 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4547 if (r->reganch & ROPT_SKIP)
760ac839 4548 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4549 if (r->reganch & ROPT_IMPLICIT)
760ac839 4550 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4551 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4552 if (r->reganch & ROPT_EVAL_SEEN)
4553 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4554 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4555 if (r->offsets) {
4556 U32 i;
4557 U32 len = r->offsets[0];
392fbf5d 4558 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4559 for (i = 1; i <= len; i++)
392fbf5d
RB
4560 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4561 (UV)r->offsets[i*2-1],
4562 (UV)r->offsets[i*2]);
fac92740
MJD
4563 PerlIO_printf(Perl_debug_log, "\n");
4564 }
17c3b450 4565#endif /* DEBUGGING */
a687059c
LW
4566}
4567
8fa7f367
JH
4568#ifdef DEBUGGING
4569
653099ff
GS
4570STATIC void
4571S_put_byte(pTHX_ SV *sv, int c)
4572{
7be5a6cf 4573 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4574 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4575 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4576 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4577 else
4578 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4579}
4580
8fa7f367
JH
4581#endif /* DEBUGGING */
4582
a687059c 4583/*
a0d0e21e
LW
4584- regprop - printable representation of opcode
4585*/
46fc3d4c 4586void
864dbfa3 4587Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4588{
35ff7856 4589#ifdef DEBUGGING
9b155405 4590 register int k;
a0d0e21e 4591
54dc92de 4592 sv_setpvn(sv, "", 0);
9b155405 4593 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4594 /* It would be nice to FAIL() here, but this may be called from
4595 regexec.c, and it would be hard to supply pRExC_state. */
4596 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4597 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4598
4599 k = PL_regkind[(U8)OP(o)];
4600
2a782b5b
JH
4601 if (k == EXACT) {
4602 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4603 /* Using is_utf8_string() is a crude hack but it may
4604 * be the best for now since we have no flag "this EXACTish
4605 * node was UTF-8" --jhi */
4606 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4607 char *s = do_utf8 ?
c728cb41
JH
4608 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4609 UNI_DISPLAY_REGEX) :
2a782b5b 4610 STRING(o);
40eddc46 4611 int len = do_utf8 ?
2a782b5b
JH
4612 strlen(s) :
4613 STR_LEN(o);
4614 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4615 PL_colors[0],
4616 len, s,
4617 PL_colors[1]);
4618 }
9b155405 4619 else if (k == CURLY) {
cb434fcc 4620 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4621 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4622 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4623 }
2c2d71f5
JH
4624 else if (k == WHILEM && o->flags) /* Ordinal/of */
4625 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4626 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4627 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4628 else if (k == LOGICAL)
04ebc1ab 4629 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4630 else if (k == ANYOF) {
4631 int i, rangestart = -1;
ffc61ed2 4632 U8 flags = ANYOF_FLAGS(o);
19860706
JH
4633 const char * const anyofs[] = { /* Should be syncronized with
4634 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4635 "\\w",
4636 "\\W",
4637 "\\s",
4638 "\\S",
4639 "\\d",
4640 "\\D",
4641 "[:alnum:]",
4642 "[:^alnum:]",
4643 "[:alpha:]",
4644 "[:^alpha:]",
4645 "[:ascii:]",
4646 "[:^ascii:]",
4647 "[:ctrl:]",
4648 "[:^ctrl:]",
4649 "[:graph:]",
4650 "[:^graph:]",
4651 "[:lower:]",
4652 "[:^lower:]",
4653 "[:print:]",
4654 "[:^print:]",
4655 "[:punct:]",
4656 "[:^punct:]",
4657 "[:upper:]",
aaa51d5e 4658 "[:^upper:]",
653099ff 4659 "[:xdigit:]",
aaa51d5e
JF
4660 "[:^xdigit:]",
4661 "[:space:]",
4662 "[:^space:]",
4663 "[:blank:]",
4664 "[:^blank:]"
653099ff
GS
4665 };
4666
19860706 4667 if (flags & ANYOF_LOCALE)
653099ff 4668 sv_catpv(sv, "{loc}");
19860706 4669 if (flags & ANYOF_FOLD)
653099ff
GS
4670 sv_catpv(sv, "{i}");
4671 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4672 if (flags & ANYOF_INVERT)
653099ff 4673 sv_catpv(sv, "^");
ffc61ed2
JH
4674 for (i = 0; i <= 256; i++) {
4675 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4676 if (rangestart == -1)
4677 rangestart = i;
4678 } else if (rangestart != -1) {
4679 if (i <= rangestart + 3)
4680 for (; rangestart < i; rangestart++)
653099ff 4681 put_byte(sv, rangestart);
ffc61ed2
JH
4682 else {
4683 put_byte(sv, rangestart);
4684 sv_catpv(sv, "-");
4685 put_byte(sv, i - 1);
653099ff 4686 }
ffc61ed2 4687 rangestart = -1;
653099ff 4688 }
847a199f 4689 }
ffc61ed2
JH
4690
4691 if (o->flags & ANYOF_CLASS)
4692 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4693 if (ANYOF_CLASS_TEST(o,i))
4694 sv_catpv(sv, anyofs[i]);
4695
4696 if (flags & ANYOF_UNICODE)
4697 sv_catpv(sv, "{unicode}");
1aa99e6b 4698 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4699 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4700
4701 {
4702 SV *lv;
9e55ce06 4703 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4704
ffc61ed2
JH
4705 if (lv) {
4706 if (sw) {
4707 UV i;
4708 U8 s[UTF8_MAXLEN+1];
b81d288d 4709
ffc61ed2 4710 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4711 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4712
3568d838 4713 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4714 if (rangestart == -1)
4715 rangestart = i;
4716 } else if (rangestart != -1) {
4717 U8 *p;
b81d288d 4718
ffc61ed2
JH
4719 if (i <= rangestart + 3)
4720 for (; rangestart < i; rangestart++) {
2b9d42f0 4721 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4722 put_byte(sv, *p);
4723 }
4724 else {
2b9d42f0 4725 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4726 put_byte(sv, *p);
4727 sv_catpv(sv, "-");
2b9d42f0 4728 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4729 put_byte(sv, *p);
4730 }
4731 rangestart = -1;
4732 }
19860706 4733 }
ffc61ed2
JH
4734
4735 sv_catpv(sv, "..."); /* et cetera */
19860706 4736 }
fde631ed 4737
ffc61ed2
JH
4738 {
4739 char *s = savepv(SvPVX(lv));
4740 char *origs = s;
b81d288d 4741
ffc61ed2 4742 while(*s && *s != '\n') s++;
b81d288d 4743
ffc61ed2
JH
4744 if (*s == '\n') {
4745 char *t = ++s;
4746
4747 while (*s) {
4748 if (*s == '\n')
4749 *s = ' ';
4750 s++;
4751 }
4752 if (s[-1] == ' ')
4753 s[-1] = 0;
4754
4755 sv_catpv(sv, t);
fde631ed 4756 }
b81d288d 4757
ffc61ed2 4758 Safefree(origs);
fde631ed
JH
4759 }
4760 }
653099ff 4761 }
ffc61ed2 4762
653099ff
GS
4763 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4764 }
9b155405 4765 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4766 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4767#endif /* DEBUGGING */
35ff7856 4768}
a687059c 4769
cad2e5aa
JH
4770SV *
4771Perl_re_intuit_string(pTHX_ regexp *prog)
4772{ /* Assume that RE_INTUIT is set */
4773 DEBUG_r(
4774 { STRLEN n_a;
4775 char *s = SvPV(prog->check_substr,n_a);
4776
4777 if (!PL_colorset) reginitcolors();
4778 PerlIO_printf(Perl_debug_log,
4779 "%sUsing REx substr:%s `%s%.60s%s%s'\n",
4780 PL_colors[4],PL_colors[5],PL_colors[0],
4781 s,
4782 PL_colors[1],
4783 (strlen(s) > 60 ? "..." : ""));
4784 } );
4785
4786 return prog->check_substr;
4787}
4788
2b69d0c2 4789void
864dbfa3 4790Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4791{
9e55ce06
JH
4792#ifdef DEBUGGING
4793 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4794#endif
7821416a
IZ
4795
4796 if (!r || (--r->refcnt > 0))
4797 return;
9e55ce06 4798 DEBUG_r({
9e55ce06 4799 char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60,
c728cb41 4800 UNI_DISPLAY_REGEX);
9e55ce06
JH
4801 int len = SvCUR(dsv);
4802 if (!PL_colorset)
4803 reginitcolors();
4804 PerlIO_printf(Perl_debug_log,
4805 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4806 PL_colors[4],PL_colors[5],PL_colors[0],
4807 len, len, s,
4808 PL_colors[1],
4809 len > 60 ? "..." : "");
4810 });
cad2e5aa 4811
c277df42 4812 if (r->precomp)
a0d0e21e 4813 Safefree(r->precomp);
fac92740
MJD
4814 if (r->offsets) /* 20010421 MJD */
4815 Safefree(r->offsets);
cf93c79d
IZ
4816 if (RX_MATCH_COPIED(r))
4817 Safefree(r->subbeg);
a193d654
GS
4818 if (r->substrs) {
4819 if (r->anchored_substr)
4820 SvREFCNT_dec(r->anchored_substr);
4821 if (r->float_substr)
4822 SvREFCNT_dec(r->float_substr);
2779dcf1 4823 Safefree(r->substrs);
a193d654 4824 }
c277df42
IZ
4825 if (r->data) {
4826 int n = r->data->count;
dfad63ad
HS
4827 AV* new_comppad = NULL;
4828 AV* old_comppad;
4829 SV** old_curpad;
4830
c277df42 4831 while (--n >= 0) {
261faec3 4832 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4833 switch (r->data->what[n]) {
4834 case 's':
4835 SvREFCNT_dec((SV*)r->data->data[n]);
4836 break;
653099ff
GS
4837 case 'f':
4838 Safefree(r->data->data[n]);
4839 break;
dfad63ad
HS
4840 case 'p':
4841 new_comppad = (AV*)r->data->data[n];
4842 break;
c277df42 4843 case 'o':
dfad63ad 4844 if (new_comppad == NULL)
cea2e8a9 4845 Perl_croak(aTHX_ "panic: pregfree comppad");
dfad63ad
HS
4846 old_comppad = PL_comppad;
4847 old_curpad = PL_curpad;
1e6dc0b6
SB
4848 /* Watch out for global destruction's random ordering. */
4849 if (SvTYPE(new_comppad) == SVt_PVAV) {
4850 PL_comppad = new_comppad;
4851 PL_curpad = AvARRAY(new_comppad);
4852 }
4853 else
4854 PL_curpad = NULL;
9b978d73
DM
4855
4856 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4857 op_free((OP_4tree*)r->data->data[n]);
4858 }
4859
dfad63ad
HS
4860 PL_comppad = old_comppad;
4861 PL_curpad = old_curpad;
4862 SvREFCNT_dec((SV*)new_comppad);
4863 new_comppad = NULL;
c277df42
IZ
4864 break;
4865 case 'n':
9e55ce06 4866 break;
c277df42 4867 default:
830247a4 4868 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4869 }
4870 }
4871 Safefree(r->data->what);
4872 Safefree(r->data);
a0d0e21e
LW
4873 }
4874 Safefree(r->startp);
4875 Safefree(r->endp);
4876 Safefree(r);
a687059c 4877}
c277df42
IZ
4878
4879/*
4880 - regnext - dig the "next" pointer out of a node
4881 *
4882 * [Note, when REGALIGN is defined there are two places in regmatch()
4883 * that bypass this code for speed.]
4884 */
4885regnode *
864dbfa3 4886Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
4887{
4888 register I32 offset;
4889
3280af22 4890 if (p == &PL_regdummy)
c277df42
IZ
4891 return(NULL);
4892
4893 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4894 if (offset == 0)
4895 return(NULL);
4896
c277df42 4897 return(p+offset);
c277df42
IZ
4898}
4899
01f988be 4900STATIC void
cea2e8a9 4901S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
4902{
4903 va_list args;
4904 STRLEN l1 = strlen(pat1);
4905 STRLEN l2 = strlen(pat2);
4906 char buf[512];
06bf62c7 4907 SV *msv;
c277df42
IZ
4908 char *message;
4909
4910 if (l1 > 510)
4911 l1 = 510;
4912 if (l1 + l2 > 510)
4913 l2 = 510 - l1;
4914 Copy(pat1, buf, l1 , char);
4915 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
4916 buf[l1 + l2] = '\n';
4917 buf[l1 + l2 + 1] = '\0';
8736538c
AS
4918#ifdef I_STDARG
4919 /* ANSI variant takes additional second argument */
c277df42 4920 va_start(args, pat2);
8736538c
AS
4921#else
4922 va_start(args);
4923#endif
5a844595 4924 msv = vmess(buf, &args);
c277df42 4925 va_end(args);
06bf62c7 4926 message = SvPV(msv,l1);
c277df42
IZ
4927 if (l1 > 512)
4928 l1 = 512;
4929 Copy(message, buf, l1 , char);
4930 buf[l1] = '\0'; /* Overwrite \n */
cea2e8a9 4931 Perl_croak(aTHX_ "%s", buf);
c277df42 4932}
a0ed51b3
LW
4933
4934/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
4935
4936void
864dbfa3 4937Perl_save_re_context(pTHX)
b81d288d 4938{
830247a4
IZ
4939#if 0
4940 SAVEPPTR(RExC_precomp); /* uncompiled string. */
4941 SAVEI32(RExC_npar); /* () count. */
4942 SAVEI32(RExC_size); /* Code size. */
4943 SAVEI16(RExC_flags16); /* are we folding, multilining? */
4944 SAVEVPTR(RExC_rx); /* from regcomp.c */
4945 SAVEI32(RExC_seen); /* from regcomp.c */
4946 SAVEI32(RExC_sawback); /* Did we see \1, ...? */
4947 SAVEI32(RExC_naughty); /* How bad is this pattern? */
4948 SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
4949 SAVEPPTR(RExC_end); /* End of input for compile */
4950 SAVEPPTR(RExC_parse); /* Input-scan pointer. */
4951#endif
4952
4953 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 4954 SAVEPPTR(PL_bostr);
a0ed51b3
LW
4955 SAVEPPTR(PL_reginput); /* String-input pointer. */
4956 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
4957 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
4958 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
4959 SAVEVPTR(PL_regendp); /* Ditto for endp. */
4960 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a0ed51b3 4961 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 4962 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 4963 PL_reg_start_tmp = 0;
a0ed51b3
LW
4964 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
4965 PL_reg_start_tmpl = 0;
7766f137 4966 SAVEVPTR(PL_regdata);
a0ed51b3
LW
4967 SAVEI32(PL_reg_eval_set); /* from regexec.c */
4968 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 4969 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 4970 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
4971 SAVEVPTR(PL_regcc); /* from regexec.c */
4972 SAVEVPTR(PL_curcop);
7766f137
GS
4973 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
4974 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
4975 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
4976 SAVESPTR(PL_reg_sv); /* from regexec.c */
53c4c00c 4977 SAVEI8(PL_reg_match_utf8); /* from regexec.c */
7766f137 4978 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 4979 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
4980 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
4981 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5fb7366e 4982 SAVEI32(PL_regnpar); /* () count. */
e49a9654 4983 SAVEI32(PL_regsize); /* from regexec.c */
54b6e2fa 4984#ifdef DEBUGGING
b81d288d 4985 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 4986#endif
a0ed51b3 4987}
51371543 4988
51371543 4989static void
acfe0abc 4990clear_re(pTHX_ void *r)
51371543
GS
4991{
4992 ReREFCNT_dec((regexp *)r);
4993}
ffbc6a93 4994