This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix copy & paste bugs in mX?PUSH macro tests.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2
JH
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 74 ****
9ef589d8
LW
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
77
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGCOMP_C
a687059c 85#include "perl.h"
d06ea78c 86
acfe0abc 87#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
88# include "INTERN.h"
89#endif
c277df42
IZ
90
91#define REG_COMP_C
a687059c
LW
92#include "regcomp.h"
93
d4cce5f1 94#ifdef op
11343788 95#undef op
d4cce5f1 96#endif /* op */
11343788 97
fe14fcc3 98#ifdef MSDOS
7e4e8c89 99# if defined(BUGGY_MSC6)
fe14fcc3 100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 101# pragma optimize("a",off)
fe14fcc3 102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
103# pragma optimize("w",on )
104# endif /* BUGGY_MSC6 */
fe14fcc3
LW
105#endif /* MSDOS */
106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
830247a4 111typedef struct RExC_state_t {
e2509266 112 U32 flags; /* are we folding, multilining? */
830247a4
IZ
113 char *precomp; /* uncompiled string. */
114 regexp *rx;
fac92740 115 char *start; /* Start of input for compile */
830247a4
IZ
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 119 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 120 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
126 I32 extralen;
127 I32 seen_zerolen;
128 I32 seen_evals;
1aa99e6b 129 I32 utf8;
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
134} RExC_state_t;
135
e2509266 136#define RExC_flags (pRExC_state->flags)
830247a4
IZ
137#define RExC_precomp (pRExC_state->precomp)
138#define RExC_rx (pRExC_state->rx)
fac92740 139#define RExC_start (pRExC_state->start)
830247a4
IZ
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 143#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 144#define RExC_emit (pRExC_state->emit)
fac92740 145#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
146#define RExC_naughty (pRExC_state->naughty)
147#define RExC_sawback (pRExC_state->sawback)
148#define RExC_seen (pRExC_state->seen)
149#define RExC_size (pRExC_state->size)
150#define RExC_npar (pRExC_state->npar)
151#define RExC_extralen (pRExC_state->extralen)
152#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 154#define RExC_utf8 (pRExC_state->utf8)
830247a4 155
a687059c
LW
156#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
a687059c 159
35c8bce7
LW
160#ifdef SPSTART
161#undef SPSTART /* dratted cpp namespace... */
162#endif
a687059c
LW
163/*
164 * Flags to be passed up and down.
165 */
a687059c 166#define WORST 0 /* Worst case. */
821b33a5 167#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
168#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169#define SPSTART 0x4 /* Starts with * or +. */
170#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 171
2c2d71f5
JH
172/* Length of a variant. */
173
174typedef struct scan_data_t {
175 I32 len_min;
176 I32 len_delta;
177 I32 pos_min;
178 I32 pos_delta;
179 SV *last_found;
180 I32 last_end; /* min value, <0 unless valid. */
181 I32 last_start_min;
182 I32 last_start_max;
183 SV **longest; /* Either &l_fixed, or &l_float. */
184 SV *longest_fixed;
185 I32 offset_fixed;
186 SV *longest_float;
187 I32 offset_float_min;
188 I32 offset_float_max;
189 I32 flags;
190 I32 whilem_c;
cb434fcc 191 I32 *last_closep;
653099ff 192 struct regnode_charclass_class *start_class;
2c2d71f5
JH
193} scan_data_t;
194
a687059c 195/*
e50aee73 196 * Forward declarations for pregcomp()'s friends.
a687059c 197 */
a0d0e21e 198
b81d288d 199static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 200 0, 0, 0, 0, 0, 0};
c277df42
IZ
201
202#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203#define SF_BEFORE_SEOL 0x1
204#define SF_BEFORE_MEOL 0x2
205#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
09b7f37c
CB
208#ifdef NO_UNARY_PLUS
209# define SF_FIX_SHIFT_EOL (0+2)
210# define SF_FL_SHIFT_EOL (0+4)
211#else
212# define SF_FIX_SHIFT_EOL (+2)
213# define SF_FL_SHIFT_EOL (+4)
214#endif
c277df42
IZ
215
216#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221#define SF_IS_INF 0x40
222#define SF_HAS_PAR 0x80
223#define SF_IN_PAR 0x100
224#define SF_HAS_EVAL 0x200
4bfe0158 225#define SCF_DO_SUBSTR 0x400
653099ff
GS
226#define SCF_DO_STCLASS_AND 0x0800
227#define SCF_DO_STCLASS_OR 0x1000
228#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 229#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 230
eb160463 231#define UTF (RExC_utf8 != 0)
e2509266
JH
232#define LOC ((RExC_flags & PMf_LOCALE) != 0)
233#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 234
ffc61ed2 235#define OOB_UNICODE 12345678
93733859 236#define OOB_NAMEDCLASS -1
b8c5462f 237
a0ed51b3
LW
238#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
8615cb43 241
b45f050a
JF
242/* length of regex to show in messages that don't mark a position within */
243#define RegexLengthToShowInErrorMessages 127
244
245/*
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
249 */
7253e4e3
RK
250#define MARKER1 "<-- HERE" /* marker as it appears in the description */
251#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 252
7253e4e3 253#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
254
255/*
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
258 * "...".
259 */
ccb2c380
MP
260#define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
263 \
264 if (!SIZE_ONLY) \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
269 ellipses = "..."; \
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
273} STMT_END
8615cb43 274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
ccb2c380
MP
280#define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
283 \
284 if (!SIZE_ONLY) \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
289 ellipses = "..."; \
290 } \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
293} STMT_END
b45f050a
JF
294
295
296/*
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 */
ccb2c380
MP
299#define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
303} STMT_END
b45f050a
JF
304
305/*
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 */
ccb2c380
MP
308#define vFAIL(m) STMT_START { \
309 if (!SIZE_ONLY) \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
311 Simple_vFAIL(m); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Like Simple_vFAIL(), but accepts two arguments.
316 */
ccb2c380
MP
317#define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 */
ccb2c380
MP
326#define vFAIL2(m,a1) STMT_START { \
327 if (!SIZE_ONLY) \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
330} STMT_END
b45f050a
JF
331
332
333/*
334 * Like Simple_vFAIL(), but accepts three arguments.
335 */
ccb2c380
MP
336#define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
340} STMT_END
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 */
ccb2c380
MP
345#define vFAIL3(m,a1,a2) STMT_START { \
346 if (!SIZE_ONLY) \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Like Simple_vFAIL(), but accepts four arguments.
353 */
ccb2c380
MP
354#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts five arguments.
362 */
ccb2c380
MP
363#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
368
369
370#define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
375
376#define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
382
383
384#define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
388} STMT_END
389
390#define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
394} STMT_END
395
396#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400} STMT_END
401
402#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
9d1d55b5 407
8615cb43 408
cd439c50 409/* Allow for side effects in s */
ccb2c380
MP
410#define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412} STMT_END
cd439c50 413
fac92740
MJD
414/* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
418 */
419
420#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
421/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424#define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 if (! SIZE_ONLY) { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
428 if((node) < 0) { \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 } else { \
431 RExC_offsets[2*(node)-1] = (byte); \
432 } \
433 } \
434} STMT_END
435
436#define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440#define Set_Node_Length_To_R(node,len) STMT_START { \
441 if (! SIZE_ONLY) { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
444 if((node) < 0) { \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 } else { \
447 RExC_offsets[2*(node)] = (len); \
448 } \
449 } \
450} STMT_END
451
452#define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455#define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
457
458/* Get offsets and lengths */
459#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
acfe0abc 462static void clear_re(pTHX_ void *r);
4327152a 463
653099ff
GS
464/* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
467
4327152a 468STATIC void
830247a4 469S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 470{
a0ed51b3
LW
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 473
c277df42 474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 475 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
b81d288d 479 data->flags
c277df42
IZ
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 else
482 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
483 }
484 else {
c277df42 485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
486 data->offset_float_max = (l
487 ? data->last_start_max
c277df42 488 : data->pos_min + data->pos_delta);
9051bda5
HS
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
c277df42 491 if (data->flags & SF_BEFORE_EOL)
b81d288d 492 data->flags
c277df42
IZ
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 else
495 data->flags &= ~SF_FL_BEFORE_EOL;
496 }
497 }
498 SvCUR_set(data->last_found, 0);
0eda9292
JH
499 {
500 SV * sv = data->last_found;
501 MAGIC *mg =
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
504 mg->mg_len = 0;
505 }
c277df42
IZ
506 data->last_end = -1;
507 data->flags &= ~SF_BEFORE_EOL;
508}
509
653099ff
GS
510/* Can match anything (initialization) */
511STATIC void
830247a4 512S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 513{
653099ff 514 ANYOF_CLASS_ZERO(cl);
f8bef550 515 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
517 if (LOC)
518 cl->flags |= ANYOF_LOCALE;
519}
520
521/* Can match anything (initialization) */
522STATIC int
523S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524{
525 int value;
526
aaa51d5e 527 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 return 1;
1aa99e6b
IH
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 return 0;
f8bef550
NC
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
533 return 0;
653099ff
GS
534 return 1;
535}
536
537/* Can match anything (initialization) */
538STATIC void
830247a4 539S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 540{
8ecf7187 541 Zero(cl, 1, struct regnode_charclass_class);
653099ff 542 cl->type = ANYOF;
830247a4 543 cl_anything(pRExC_state, cl);
653099ff
GS
544}
545
546STATIC void
830247a4 547S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 548{
8ecf7187 549 Zero(cl, 1, struct regnode_charclass_class);
653099ff 550 cl->type = ANYOF;
830247a4 551 cl_anything(pRExC_state, cl);
653099ff
GS
552 if (LOC)
553 cl->flags |= ANYOF_LOCALE;
554}
555
556/* 'And' a given class with another one. Can create false positives */
557/* We assume that cl is not inverted */
558STATIC void
559S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
561{
653099ff
GS
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
567 int i;
568
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
572 else
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
1aa99e6b 578
14ebb1a2
JH
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
581 cl->flags &= ~ANYOF_UNICODE_ALL;
582 cl->flags |= ANYOF_UNICODE;
583 ARG_SET(cl, ARG(and_with));
584 }
14ebb1a2
JH
585 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 587 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 590 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
591}
592
593/* 'OR' a given class with another one. Can create false positives */
594/* We assume that cl is not inverted */
595STATIC void
830247a4 596S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 597{
653099ff
GS
598 if (or_with->flags & ANYOF_INVERT) {
599 /* We do not use
600 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601 * <= (B1 | !B2) | (CL1 | !CL2)
602 * which is wasteful if CL2 is small, but we ignore CL2:
603 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604 * XXXX Can we handle case-fold? Unclear:
605 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 */
608 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609 && !(or_with->flags & ANYOF_FOLD)
610 && !(cl->flags & ANYOF_FOLD) ) {
611 int i;
612
613 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614 cl->bitmap[i] |= ~or_with->bitmap[i];
615 } /* XXXX: logic is complicated otherwise */
616 else {
830247a4 617 cl_anything(pRExC_state, cl);
653099ff
GS
618 }
619 } else {
620 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 622 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
623 || (cl->flags & ANYOF_FOLD)) ) {
624 int i;
625
626 /* OR char bitmap and class bitmap separately */
627 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628 cl->bitmap[i] |= or_with->bitmap[i];
629 if (or_with->flags & ANYOF_CLASS) {
630 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631 cl->classflags[i] |= or_with->classflags[i];
632 cl->flags |= ANYOF_CLASS;
633 }
634 }
635 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 636 cl_anything(pRExC_state, cl);
653099ff
GS
637 }
638 }
639 if (or_with->flags & ANYOF_EOS)
640 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
641
642 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643 ARG(cl) != ARG(or_with)) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
646 }
647 if (or_with->flags & ANYOF_UNICODE_ALL) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
650 }
653099ff
GS
651}
652
5d1c421c
JH
653/*
654 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655 * These need to be revisited when a newer toolchain becomes available.
656 */
657#if defined(__sparc64__) && defined(__GNUC__)
658# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659# undef SPARC64_GCC_WORKAROUND
660# define SPARC64_GCC_WORKAROUND 1
661# endif
662#endif
663
653099ff
GS
664/* REx optimizer. Converts nodes into quickier variants "in place".
665 Finds fixed substrings. */
666
c277df42
IZ
667/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668 to the position after last scanned or to NULL. */
669
76e3520e 670STATIC I32
830247a4 671S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
672 /* scanp: Start here (read-write). */
673 /* deltap: Write maxlen-minlen here. */
674 /* last: Stop before this one. */
675{
676 I32 min = 0, pars = 0, code;
677 regnode *scan = *scanp, *next;
678 I32 delta = 0;
679 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 680 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
681 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682 scan_data_t data_fake;
653099ff 683 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 684
c277df42
IZ
685 while (scan && OP(scan) != END && scan < last) {
686 /* Peephole optimizer: */
687
22c35a8c 688 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 689 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
690 regnode *n = regnext(scan);
691 U32 stringok = 1;
692#ifdef DEBUGGING
693 regnode *stop = scan;
b81d288d 694#endif
c277df42 695
cd439c50 696 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
697 /* Skip NOTHING, merge EXACT*. */
698 while (n &&
b81d288d 699 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
700 (stringok && (OP(n) == OP(scan))))
701 && NEXT_OFF(n)
702 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703 if (OP(n) == TAIL || n > next)
704 stringok = 0;
22c35a8c 705 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
706 NEXT_OFF(scan) += NEXT_OFF(n);
707 next = n + NODE_STEP_REGNODE;
708#ifdef DEBUGGING
709 if (stringok)
710 stop = n;
b81d288d 711#endif
c277df42 712 n = regnext(n);
a0ed51b3 713 }
f49d4d0f 714 else if (stringok) {
cd439c50 715 int oldl = STR_LEN(scan);
c277df42 716 regnode *nnext = regnext(n);
f49d4d0f 717
b81d288d 718 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
719 break;
720 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
721 STR_LEN(scan) += STR_LEN(n);
722 next = n + NODE_SZ_STR(n);
c277df42 723 /* Now we can overwrite *n : */
f49d4d0f 724 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 725#ifdef DEBUGGING
f49d4d0f 726 stop = next - 1;
b81d288d 727#endif
c277df42
IZ
728 n = nnext;
729 }
730 }
61a36c01 731
d65e4eab 732 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
733/*
734 Two problematic code points in Unicode casefolding of EXACT nodes:
735
736 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
738
739 which casefold to
740
741 Unicode UTF-8
742
743 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
744 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
745
746 This means that in case-insensitive matching (or "loose matching",
747 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748 length of the above casefolded versions) can match a target string
749 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750 This would rather mess up the minimum length computation.
751
752 What we'll do is to look for the tail four bytes, and then peek
753 at the preceding two bytes to see whether we need to decrease
754 the minimum length by four (six minus two).
755
756 Thanks to the design of UTF-8, there cannot be false matches:
757 A sequence of valid UTF-8 bytes cannot be a subsequence of
758 another valid sequence of UTF-8 bytes.
759
760*/
761 char *s0 = STRING(scan), *s, *t;
762 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763 char *t0 = "\xcc\x88\xcc\x81";
764 char *t1 = t0 + 3;
765
766 for (s = s0 + 2;
767 s < s2 && (t = ninstr(s, s1, t0, t1));
768 s = t + 4) {
769 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
771 min -= 4;
772 }
773 }
774
c277df42
IZ
775#ifdef DEBUGGING
776 /* Allow dumping */
cd439c50 777 n = scan + NODE_SZ_STR(scan);
c277df42 778 while (n <= stop) {
22c35a8c 779 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
780 OP(n) = OPTIMIZED;
781 NEXT_OFF(n) = 0;
782 }
783 n++;
784 }
653099ff 785#endif
c277df42 786 }
653099ff
GS
787 /* Follow the next-chain of the current node and optimize
788 away all the NOTHINGs from it. */
c277df42 789 if (OP(scan) != CURLYX) {
048cfca1
GS
790 int max = (reg_off_by_arg[OP(scan)]
791 ? I32_MAX
792 /* I32 may be smaller than U16 on CRAYs! */
793 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
794 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
795 int noff;
796 regnode *n = scan;
b81d288d 797
c277df42
IZ
798 /* Skip NOTHING and LONGJMP. */
799 while ((n = regnext(n))
22c35a8c 800 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
801 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
802 && off + noff < max)
803 off += noff;
804 if (reg_off_by_arg[OP(scan)])
805 ARG(scan) = off;
b81d288d 806 else
c277df42
IZ
807 NEXT_OFF(scan) = off;
808 }
653099ff
GS
809 /* The principal pseudo-switch. Cannot be a switch, since we
810 look into several different things. */
b81d288d 811 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
812 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813 next = regnext(scan);
814 code = OP(scan);
b81d288d
AB
815
816 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 817 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 818 struct regnode_charclass_class accum;
c277df42 819
653099ff 820 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 821 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 822 if (flags & SCF_DO_STCLASS)
830247a4 823 cl_init_zero(pRExC_state, &accum);
c277df42 824 while (OP(scan) == code) {
830247a4 825 I32 deltanext, minnext, f = 0, fake;
653099ff 826 struct regnode_charclass_class this_class;
c277df42
IZ
827
828 num++;
829 data_fake.flags = 0;
b81d288d 830 if (data) {
2c2d71f5 831 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
832 data_fake.last_closep = data->last_closep;
833 }
834 else
835 data_fake.last_closep = &fake;
c277df42
IZ
836 next = regnext(scan);
837 scan = NEXTOPER(scan);
838 if (code != BRANCH)
839 scan = NEXTOPER(scan);
653099ff 840 if (flags & SCF_DO_STCLASS) {
830247a4 841 cl_init(pRExC_state, &this_class);
653099ff
GS
842 data_fake.start_class = &this_class;
843 f = SCF_DO_STCLASS_AND;
b81d288d 844 }
e1901655
IZ
845 if (flags & SCF_WHILEM_VISITED_POS)
846 f |= SCF_WHILEM_VISITED_POS;
653099ff 847 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
848 minnext = study_chunk(pRExC_state, &scan, &deltanext,
849 next, &data_fake, f);
b81d288d 850 if (min1 > minnext)
c277df42
IZ
851 min1 = minnext;
852 if (max1 < minnext + deltanext)
853 max1 = minnext + deltanext;
854 if (deltanext == I32_MAX)
aca2d497 855 is_inf = is_inf_internal = 1;
c277df42
IZ
856 scan = next;
857 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
858 pars++;
405ff068 859 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 860 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
861 if (data)
862 data->whilem_c = data_fake.whilem_c;
653099ff 863 if (flags & SCF_DO_STCLASS)
830247a4 864 cl_or(pRExC_state, &accum, &this_class);
b81d288d 865 if (code == SUSPEND)
c277df42
IZ
866 break;
867 }
868 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
869 min1 = 0;
870 if (flags & SCF_DO_SUBSTR) {
871 data->pos_min += min1;
872 data->pos_delta += max1 - min1;
873 if (max1 != min1 || is_inf)
874 data->longest = &(data->longest_float);
875 }
876 min += min1;
877 delta += max1 - min1;
653099ff 878 if (flags & SCF_DO_STCLASS_OR) {
830247a4 879 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
880 if (min1) {
881 cl_and(data->start_class, &and_with);
882 flags &= ~SCF_DO_STCLASS;
883 }
884 }
885 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
886 if (min1) {
887 cl_and(data->start_class, &accum);
653099ff 888 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
889 }
890 else {
b81d288d 891 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
892 * data->start_class */
893 StructCopy(data->start_class, &and_with,
894 struct regnode_charclass_class);
895 flags &= ~SCF_DO_STCLASS_AND;
896 StructCopy(&accum, data->start_class,
897 struct regnode_charclass_class);
898 flags |= SCF_DO_STCLASS_OR;
899 data->start_class->flags |= ANYOF_EOS;
900 }
653099ff 901 }
a0ed51b3
LW
902 }
903 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
907 continue;
a0ed51b3
LW
908 }
909 else if (OP(scan) == EXACT) {
cd439c50 910 I32 l = STR_LEN(scan);
1aa99e6b 911 UV uc = *((U8*)STRING(scan));
a0ed51b3 912 if (UTF) {
1aa99e6b
IH
913 U8 *s = (U8*)STRING(scan);
914 l = utf8_length(s, s + l);
9041c2e3 915 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
916 }
917 min += l;
c277df42 918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
b81d288d 924 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 925 }
cd439c50 926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
927 {
928 SV * sv = data->last_found;
929 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
5e43f467
JH
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 934 }
33b8afdf
JH
935 if (UTF)
936 SvUTF8_on(data->last_found);
c277df42
IZ
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
940 }
653099ff
GS
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
943 int compat = 1;
944
1aa99e6b 945 if (uc >= 0x100 ||
516a5887 946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 948 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 950 )
653099ff
GS
951 compat = 0;
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
954 if (compat)
1aa99e6b 955 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 956 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
957 if (uc < 0x100)
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
959 }
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
1aa99e6b 962 if (uc < 0x100)
9b877dbb
IH
963 ANYOF_BITMAP_SET(data->start_class, uc);
964 else
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
a0ed51b3 970 }
653099ff 971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 972 I32 l = STR_LEN(scan);
1aa99e6b 973 UV uc = *((U8*)STRING(scan));
653099ff
GS
974
975 /* Search for fixed substrings supports EXACT only. */
b81d288d 976 if (flags & SCF_DO_SUBSTR)
830247a4 977 scan_commit(pRExC_state, data);
a0ed51b3 978 if (UTF) {
1aa99e6b
IH
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
9041c2e3 981 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
982 }
983 min += l;
c277df42 984 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 985 data->pos_min += l;
653099ff
GS
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
988 int compat = 1;
989
1aa99e6b 990 if (uc >= 0x100 ||
516a5887 991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
994 compat = 0;
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
997 if (compat) {
1aa99e6b 998 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1003 }
1004 }
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1aa99e6b
IH
1009 if (uc < 0x100)
1010 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1011 data->start_class->flags &= ~ANYOF_EOS;
1012 }
1013 cl_and(data->start_class, &and_with);
1014 }
1015 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1016 }
4d61ec05 1017 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1019 I32 f = flags, pos_before = 0;
c277df42 1020 regnode *oscan = scan;
653099ff
GS
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
727f22e3 1023 I32 next_is_eval = 0;
653099ff 1024
22c35a8c 1025 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1026 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1027 scan = NEXTOPER(scan);
1028 goto finish;
1029 case PLUS:
653099ff 1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1031 next = NEXTOPER(scan);
653099ff 1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1033 mincount = 1;
1034 maxcount = REG_INFTY;
c277df42
IZ
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1037 goto do_curly;
1038 }
1039 }
1040 if (flags & SCF_DO_SUBSTR)
1041 data->pos_min++;
1042 min++;
1043 /* Fall through. */
1044 case STAR:
653099ff
GS
1045 if (flags & SCF_DO_STCLASS) {
1046 mincount = 0;
b81d288d 1047 maxcount = REG_INFTY;
653099ff
GS
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1050 goto do_curly;
1051 }
b81d288d 1052 is_inf = is_inf_internal = 1;
c277df42
IZ
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
830247a4 1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1056 data->longest = &(data->longest_float);
1057 }
1058 goto optimize_curly_tail;
1059 case CURLY:
b81d288d 1060 mincount = ARG1(scan);
c277df42
IZ
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
cb434fcc
IZ
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1065
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 }
c277df42 1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1069 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1070 do_curly:
1071 if (flags & SCF_DO_SUBSTR) {
830247a4 1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1073 pos_before = data->pos_min;
1074 }
1075 if (data) {
1076 fl = data->flags;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 if (is_inf)
1079 data->flags |= SF_IS_INF;
1080 }
653099ff 1081 if (flags & SCF_DO_STCLASS) {
830247a4 1082 cl_init(pRExC_state, &this_class);
653099ff
GS
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1087 }
e1901655
IZ
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
b81d288d 1091
e1901655
IZ
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1096
c277df42 1097 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 mincount == 0
653099ff
GS
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1106 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1107 }
1108 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1109 /* Switch to OR mode: cache the old value of
653099ff
GS
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1118 }
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1121 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1122 cl_and(data->start_class, &and_with);
1123 }
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1127 }
c277df42
IZ
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1129 scan = next;
84037bb0 1130 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1131 /* ? quantifier ok, except for (?{ ... }) */
1132 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1133 && (minnext == 0) && (deltanext == 0)
99799961 1134 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1135 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1136 {
830247a4 1137 vWARN(RExC_parse,
b45f050a
JF
1138 "Quantifier unexpected on zero-length expression");
1139 }
1140
c277df42 1141 min += minnext * mincount;
b81d288d 1142 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
aca2d497 1145 is_inf |= is_inf_internal;
c277df42
IZ
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1149 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1155 regnode *nxt1 = nxt;
1156#ifdef DEBUGGING
1157 regnode *nxt2;
1158#endif
c277df42
IZ
1159
1160 /* Skip open. */
1161 nxt = regnext(nxt);
4d61ec05 1162 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1164 && STR_LEN(nxt) == 1))
c277df42 1165 goto nogo;
497b47a8 1166#ifdef DEBUGGING
c277df42 1167 nxt2 = nxt;
497b47a8 1168#endif
c277df42 1169 nxt = regnext(nxt);
b81d288d 1170 if (OP(nxt) != CLOSE)
c277df42
IZ
1171 goto nogo;
1172 /* Now we know that nxt2 is the only contents: */
eb160463 1173 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1174 OP(oscan) = CURLYN;
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1176#ifdef DEBUGGING
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1183#endif
c277df42 1184 }
c277df42
IZ
1185 nogo:
1186
1187 /* Try optimization CURLYX => CURLYM. */
b81d288d 1188 if ( OP(oscan) == CURLYX && data
c277df42 1189 && !(data->flags & SF_HAS_PAR)
c277df42 1190 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
1191 && !deltanext /* atom is fixed width */
1192 && minnext != 0 /* CURLYM can't handle zero width */
1193 ) {
c277df42
IZ
1194 /* XXXX How to optimize if data == 0? */
1195 /* Optimize to a simpler form. */
1196 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1197 regnode *nxt2;
1198
1199 OP(oscan) = CURLYM;
1200 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1201 && (OP(nxt2) != WHILEM))
c277df42
IZ
1202 nxt = nxt2;
1203 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1204 /* Need to optimize away parenths. */
1205 if (data->flags & SF_IN_PAR) {
1206 /* Set the parenth number. */
1207 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1208
b81d288d 1209 if (OP(nxt) != CLOSE)
b45f050a 1210 FAIL("Panic opt close");
eb160463 1211 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1212 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1213 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1214#ifdef DEBUGGING
1215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216 OP(nxt + 1) = OPTIMIZED; /* was count. */
1217 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1219#endif
c277df42
IZ
1220#if 0
1221 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222 regnode *nnxt = regnext(nxt1);
b81d288d 1223
c277df42
IZ
1224 if (nnxt == nxt) {
1225 if (reg_off_by_arg[OP(nxt1)])
1226 ARG_SET(nxt1, nxt2 - nxt1);
1227 else if (nxt2 - nxt1 < U16_MAX)
1228 NEXT_OFF(nxt1) = nxt2 - nxt1;
1229 else
1230 OP(nxt) = NOTHING; /* Cannot beautify */
1231 }
1232 nxt1 = nnxt;
1233 }
1234#endif
1235 /* Optimize again: */
b81d288d 1236 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1237 NULL, 0);
a0ed51b3
LW
1238 }
1239 else
c277df42 1240 oscan->flags = 0;
c277df42 1241 }
e1901655
IZ
1242 else if ((OP(oscan) == CURLYX)
1243 && (flags & SCF_WHILEM_VISITED_POS)
1244 /* See the comment on a similar expression above.
1245 However, this time it not a subexpression
1246 we care about, but the expression itself. */
1247 && (maxcount == REG_INFTY)
1248 && data && ++data->whilem_c < 16) {
1249 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1250 /* Find WHILEM (as in regexec.c) */
1251 regnode *nxt = oscan + NEXT_OFF(oscan);
1252
1253 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1254 nxt += ARG(nxt);
eb160463
GS
1255 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1257 }
b81d288d 1258 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1259 pars++;
1260 if (flags & SCF_DO_SUBSTR) {
1261 SV *last_str = Nullsv;
1262 int counted = mincount != 0;
1263
1264 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1265#if defined(SPARC64_GCC_WORKAROUND)
1266 I32 b = 0;
1267 STRLEN l = 0;
1268 char *s = NULL;
1269 I32 old = 0;
1270
1271 if (pos_before >= data->last_start_min)
1272 b = pos_before;
1273 else
1274 b = data->last_start_min;
1275
1276 l = 0;
1277 s = SvPV(data->last_found, l);
1278 old = b - data->last_start_min;
1279
1280#else
b81d288d 1281 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1282 ? pos_before : data->last_start_min;
1283 STRLEN l;
1284 char *s = SvPV(data->last_found, l);
a0ed51b3 1285 I32 old = b - data->last_start_min;
5d1c421c 1286#endif
a0ed51b3
LW
1287
1288 if (UTF)
1289 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1290
a0ed51b3 1291 l -= old;
c277df42 1292 /* Get the added string: */
79cb57f6 1293 last_str = newSVpvn(s + old, l);
0e933229
IH
1294 if (UTF)
1295 SvUTF8_on(last_str);
c277df42
IZ
1296 if (deltanext == 0 && pos_before == b) {
1297 /* What was added is a constant string */
1298 if (mincount > 1) {
1299 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1300 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1301 SvPVX(last_str), l, mincount - 1);
1302 SvCUR(last_str) *= mincount;
1303 /* Add additional parts. */
b81d288d 1304 SvCUR_set(data->last_found,
c277df42
IZ
1305 SvCUR(data->last_found) - l);
1306 sv_catsv(data->last_found, last_str);
0eda9292
JH
1307 {
1308 SV * sv = data->last_found;
1309 MAGIC *mg =
1310 SvUTF8(sv) && SvMAGICAL(sv) ?
1311 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312 if (mg && mg->mg_len >= 0)
1313 mg->mg_len += CHR_SVLEN(last_str);
1314 }
c277df42
IZ
1315 data->last_end += l * (mincount - 1);
1316 }
2a8d9689
HS
1317 } else {
1318 /* start offset must point into the last copy */
1319 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1320 data->last_start_max += is_inf ? I32_MAX
1321 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1322 }
1323 }
1324 /* It is counted once already... */
1325 data->pos_min += minnext * (mincount - counted);
1326 data->pos_delta += - counted * deltanext +
1327 (minnext + deltanext) * maxcount - minnext * mincount;
1328 if (mincount != maxcount) {
653099ff
GS
1329 /* Cannot extend fixed substrings found inside
1330 the group. */
830247a4 1331 scan_commit(pRExC_state,data);
c277df42
IZ
1332 if (mincount && last_str) {
1333 sv_setsv(data->last_found, last_str);
1334 data->last_end = data->pos_min;
b81d288d 1335 data->last_start_min =
a0ed51b3 1336 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1337 data->last_start_max = is_inf
1338 ? I32_MAX
c277df42 1339 : data->pos_min + data->pos_delta
a0ed51b3 1340 - CHR_SVLEN(last_str);
c277df42
IZ
1341 }
1342 data->longest = &(data->longest_float);
1343 }
aca2d497 1344 SvREFCNT_dec(last_str);
c277df42 1345 }
405ff068 1346 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1347 data->flags |= SF_HAS_EVAL;
1348 optimize_curly_tail:
c277df42 1349 if (OP(oscan) != CURLYX) {
22c35a8c 1350 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1351 && NEXT_OFF(next))
1352 NEXT_OFF(oscan) += NEXT_OFF(next);
1353 }
c277df42 1354 continue;
653099ff 1355 default: /* REF and CLUMP only? */
c277df42 1356 if (flags & SCF_DO_SUBSTR) {
830247a4 1357 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1358 data->longest = &(data->longest_float);
1359 }
aca2d497 1360 is_inf = is_inf_internal = 1;
653099ff 1361 if (flags & SCF_DO_STCLASS_OR)
830247a4 1362 cl_anything(pRExC_state, data->start_class);
653099ff 1363 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1364 break;
1365 }
a0ed51b3 1366 }
ffc61ed2 1367 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1368 int value = 0;
653099ff 1369
c277df42 1370 if (flags & SCF_DO_SUBSTR) {
830247a4 1371 scan_commit(pRExC_state,data);
c277df42
IZ
1372 data->pos_min++;
1373 }
1374 min++;
653099ff
GS
1375 if (flags & SCF_DO_STCLASS) {
1376 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1377
1378 /* Some of the logic below assumes that switching
1379 locale on will only add false positives. */
1380 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1381 case SANY:
653099ff
GS
1382 default:
1383 do_default:
1384 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1385 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1386 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1387 break;
1388 case REG_ANY:
1389 if (OP(scan) == SANY)
1390 goto do_default;
1391 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1392 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1393 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1394 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1395 }
1396 if (flags & SCF_DO_STCLASS_AND || !value)
1397 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1398 break;
1399 case ANYOF:
1400 if (flags & SCF_DO_STCLASS_AND)
1401 cl_and(data->start_class,
1402 (struct regnode_charclass_class*)scan);
1403 else
830247a4 1404 cl_or(pRExC_state, data->start_class,
653099ff
GS
1405 (struct regnode_charclass_class*)scan);
1406 break;
1407 case ALNUM:
1408 if (flags & SCF_DO_STCLASS_AND) {
1409 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1411 for (value = 0; value < 256; value++)
1412 if (!isALNUM(value))
1413 ANYOF_BITMAP_CLEAR(data->start_class, value);
1414 }
1415 }
1416 else {
1417 if (data->start_class->flags & ANYOF_LOCALE)
1418 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1419 else {
1420 for (value = 0; value < 256; value++)
1421 if (isALNUM(value))
b81d288d 1422 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1423 }
1424 }
1425 break;
1426 case ALNUML:
1427 if (flags & SCF_DO_STCLASS_AND) {
1428 if (data->start_class->flags & ANYOF_LOCALE)
1429 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1430 }
1431 else {
1432 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1433 data->start_class->flags |= ANYOF_LOCALE;
1434 }
1435 break;
1436 case NALNUM:
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1440 for (value = 0; value < 256; value++)
1441 if (isALNUM(value))
1442 ANYOF_BITMAP_CLEAR(data->start_class, value);
1443 }
1444 }
1445 else {
1446 if (data->start_class->flags & ANYOF_LOCALE)
1447 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1448 else {
1449 for (value = 0; value < 256; value++)
1450 if (!isALNUM(value))
b81d288d 1451 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1452 }
1453 }
1454 break;
1455 case NALNUML:
1456 if (flags & SCF_DO_STCLASS_AND) {
1457 if (data->start_class->flags & ANYOF_LOCALE)
1458 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1459 }
1460 else {
1461 data->start_class->flags |= ANYOF_LOCALE;
1462 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1463 }
1464 break;
1465 case SPACE:
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1469 for (value = 0; value < 256; value++)
1470 if (!isSPACE(value))
1471 ANYOF_BITMAP_CLEAR(data->start_class, value);
1472 }
1473 }
1474 else {
1475 if (data->start_class->flags & ANYOF_LOCALE)
1476 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1477 else {
1478 for (value = 0; value < 256; value++)
1479 if (isSPACE(value))
b81d288d 1480 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1481 }
1482 }
1483 break;
1484 case SPACEL:
1485 if (flags & SCF_DO_STCLASS_AND) {
1486 if (data->start_class->flags & ANYOF_LOCALE)
1487 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1488 }
1489 else {
1490 data->start_class->flags |= ANYOF_LOCALE;
1491 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1492 }
1493 break;
1494 case NSPACE:
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1498 for (value = 0; value < 256; value++)
1499 if (isSPACE(value))
1500 ANYOF_BITMAP_CLEAR(data->start_class, value);
1501 }
1502 }
1503 else {
1504 if (data->start_class->flags & ANYOF_LOCALE)
1505 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1506 else {
1507 for (value = 0; value < 256; value++)
1508 if (!isSPACE(value))
b81d288d 1509 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1510 }
1511 }
1512 break;
1513 case NSPACEL:
1514 if (flags & SCF_DO_STCLASS_AND) {
1515 if (data->start_class->flags & ANYOF_LOCALE) {
1516 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_CLEAR(data->start_class, value);
1520 }
1521 }
1522 else {
1523 data->start_class->flags |= ANYOF_LOCALE;
1524 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1525 }
1526 break;
1527 case DIGIT:
1528 if (flags & SCF_DO_STCLASS_AND) {
1529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1530 for (value = 0; value < 256; value++)
1531 if (!isDIGIT(value))
1532 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 }
1534 else {
1535 if (data->start_class->flags & ANYOF_LOCALE)
1536 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1537 else {
1538 for (value = 0; value < 256; value++)
1539 if (isDIGIT(value))
b81d288d 1540 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1541 }
1542 }
1543 break;
1544 case NDIGIT:
1545 if (flags & SCF_DO_STCLASS_AND) {
1546 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1547 for (value = 0; value < 256; value++)
1548 if (isDIGIT(value))
1549 ANYOF_BITMAP_CLEAR(data->start_class, value);
1550 }
1551 else {
1552 if (data->start_class->flags & ANYOF_LOCALE)
1553 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1554 else {
1555 for (value = 0; value < 256; value++)
1556 if (!isDIGIT(value))
b81d288d 1557 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1558 }
1559 }
1560 break;
1561 }
1562 if (flags & SCF_DO_STCLASS_OR)
1563 cl_and(data->start_class, &and_with);
1564 flags &= ~SCF_DO_STCLASS;
1565 }
a0ed51b3 1566 }
22c35a8c 1567 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1568 data->flags |= (OP(scan) == MEOL
1569 ? SF_BEFORE_MEOL
1570 : SF_BEFORE_SEOL);
a0ed51b3 1571 }
653099ff
GS
1572 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1573 /* Lookbehind, or need to calculate parens/evals/stclass: */
1574 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1575 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1576 /* Lookahead/lookbehind */
cb434fcc 1577 I32 deltanext, minnext, fake = 0;
c277df42 1578 regnode *nscan;
653099ff
GS
1579 struct regnode_charclass_class intrnl;
1580 int f = 0;
c277df42
IZ
1581
1582 data_fake.flags = 0;
b81d288d 1583 if (data) {
2c2d71f5 1584 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1585 data_fake.last_closep = data->last_closep;
1586 }
1587 else
1588 data_fake.last_closep = &fake;
653099ff
GS
1589 if ( flags & SCF_DO_STCLASS && !scan->flags
1590 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1591 cl_init(pRExC_state, &intrnl);
653099ff 1592 data_fake.start_class = &intrnl;
e1901655 1593 f |= SCF_DO_STCLASS_AND;
653099ff 1594 }
e1901655
IZ
1595 if (flags & SCF_WHILEM_VISITED_POS)
1596 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1597 next = regnext(scan);
1598 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1599 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1600 if (scan->flags) {
1601 if (deltanext) {
9baa0206 1602 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1603 }
1604 else if (minnext > U8_MAX) {
9baa0206 1605 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1606 }
eb160463 1607 scan->flags = (U8)minnext;
c277df42
IZ
1608 }
1609 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1610 pars++;
405ff068 1611 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1612 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1613 if (data)
1614 data->whilem_c = data_fake.whilem_c;
e1901655 1615 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1616 int was = (data->start_class->flags & ANYOF_EOS);
1617
1618 cl_and(data->start_class, &intrnl);
1619 if (was)
1620 data->start_class->flags |= ANYOF_EOS;
1621 }
a0ed51b3
LW
1622 }
1623 else if (OP(scan) == OPEN) {
c277df42 1624 pars++;
a0ed51b3 1625 }
cb434fcc 1626 else if (OP(scan) == CLOSE) {
eb160463 1627 if ((I32)ARG(scan) == is_par) {
cb434fcc 1628 next = regnext(scan);
c277df42 1629
cb434fcc
IZ
1630 if ( next && (OP(next) != WHILEM) && next < last)
1631 is_par = 0; /* Disable optimization */
1632 }
1633 if (data)
1634 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1635 }
1636 else if (OP(scan) == EVAL) {
c277df42
IZ
1637 if (data)
1638 data->flags |= SF_HAS_EVAL;
1639 }
96776eda 1640 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1641 if (flags & SCF_DO_SUBSTR) {
830247a4 1642 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1643 data->longest = &(data->longest_float);
1644 }
1645 is_inf = is_inf_internal = 1;
653099ff 1646 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1647 cl_anything(pRExC_state, data->start_class);
96776eda 1648 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1649 }
c277df42
IZ
1650 /* Else: zero-length, ignore. */
1651 scan = regnext(scan);
1652 }
1653
1654 finish:
1655 *scanp = scan;
aca2d497 1656 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1657 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1658 data->pos_delta = I32_MAX - data->pos_min;
1659 if (is_par > U8_MAX)
1660 is_par = 0;
1661 if (is_par && pars==1 && data) {
1662 data->flags |= SF_IN_PAR;
1663 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1664 }
1665 else if (pars && data) {
c277df42
IZ
1666 data->flags |= SF_HAS_PAR;
1667 data->flags &= ~SF_IN_PAR;
1668 }
653099ff
GS
1669 if (flags & SCF_DO_STCLASS_OR)
1670 cl_and(data->start_class, &and_with);
c277df42
IZ
1671 return min;
1672}
1673
76e3520e 1674STATIC I32
830247a4 1675S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1676{
830247a4 1677 if (RExC_rx->data) {
b81d288d
AB
1678 Renewc(RExC_rx->data,
1679 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1680 char, struct reg_data);
830247a4
IZ
1681 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1682 RExC_rx->data->count += n;
a0ed51b3
LW
1683 }
1684 else {
830247a4 1685 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1686 char, struct reg_data);
830247a4
IZ
1687 New(1208, RExC_rx->data->what, n, U8);
1688 RExC_rx->data->count = n;
c277df42 1689 }
830247a4
IZ
1690 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1691 return RExC_rx->data->count - n;
c277df42
IZ
1692}
1693
d88dccdf 1694void
864dbfa3 1695Perl_reginitcolors(pTHX)
d88dccdf 1696{
d88dccdf
IZ
1697 int i = 0;
1698 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1699
d88dccdf
IZ
1700 if (s) {
1701 PL_colors[0] = s = savepv(s);
1702 while (++i < 6) {
1703 s = strchr(s, '\t');
1704 if (s) {
1705 *s = '\0';
1706 PL_colors[i] = ++s;
1707 }
1708 else
c712d376 1709 PL_colors[i] = s = "";
d88dccdf
IZ
1710 }
1711 } else {
b81d288d 1712 while (i < 6)
d88dccdf
IZ
1713 PL_colors[i++] = "";
1714 }
1715 PL_colorset = 1;
1716}
1717
8615cb43 1718
a687059c 1719/*
e50aee73 1720 - pregcomp - compile a regular expression into internal code
a687059c
LW
1721 *
1722 * We can't allocate space until we know how big the compiled form will be,
1723 * but we can't compile it (and thus know how big it is) until we've got a
1724 * place to put the code. So we cheat: we compile it twice, once with code
1725 * generation turned off and size counting turned on, and once "for real".
1726 * This also means that we don't allocate space until we are sure that the
1727 * thing really will compile successfully, and we never have to move the
1728 * code and thus invalidate pointers into it. (Note that it has to be in
1729 * one piece because free() must be able to free it all.) [NB: not true in perl]
1730 *
1731 * Beware that the optimization-preparation code in here knows about some
1732 * of the structure of the compiled regexp. [I'll say.]
1733 */
1734regexp *
864dbfa3 1735Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1736{
a0d0e21e 1737 register regexp *r;
c277df42 1738 regnode *scan;
c277df42 1739 regnode *first;
a0d0e21e 1740 I32 flags;
a0d0e21e
LW
1741 I32 minlen = 0;
1742 I32 sawplus = 0;
1743 I32 sawopen = 0;
2c2d71f5 1744 scan_data_t data;
830247a4
IZ
1745 RExC_state_t RExC_state;
1746 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1747
1748 if (exp == NULL)
c277df42 1749 FAIL("NULL regexp argument");
a0d0e21e 1750
a5961de5 1751 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1752
5cfc7842 1753 RExC_precomp = exp;
a5961de5
JH
1754 DEBUG_r({
1755 if (!PL_colorset) reginitcolors();
1756 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1757 PL_colors[4],PL_colors[5],PL_colors[0],
1758 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1759 });
e2509266 1760 RExC_flags = pm->op_pmflags;
830247a4 1761 RExC_sawback = 0;
bbce6d69 1762
830247a4
IZ
1763 RExC_seen = 0;
1764 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1765 RExC_seen_evals = 0;
1766 RExC_extralen = 0;
c277df42 1767
bbce6d69 1768 /* First pass: determine size, legality. */
830247a4 1769 RExC_parse = exp;
fac92740 1770 RExC_start = exp;
830247a4
IZ
1771 RExC_end = xend;
1772 RExC_naughty = 0;
1773 RExC_npar = 1;
1774 RExC_size = 0L;
1775 RExC_emit = &PL_regdummy;
1776 RExC_whilem_seen = 0;
85ddcde9
JH
1777#if 0 /* REGC() is (currently) a NOP at the first pass.
1778 * Clever compilers notice this and complain. --jhi */
830247a4 1779 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1780#endif
830247a4 1781 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1782 RExC_precomp = Nullch;
a0d0e21e
LW
1783 return(NULL);
1784 }
830247a4 1785 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1786
c277df42
IZ
1787 /* Small enough for pointer-storage convention?
1788 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1789 if (RExC_size >= 0x10000L && RExC_extralen)
1790 RExC_size += RExC_extralen;
c277df42 1791 else
830247a4
IZ
1792 RExC_extralen = 0;
1793 if (RExC_whilem_seen > 15)
1794 RExC_whilem_seen = 15;
a0d0e21e 1795
bbce6d69 1796 /* Allocate space and initialize. */
830247a4 1797 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1798 char, regexp);
a0d0e21e 1799 if (r == NULL)
b45f050a
JF
1800 FAIL("Regexp out of space");
1801
0f79a09d
GS
1802#ifdef DEBUGGING
1803 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1804 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1805#endif
c277df42 1806 r->refcnt = 1;
bbce6d69 1807 r->prelen = xend - exp;
5cfc7842 1808 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1809 r->subbeg = NULL;
ed252734
NC
1810#ifdef PERL_COPY_ON_WRITE
1811 r->saved_copy = Nullsv;
1812#endif
cf93c79d 1813 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1814 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1815
1816 r->substrs = 0; /* Useful during FAIL. */
1817 r->startp = 0; /* Useful during FAIL. */
1818 r->endp = 0; /* Useful during FAIL. */
1819
fac92740
MJD
1820 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1821 if (r->offsets) {
1822 r->offsets[0] = RExC_size;
1823 }
1824 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1825 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1826 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1827 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1828
830247a4 1829 RExC_rx = r;
bbce6d69 1830
1831 /* Second pass: emit code. */
e2509266 1832 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1833 RExC_parse = exp;
1834 RExC_end = xend;
1835 RExC_naughty = 0;
1836 RExC_npar = 1;
fac92740 1837 RExC_emit_start = r->program;
830247a4 1838 RExC_emit = r->program;
2cd61cdb 1839 /* Store the count of eval-groups for security checks: */
eb160463 1840 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1841 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1842 r->data = 0;
830247a4 1843 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1844 return(NULL);
1845
1846 /* Dig out information for optimizations. */
cf93c79d 1847 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1848 pm->op_pmflags = RExC_flags;
a0ed51b3 1849 if (UTF)
5ff6fc6d 1850 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1851 r->regstclass = NULL;
830247a4 1852 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1853 r->reganch |= ROPT_NAUGHTY;
c277df42 1854 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1855
1856 /* XXXX To minimize changes to RE engine we always allocate
1857 3-units-long substrs field. */
1858 Newz(1004, r->substrs, 1, struct reg_substr_data);
1859
2c2d71f5 1860 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1861 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1862 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1863 I32 fake;
c5254dd6 1864 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1865 struct regnode_charclass_class ch_class;
1866 int stclass_flag;
cb434fcc 1867 I32 last_close = 0;
a0d0e21e
LW
1868
1869 first = scan;
c277df42 1870 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1871 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1872 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1873 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1874 (OP(first) == PLUS) ||
1875 (OP(first) == MINMOD) ||
653099ff 1876 /* An {n,m} with n>0 */
22c35a8c 1877 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1878 if (OP(first) == PLUS)
1879 sawplus = 1;
1880 else
1881 first += regarglen[(U8)OP(first)];
1882 first = NEXTOPER(first);
a687059c
LW
1883 }
1884
a0d0e21e
LW
1885 /* Starting-point info. */
1886 again:
653099ff 1887 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1888 if (OP(first) == EXACT)
1889 ; /* Empty, get anchored substr later. */
1890 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1891 r->regstclass = first;
1892 }
653099ff 1893 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1894 r->regstclass = first;
22c35a8c
GS
1895 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1896 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1897 r->regstclass = first;
22c35a8c 1898 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1899 r->reganch |= (OP(first) == MBOL
1900 ? ROPT_ANCH_MBOL
1901 : (OP(first) == SBOL
1902 ? ROPT_ANCH_SBOL
1903 : ROPT_ANCH_BOL));
a0d0e21e 1904 first = NEXTOPER(first);
774d564b 1905 goto again;
1906 }
1907 else if (OP(first) == GPOS) {
1908 r->reganch |= ROPT_ANCH_GPOS;
1909 first = NEXTOPER(first);
1910 goto again;
a0d0e21e 1911 }
e09294f4 1912 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1913 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1914 !(r->reganch & ROPT_ANCH) )
1915 {
1916 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1917 int type = OP(NEXTOPER(first));
1918
ffc61ed2 1919 if (type == REG_ANY)
cad2e5aa
JH
1920 type = ROPT_ANCH_MBOL;
1921 else
1922 type = ROPT_ANCH_SBOL;
1923
1924 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1925 first = NEXTOPER(first);
774d564b 1926 goto again;
a0d0e21e 1927 }
b81d288d 1928 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1929 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1930 /* x+ must match at the 1st pos of run of x's */
1931 r->reganch |= ROPT_SKIP;
a0d0e21e 1932
c277df42 1933 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1934 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1935 (IV)(first - scan + 1)));
a0d0e21e
LW
1936 /*
1937 * If there's something expensive in the r.e., find the
1938 * longest literal string that must appear and make it the
1939 * regmust. Resolve ties in favor of later strings, since
1940 * the regstart check works with the beginning of the r.e.
1941 * and avoiding duplication strengthens checking. Not a
1942 * strong reason, but sufficient in the absence of others.
1943 * [Now we resolve ties in favor of the earlier string if
c277df42 1944 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1945 * earlier string may buy us something the later one won't.]
1946 */
a0d0e21e 1947 minlen = 0;
a687059c 1948
79cb57f6
GS
1949 data.longest_fixed = newSVpvn("",0);
1950 data.longest_float = newSVpvn("",0);
1951 data.last_found = newSVpvn("",0);
c277df42
IZ
1952 data.longest = &(data.longest_fixed);
1953 first = scan;
653099ff 1954 if (!r->regstclass) {
830247a4 1955 cl_init(pRExC_state, &ch_class);
653099ff
GS
1956 data.start_class = &ch_class;
1957 stclass_flag = SCF_DO_STCLASS_AND;
1958 } else /* XXXX Check for BOUND? */
1959 stclass_flag = 0;
cb434fcc 1960 data.last_closep = &last_close;
653099ff 1961
830247a4 1962 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1963 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1964 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1965 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1966 && !RExC_seen_zerolen
1967 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1968 r->reganch |= ROPT_CHECK_ALL;
830247a4 1969 scan_commit(pRExC_state, &data);
c277df42
IZ
1970 SvREFCNT_dec(data.last_found);
1971
a0ed51b3 1972 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1973 if (longest_float_length
c277df42
IZ
1974 || (data.flags & SF_FL_BEFORE_EOL
1975 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1976 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1977 int t;
1978
a0ed51b3 1979 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1980 && data.offset_fixed == data.offset_float_min
1981 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1982 goto remove_float; /* As in (a)+. */
1983
33b8afdf
JH
1984 if (SvUTF8(data.longest_float)) {
1985 r->float_utf8 = data.longest_float;
1986 r->float_substr = Nullsv;
1987 } else {
1988 r->float_substr = data.longest_float;
1989 r->float_utf8 = Nullsv;
1990 }
c277df42
IZ
1991 r->float_min_offset = data.offset_float_min;
1992 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1993 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1994 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1995 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1996 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1997 }
1998 else {
aca2d497 1999 remove_float:
33b8afdf 2000 r->float_substr = r->float_utf8 = Nullsv;
c277df42 2001 SvREFCNT_dec(data.longest_float);
c5254dd6 2002 longest_float_length = 0;
a0d0e21e 2003 }
c277df42 2004
a0ed51b3 2005 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2006 if (longest_fixed_length
c277df42
IZ
2007 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2008 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2009 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2010 int t;
2011
33b8afdf
JH
2012 if (SvUTF8(data.longest_fixed)) {
2013 r->anchored_utf8 = data.longest_fixed;
2014 r->anchored_substr = Nullsv;
2015 } else {
2016 r->anchored_substr = data.longest_fixed;
2017 r->anchored_utf8 = Nullsv;
2018 }
c277df42 2019 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2020 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2021 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2022 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2023 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2024 }
2025 else {
33b8afdf 2026 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2027 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2028 longest_fixed_length = 0;
a0d0e21e 2029 }
b81d288d 2030 if (r->regstclass
ffc61ed2 2031 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2032 r->regstclass = NULL;
33b8afdf
JH
2033 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2034 && stclass_flag
653099ff 2035 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2036 && !cl_is_anything(data.start_class))
2037 {
830247a4 2038 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2039
b81d288d 2040 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2041 struct regnode_charclass_class);
2042 StructCopy(data.start_class,
830247a4 2043 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2044 struct regnode_charclass_class);
830247a4 2045 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2046 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2047 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2048 DEBUG_r({ SV *sv = sv_newmortal();
2049 regprop(sv, (regnode*)data.start_class);
2050 PerlIO_printf(Perl_debug_log,
2051 "synthetic stclass `%s'.\n",
2052 SvPVX(sv));});
653099ff 2053 }
c277df42
IZ
2054
2055 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2056 if (longest_fixed_length > longest_float_length) {
c277df42 2057 r->check_substr = r->anchored_substr;
33b8afdf 2058 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2059 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2060 if (r->reganch & ROPT_ANCH_SINGLE)
2061 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2062 }
2063 else {
c277df42 2064 r->check_substr = r->float_substr;
33b8afdf 2065 r->check_utf8 = r->float_utf8;
c277df42
IZ
2066 r->check_offset_min = data.offset_float_min;
2067 r->check_offset_max = data.offset_float_max;
a0d0e21e 2068 }
30382c73
IZ
2069 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2070 This should be changed ASAP! */
33b8afdf 2071 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2072 r->reganch |= RE_USE_INTUIT;
33b8afdf 2073 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2074 r->reganch |= RE_INTUIT_TAIL;
2075 }
a0ed51b3
LW
2076 }
2077 else {
c277df42
IZ
2078 /* Several toplevels. Best we can is to set minlen. */
2079 I32 fake;
653099ff 2080 struct regnode_charclass_class ch_class;
cb434fcc 2081 I32 last_close = 0;
c277df42
IZ
2082
2083 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2084 scan = r->program + 1;
830247a4 2085 cl_init(pRExC_state, &ch_class);
653099ff 2086 data.start_class = &ch_class;
cb434fcc 2087 data.last_closep = &last_close;
e1901655 2088 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2089 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2090 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2091 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2092 && !cl_is_anything(data.start_class))
2093 {
830247a4 2094 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2095
b81d288d 2096 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2097 struct regnode_charclass_class);
2098 StructCopy(data.start_class,
830247a4 2099 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2100 struct regnode_charclass_class);
830247a4 2101 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2102 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2103 DEBUG_r({ SV* sv = sv_newmortal();
2104 regprop(sv, (regnode*)data.start_class);
2105 PerlIO_printf(Perl_debug_log,
2106 "synthetic stclass `%s'.\n",
2107 SvPVX(sv));});
653099ff 2108 }
a0d0e21e
LW
2109 }
2110
a0d0e21e 2111 r->minlen = minlen;
b81d288d 2112 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2113 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2114 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2115 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2116 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2117 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2118 if (RExC_seen & REG_SEEN_CANY)
2119 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2120 Newz(1002, r->startp, RExC_npar, I32);
2121 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2122 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2123 DEBUG_r(regdump(r));
2124 return(r);
a687059c
LW
2125}
2126
2127/*
2128 - reg - regular expression, i.e. main body or parenthesized thing
2129 *
2130 * Caller must absorb opening parenthesis.
2131 *
2132 * Combining parenthesis handling with the base level of regular expression
2133 * is a trifle forced, but the need to tie the tails of the branches to what
2134 * follows makes it hard to avoid.
2135 */
76e3520e 2136STATIC regnode *
830247a4 2137S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2138 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2139{
c277df42
IZ
2140 register regnode *ret; /* Will be the head of the group. */
2141 register regnode *br;
2142 register regnode *lastbr;
2143 register regnode *ender = 0;
a0d0e21e 2144 register I32 parno = 0;
e2509266 2145 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2146
2147 /* for (?g), (?gc), and (?o) warnings; warning
2148 about (?c) will warn about (?g) -- japhy */
2149
2150 I32 wastedflags = 0x00,
2151 wasted_o = 0x01,
2152 wasted_g = 0x02,
2153 wasted_gc = 0x02 | 0x04,
2154 wasted_c = 0x04;
2155
fac92740 2156 char * parse_start = RExC_parse; /* MJD */
830247a4 2157 char *oregcomp_parse = RExC_parse;
c277df42 2158 char c;
a0d0e21e 2159
821b33a5 2160 *flagp = 0; /* Tentatively. */
a0d0e21e 2161
9d1d55b5 2162
a0d0e21e
LW
2163 /* Make an OPEN node, if parenthesized. */
2164 if (paren) {
fac92740 2165 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2166 U32 posflags = 0, negflags = 0;
2167 U32 *flagsp = &posflags;
0f5d15d6 2168 int logical = 0;
830247a4 2169 char *seqstart = RExC_parse;
ca9dfc88 2170
830247a4
IZ
2171 RExC_parse++;
2172 paren = *RExC_parse++;
c277df42 2173 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2174 switch (paren) {
fac92740 2175 case '<': /* (?<...) */
830247a4 2176 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2177 if (*RExC_parse == '!')
c277df42 2178 paren = ',';
b81d288d 2179 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2180 goto unknown;
830247a4 2181 RExC_parse++;
fac92740
MJD
2182 case '=': /* (?=...) */
2183 case '!': /* (?!...) */
830247a4 2184 RExC_seen_zerolen++;
fac92740
MJD
2185 case ':': /* (?:...) */
2186 case '>': /* (?>...) */
a0d0e21e 2187 break;
fac92740
MJD
2188 case '$': /* (?$...) */
2189 case '@': /* (?@...) */
8615cb43 2190 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2191 break;
fac92740 2192 case '#': /* (?#...) */
830247a4
IZ
2193 while (*RExC_parse && *RExC_parse != ')')
2194 RExC_parse++;
2195 if (*RExC_parse != ')')
c277df42 2196 FAIL("Sequence (?#... not terminated");
830247a4 2197 nextchar(pRExC_state);
a0d0e21e
LW
2198 *flagp = TRYAGAIN;
2199 return NULL;
fac92740 2200 case 'p': /* (?p...) */
9014280d 2201 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2202 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2203 /* FALL THROUGH*/
fac92740 2204 case '?': /* (??...) */
0f5d15d6 2205 logical = 1;
438a3801
YST
2206 if (*RExC_parse != '{')
2207 goto unknown;
830247a4 2208 paren = *RExC_parse++;
0f5d15d6 2209 /* FALL THROUGH */
fac92740 2210 case '{': /* (?{...}) */
c277df42 2211 {
c277df42
IZ
2212 I32 count = 1, n = 0;
2213 char c;
830247a4 2214 char *s = RExC_parse;
c277df42
IZ
2215 SV *sv;
2216 OP_4tree *sop, *rop;
2217
830247a4
IZ
2218 RExC_seen_zerolen++;
2219 RExC_seen |= REG_SEEN_EVAL;
2220 while (count && (c = *RExC_parse)) {
2221 if (c == '\\' && RExC_parse[1])
2222 RExC_parse++;
b81d288d 2223 else if (c == '{')
c277df42 2224 count++;
b81d288d 2225 else if (c == '}')
c277df42 2226 count--;
830247a4 2227 RExC_parse++;
c277df42 2228 }
830247a4 2229 if (*RExC_parse != ')')
b45f050a 2230 {
b81d288d 2231 RExC_parse = s;
b45f050a
JF
2232 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2233 }
c277df42 2234 if (!SIZE_ONLY) {
f3548bdc 2235 PAD *pad;
b81d288d
AB
2236
2237 if (RExC_parse - 1 - s)
830247a4 2238 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2239 else
79cb57f6 2240 sv = newSVpvn("", 0);
c277df42 2241
569233ed
SB
2242 ENTER;
2243 Perl_save_re_context(aTHX);
f3548bdc 2244 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2245 sop->op_private |= OPpREFCOUNTED;
2246 /* re_dup will OpREFCNT_inc */
2247 OpREFCNT_set(sop, 1);
569233ed 2248 LEAVE;
c277df42 2249
830247a4
IZ
2250 n = add_data(pRExC_state, 3, "nop");
2251 RExC_rx->data->data[n] = (void*)rop;
2252 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2253 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2254 SvREFCNT_dec(sv);
a0ed51b3 2255 }
e24b16f9 2256 else { /* First pass */
830247a4 2257 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 2258 && IN_PERL_RUNTIME)
2cd61cdb
IZ
2259 /* No compiled RE interpolated, has runtime
2260 components ===> unsafe. */
2261 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2262 if (PL_tainting && PL_tainted)
cc6b7395 2263 FAIL("Eval-group in insecure regular expression");
923e4eb5 2264 if (IN_PERL_COMPILETIME)
b5c19bd7 2265 PL_cv_has_eval = 1;
c277df42 2266 }
b5c19bd7 2267
830247a4 2268 nextchar(pRExC_state);
0f5d15d6 2269 if (logical) {
830247a4 2270 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2271 if (!SIZE_ONLY)
2272 ret->flags = 2;
830247a4 2273 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2274 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2275 return ret;
2276 }
ccb2c380
MP
2277 ret = reganode(pRExC_state, EVAL, n);
2278 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2279 Set_Node_Offset(ret, parse_start);
2280 return ret;
c277df42 2281 }
fac92740 2282 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2283 {
fac92740 2284 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2285 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2286 || RExC_parse[1] == '<'
830247a4 2287 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2288 I32 flag;
2289
830247a4 2290 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2291 if (!SIZE_ONLY)
2292 ret->flags = 1;
830247a4 2293 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2294 goto insert_if;
b81d288d 2295 }
a0ed51b3 2296 }
830247a4 2297 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2298 /* (?(1)...) */
830247a4 2299 parno = atoi(RExC_parse++);
c277df42 2300
830247a4
IZ
2301 while (isDIGIT(*RExC_parse))
2302 RExC_parse++;
fac92740
MJD
2303 ret = reganode(pRExC_state, GROUPP, parno);
2304
830247a4 2305 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2306 vFAIL("Switch condition not recognized");
c277df42 2307 insert_if:
830247a4
IZ
2308 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2309 br = regbranch(pRExC_state, &flags, 1);
c277df42 2310 if (br == NULL)
830247a4 2311 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2312 else
830247a4
IZ
2313 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2314 c = *nextchar(pRExC_state);
d1b80229
IZ
2315 if (flags&HASWIDTH)
2316 *flagp |= HASWIDTH;
c277df42 2317 if (c == '|') {
830247a4
IZ
2318 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2319 regbranch(pRExC_state, &flags, 1);
2320 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2321 if (flags&HASWIDTH)
2322 *flagp |= HASWIDTH;
830247a4 2323 c = *nextchar(pRExC_state);
a0ed51b3
LW
2324 }
2325 else
c277df42
IZ
2326 lastbr = NULL;
2327 if (c != ')')
8615cb43 2328 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2329 ender = reg_node(pRExC_state, TAIL);
2330 regtail(pRExC_state, br, ender);
c277df42 2331 if (lastbr) {
830247a4
IZ
2332 regtail(pRExC_state, lastbr, ender);
2333 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2334 }
2335 else
830247a4 2336 regtail(pRExC_state, ret, ender);
c277df42 2337 return ret;
a0ed51b3
LW
2338 }
2339 else {
830247a4 2340 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2341 }
2342 }
1b1626e4 2343 case 0:
830247a4 2344 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2345 vFAIL("Sequence (? incomplete");
1b1626e4 2346 break;
a0d0e21e 2347 default:
830247a4 2348 --RExC_parse;
fac92740 2349 parse_flags: /* (?i) */
830247a4 2350 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2351 /* (?g), (?gc) and (?o) are useless here
2352 and must be globally applied -- japhy */
2353
2354 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2355 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2356 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2357 if (! (wastedflags & wflagbit) ) {
2358 wastedflags |= wflagbit;
2359 vWARN5(
2360 RExC_parse + 1,
2361 "Useless (%s%c) - %suse /%c modifier",
2362 flagsp == &negflags ? "?-" : "?",
2363 *RExC_parse,
2364 flagsp == &negflags ? "don't " : "",
2365 *RExC_parse
2366 );
2367 }
2368 }
2369 }
2370 else if (*RExC_parse == 'c') {
2371 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2372 if (! (wastedflags & wasted_c) ) {
2373 wastedflags |= wasted_gc;
2374 vWARN3(
2375 RExC_parse + 1,
2376 "Useless (%sc) - %suse /gc modifier",
2377 flagsp == &negflags ? "?-" : "?",
2378 flagsp == &negflags ? "don't " : ""
2379 );
2380 }
2381 }
2382 }
2383 else { pmflag(flagsp, *RExC_parse); }
2384
830247a4 2385 ++RExC_parse;
ca9dfc88 2386 }
830247a4 2387 if (*RExC_parse == '-') {
ca9dfc88 2388 flagsp = &negflags;
9d1d55b5 2389 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2390 ++RExC_parse;
ca9dfc88 2391 goto parse_flags;
48c036b1 2392 }
e2509266
JH
2393 RExC_flags |= posflags;
2394 RExC_flags &= ~negflags;
830247a4
IZ
2395 if (*RExC_parse == ':') {
2396 RExC_parse++;
ca9dfc88
IZ
2397 paren = ':';
2398 break;
2399 }
c277df42 2400 unknown:
830247a4
IZ
2401 if (*RExC_parse != ')') {
2402 RExC_parse++;
2403 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2404 }
830247a4 2405 nextchar(pRExC_state);
a0d0e21e
LW
2406 *flagp = TRYAGAIN;
2407 return NULL;
2408 }
2409 }
fac92740 2410 else { /* (...) */
830247a4
IZ
2411 parno = RExC_npar;
2412 RExC_npar++;
2413 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2414 Set_Node_Length(ret, 1); /* MJD */
2415 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2416 open = 1;
a0d0e21e 2417 }
a0ed51b3 2418 }
fac92740 2419 else /* ! paren */
a0d0e21e
LW
2420 ret = NULL;
2421
2422 /* Pick up the branches, linking them together. */
fac92740 2423 parse_start = RExC_parse; /* MJD */
830247a4 2424 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2425 /* branch_len = (paren != 0); */
2426
a0d0e21e
LW
2427 if (br == NULL)
2428 return(NULL);
830247a4
IZ
2429 if (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2432 }
fac92740 2433 else { /* MJD */
830247a4 2434 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2435 Set_Node_Length(br, paren != 0);
2436 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2437 }
c277df42
IZ
2438 have_branch = 1;
2439 if (SIZE_ONLY)
830247a4 2440 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2441 }
2442 else if (paren == ':') {
c277df42
IZ
2443 *flagp |= flags&SIMPLE;
2444 }
2445 if (open) { /* Starts with OPEN. */
830247a4 2446 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2447 }
2448 else if (paren != '?') /* Not Conditional */
a0d0e21e 2449 ret = br;
32a0ca98 2450 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2451 lastbr = br;
830247a4
IZ
2452 while (*RExC_parse == '|') {
2453 if (!SIZE_ONLY && RExC_extralen) {
2454 ender = reganode(pRExC_state, LONGJMP,0);
2455 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2456 }
2457 if (SIZE_ONLY)
830247a4
IZ
2458 RExC_extralen += 2; /* Account for LONGJMP. */
2459 nextchar(pRExC_state);
2460 br = regbranch(pRExC_state, &flags, 0);
fac92740 2461
a687059c 2462 if (br == NULL)
a0d0e21e 2463 return(NULL);
830247a4 2464 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2465 lastbr = br;
821b33a5
IZ
2466 if (flags&HASWIDTH)
2467 *flagp |= HASWIDTH;
a687059c 2468 *flagp |= flags&SPSTART;
a0d0e21e
LW
2469 }
2470
c277df42
IZ
2471 if (have_branch || paren != ':') {
2472 /* Make a closing node, and hook it on the end. */
2473 switch (paren) {
2474 case ':':
830247a4 2475 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2476 break;
2477 case 1:
830247a4 2478 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2479 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2480 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2481 break;
2482 case '<':
c277df42
IZ
2483 case ',':
2484 case '=':
2485 case '!':
c277df42 2486 *flagp &= ~HASWIDTH;
821b33a5
IZ
2487 /* FALL THROUGH */
2488 case '>':
830247a4 2489 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2490 break;
2491 case 0:
830247a4 2492 ender = reg_node(pRExC_state, END);
c277df42
IZ
2493 break;
2494 }
830247a4 2495 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2496
c277df42
IZ
2497 if (have_branch) {
2498 /* Hook the tails of the branches to the closing node. */
2499 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2500 regoptail(pRExC_state, br, ender);
c277df42
IZ
2501 }
2502 }
a0d0e21e 2503 }
c277df42
IZ
2504
2505 {
2506 char *p;
2507 static char parens[] = "=!<,>";
2508
2509 if (paren && (p = strchr(parens, paren))) {
eb160463 2510 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2511 int flag = (p - parens) > 1;
2512
2513 if (paren == '>')
2514 node = SUSPEND, flag = 0;
830247a4 2515 reginsert(pRExC_state, node,ret);
45948336
EP
2516 Set_Node_Cur_Length(ret);
2517 Set_Node_Offset(ret, parse_start + 1);
c277df42 2518 ret->flags = flag;
830247a4 2519 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2520 }
a0d0e21e
LW
2521 }
2522
2523 /* Check for proper termination. */
ce3e6498 2524 if (paren) {
e2509266 2525 RExC_flags = oregflags;
830247a4
IZ
2526 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2527 RExC_parse = oregcomp_parse;
380a0633 2528 vFAIL("Unmatched (");
ce3e6498 2529 }
a0ed51b3 2530 }
830247a4
IZ
2531 else if (!paren && RExC_parse < RExC_end) {
2532 if (*RExC_parse == ')') {
2533 RExC_parse++;
380a0633 2534 vFAIL("Unmatched )");
a0ed51b3
LW
2535 }
2536 else
b45f050a 2537 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2538 /* NOTREACHED */
2539 }
a687059c 2540
a0d0e21e 2541 return(ret);
a687059c
LW
2542}
2543
2544/*
2545 - regbranch - one alternative of an | operator
2546 *
2547 * Implements the concatenation operator.
2548 */
76e3520e 2549STATIC regnode *
830247a4 2550S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2551{
c277df42
IZ
2552 register regnode *ret;
2553 register regnode *chain = NULL;
2554 register regnode *latest;
2555 I32 flags = 0, c = 0;
a0d0e21e 2556
b81d288d 2557 if (first)
c277df42
IZ
2558 ret = NULL;
2559 else {
b81d288d 2560 if (!SIZE_ONLY && RExC_extralen)
830247a4 2561 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2562 else {
830247a4 2563 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2564 Set_Node_Length(ret, 1);
2565 }
c277df42
IZ
2566 }
2567
b81d288d 2568 if (!first && SIZE_ONLY)
830247a4 2569 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2570
c277df42 2571 *flagp = WORST; /* Tentatively. */
a0d0e21e 2572
830247a4
IZ
2573 RExC_parse--;
2574 nextchar(pRExC_state);
2575 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2576 flags &= ~TRYAGAIN;
830247a4 2577 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2578 if (latest == NULL) {
2579 if (flags & TRYAGAIN)
2580 continue;
2581 return(NULL);
a0ed51b3
LW
2582 }
2583 else if (ret == NULL)
c277df42 2584 ret = latest;
a0d0e21e 2585 *flagp |= flags&HASWIDTH;
c277df42 2586 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2587 *flagp |= flags&SPSTART;
2588 else {
830247a4
IZ
2589 RExC_naughty++;
2590 regtail(pRExC_state, chain, latest);
a687059c 2591 }
a0d0e21e 2592 chain = latest;
c277df42
IZ
2593 c++;
2594 }
2595 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2596 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2597 if (ret == NULL)
2598 ret = chain;
2599 }
2600 if (c == 1) {
2601 *flagp |= flags&SIMPLE;
a0d0e21e 2602 }
a687059c 2603
a0d0e21e 2604 return(ret);
a687059c
LW
2605}
2606
2607/*
2608 - regpiece - something followed by possible [*+?]
2609 *
2610 * Note that the branching code sequences used for ? and the general cases
2611 * of * and + are somewhat optimized: they use the same NOTHING node as
2612 * both the endmarker for their branch list and the body of the last branch.
2613 * It might seem that this node could be dispensed with entirely, but the
2614 * endmarker role is not redundant.
2615 */
76e3520e 2616STATIC regnode *
830247a4 2617S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2618{
c277df42 2619 register regnode *ret;
a0d0e21e
LW
2620 register char op;
2621 register char *next;
2622 I32 flags;
830247a4 2623 char *origparse = RExC_parse;
a0d0e21e
LW
2624 char *maxpos;
2625 I32 min;
c277df42 2626 I32 max = REG_INFTY;
fac92740 2627 char *parse_start;
a0d0e21e 2628
830247a4 2629 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2630 if (ret == NULL) {
2631 if (flags & TRYAGAIN)
2632 *flagp |= TRYAGAIN;
2633 return(NULL);
2634 }
2635
830247a4 2636 op = *RExC_parse;
a0d0e21e 2637
830247a4 2638 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2639 parse_start = RExC_parse; /* MJD */
830247a4 2640 next = RExC_parse + 1;
a0d0e21e
LW
2641 maxpos = Nullch;
2642 while (isDIGIT(*next) || *next == ',') {
2643 if (*next == ',') {
2644 if (maxpos)
2645 break;
2646 else
2647 maxpos = next;
a687059c 2648 }
a0d0e21e
LW
2649 next++;
2650 }
2651 if (*next == '}') { /* got one */
2652 if (!maxpos)
2653 maxpos = next;
830247a4
IZ
2654 RExC_parse++;
2655 min = atoi(RExC_parse);
a0d0e21e
LW
2656 if (*maxpos == ',')
2657 maxpos++;
2658 else
830247a4 2659 maxpos = RExC_parse;
a0d0e21e
LW
2660 max = atoi(maxpos);
2661 if (!max && *maxpos != '0')
c277df42
IZ
2662 max = REG_INFTY; /* meaning "infinity" */
2663 else if (max >= REG_INFTY)
8615cb43 2664 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2665 RExC_parse = next;
2666 nextchar(pRExC_state);
a0d0e21e
LW
2667
2668 do_curly:
2669 if ((flags&SIMPLE)) {
830247a4
IZ
2670 RExC_naughty += 2 + RExC_naughty / 2;
2671 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2672 Set_Node_Offset(ret, parse_start+1); /* MJD */
2673 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2674 }
2675 else {
830247a4 2676 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2677
2678 w->flags = 0;
830247a4
IZ
2679 regtail(pRExC_state, ret, w);
2680 if (!SIZE_ONLY && RExC_extralen) {
2681 reginsert(pRExC_state, LONGJMP,ret);
2682 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2683 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2684 }
830247a4 2685 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2686 /* MJD hk */
2687 Set_Node_Offset(ret, parse_start+1);
2688 Set_Node_Length(ret,
2689 op == '{' ? (RExC_parse - parse_start) : 1);
2690
830247a4 2691 if (!SIZE_ONLY && RExC_extralen)
c277df42 2692 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2693 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2694 if (SIZE_ONLY)
830247a4
IZ
2695 RExC_whilem_seen++, RExC_extralen += 3;
2696 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2697 }
c277df42 2698 ret->flags = 0;
a0d0e21e
LW
2699
2700 if (min > 0)
821b33a5
IZ
2701 *flagp = WORST;
2702 if (max > 0)
2703 *flagp |= HASWIDTH;
a0d0e21e 2704 if (max && max < min)
8615cb43 2705 vFAIL("Can't do {n,m} with n > m");
c277df42 2706 if (!SIZE_ONLY) {
eb160463
GS
2707 ARG1_SET(ret, (U16)min);
2708 ARG2_SET(ret, (U16)max);
a687059c 2709 }
a687059c 2710
a0d0e21e 2711 goto nest_check;
a687059c 2712 }
a0d0e21e 2713 }
a687059c 2714
a0d0e21e
LW
2715 if (!ISMULT1(op)) {
2716 *flagp = flags;
a687059c 2717 return(ret);
a0d0e21e 2718 }
bb20fd44 2719
c277df42 2720#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2721
2722 /* if this is reinstated, don't forget to put this back into perldiag:
2723
2724 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2725
2726 (F) The part of the regexp subject to either the * or + quantifier
2727 could match an empty string. The {#} shows in the regular
2728 expression about where the problem was discovered.
2729
2730 */
2731
bb20fd44 2732 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2733 vFAIL("Regexp *+ operand could be empty");
b81d288d 2734#endif
bb20fd44 2735
fac92740 2736 parse_start = RExC_parse;
830247a4 2737 nextchar(pRExC_state);
a0d0e21e 2738
821b33a5 2739 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2740
2741 if (op == '*' && (flags&SIMPLE)) {
830247a4 2742 reginsert(pRExC_state, STAR, ret);
c277df42 2743 ret->flags = 0;
830247a4 2744 RExC_naughty += 4;
a0d0e21e
LW
2745 }
2746 else if (op == '*') {
2747 min = 0;
2748 goto do_curly;
a0ed51b3
LW
2749 }
2750 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2751 reginsert(pRExC_state, PLUS, ret);
c277df42 2752 ret->flags = 0;
830247a4 2753 RExC_naughty += 3;
a0d0e21e
LW
2754 }
2755 else if (op == '+') {
2756 min = 1;
2757 goto do_curly;
a0ed51b3
LW
2758 }
2759 else if (op == '?') {
a0d0e21e
LW
2760 min = 0; max = 1;
2761 goto do_curly;
2762 }
2763 nest_check:
e476b1b5 2764 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2765 vWARN3(RExC_parse,
b45f050a 2766 "%.*s matches null string many times",
830247a4 2767 RExC_parse - origparse,
b45f050a 2768 origparse);
a0d0e21e
LW
2769 }
2770
830247a4
IZ
2771 if (*RExC_parse == '?') {
2772 nextchar(pRExC_state);
2773 reginsert(pRExC_state, MINMOD, ret);
2774 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2775 }
830247a4
IZ
2776 if (ISMULT2(RExC_parse)) {
2777 RExC_parse++;
b45f050a
JF
2778 vFAIL("Nested quantifiers");
2779 }
a0d0e21e
LW
2780
2781 return(ret);
a687059c
LW
2782}
2783
2784/*
2785 - regatom - the lowest level
2786 *
2787 * Optimization: gobbles an entire sequence of ordinary characters so that
2788 * it can turn them into a single node, which is smaller to store and
2789 * faster to run. Backslashed characters are exceptions, each becoming a
2790 * separate node; the code is simpler that way and it's not worth fixing.
2791 *
b45f050a 2792 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2793STATIC regnode *
830247a4 2794S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2795{
c277df42 2796 register regnode *ret = 0;
a0d0e21e 2797 I32 flags;
45948336 2798 char *parse_start = RExC_parse;
a0d0e21e
LW
2799
2800 *flagp = WORST; /* Tentatively. */
2801
2802tryagain:
830247a4 2803 switch (*RExC_parse) {
a0d0e21e 2804 case '^':
830247a4
IZ
2805 RExC_seen_zerolen++;
2806 nextchar(pRExC_state);
e2509266 2807 if (RExC_flags & PMf_MULTILINE)
830247a4 2808 ret = reg_node(pRExC_state, MBOL);
e2509266 2809 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2810 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2811 else
830247a4 2812 ret = reg_node(pRExC_state, BOL);
fac92740 2813 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2814 break;
2815 case '$':
830247a4 2816 nextchar(pRExC_state);
b81d288d 2817 if (*RExC_parse)
830247a4 2818 RExC_seen_zerolen++;
e2509266 2819 if (RExC_flags & PMf_MULTILINE)
830247a4 2820 ret = reg_node(pRExC_state, MEOL);
e2509266 2821 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2822 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2823 else
830247a4 2824 ret = reg_node(pRExC_state, EOL);
fac92740 2825 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2826 break;
2827 case '.':
830247a4 2828 nextchar(pRExC_state);
e2509266 2829 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2830 ret = reg_node(pRExC_state, SANY);
2831 else
2832 ret = reg_node(pRExC_state, REG_ANY);
2833 *flagp |= HASWIDTH|SIMPLE;
830247a4 2834 RExC_naughty++;
fac92740 2835 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2836 break;
2837 case '[':
b45f050a 2838 {
830247a4 2839 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2840 ret = regclass(pRExC_state);
830247a4
IZ
2841 if (*RExC_parse != ']') {
2842 RExC_parse = oregcomp_parse;
b45f050a
JF
2843 vFAIL("Unmatched [");
2844 }
830247a4 2845 nextchar(pRExC_state);
a0d0e21e 2846 *flagp |= HASWIDTH|SIMPLE;
fac92740 2847 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2848 break;
b45f050a 2849 }
a0d0e21e 2850 case '(':
830247a4
IZ
2851 nextchar(pRExC_state);
2852 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2853 if (ret == NULL) {
bf93d4cc 2854 if (flags & TRYAGAIN) {
830247a4 2855 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2856 /* Make parent create an empty node if needed. */
2857 *flagp |= TRYAGAIN;
2858 return(NULL);
2859 }
a0d0e21e 2860 goto tryagain;
bf93d4cc 2861 }
a0d0e21e
LW
2862 return(NULL);
2863 }
c277df42 2864 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2865 break;
2866 case '|':
2867 case ')':
2868 if (flags & TRYAGAIN) {
2869 *flagp |= TRYAGAIN;
2870 return NULL;
2871 }
b45f050a 2872 vFAIL("Internal urp");
a0d0e21e
LW
2873 /* Supposed to be caught earlier. */
2874 break;
85afd4ae 2875 case '{':
830247a4
IZ
2876 if (!regcurly(RExC_parse)) {
2877 RExC_parse++;
85afd4ae
CS
2878 goto defchar;
2879 }
2880 /* FALL THROUGH */
a0d0e21e
LW
2881 case '?':
2882 case '+':
2883 case '*':
830247a4 2884 RExC_parse++;
b45f050a 2885 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2886 break;
2887 case '\\':
830247a4 2888 switch (*++RExC_parse) {
a0d0e21e 2889 case 'A':
830247a4
IZ
2890 RExC_seen_zerolen++;
2891 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2892 *flagp |= SIMPLE;
830247a4 2893 nextchar(pRExC_state);
fac92740 2894 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2895 break;
2896 case 'G':
830247a4
IZ
2897 ret = reg_node(pRExC_state, GPOS);
2898 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2899 *flagp |= SIMPLE;
830247a4 2900 nextchar(pRExC_state);
fac92740 2901 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2902 break;
2903 case 'Z':
830247a4 2904 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2905 *flagp |= SIMPLE;
a1917ab9 2906 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2907 nextchar(pRExC_state);
a0d0e21e 2908 break;
b85d18e9 2909 case 'z':
830247a4 2910 ret = reg_node(pRExC_state, EOS);
b85d18e9 2911 *flagp |= SIMPLE;
830247a4
IZ
2912 RExC_seen_zerolen++; /* Do not optimize RE away */
2913 nextchar(pRExC_state);
fac92740 2914 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2915 break;
4a2d328f 2916 case 'C':
f33976b4
DB
2917 ret = reg_node(pRExC_state, CANY);
2918 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2919 *flagp |= HASWIDTH|SIMPLE;
830247a4 2920 nextchar(pRExC_state);
fac92740 2921 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2922 break;
2923 case 'X':
830247a4 2924 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2925 *flagp |= HASWIDTH;
830247a4 2926 nextchar(pRExC_state);
fac92740 2927 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2928 break;
a0d0e21e 2929 case 'w':
eb160463 2930 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2931 *flagp |= HASWIDTH|SIMPLE;
830247a4 2932 nextchar(pRExC_state);
fac92740 2933 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2934 break;
2935 case 'W':
eb160463 2936 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2937 *flagp |= HASWIDTH|SIMPLE;
830247a4 2938 nextchar(pRExC_state);
fac92740 2939 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2940 break;
2941 case 'b':
830247a4
IZ
2942 RExC_seen_zerolen++;
2943 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2944 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2945 *flagp |= SIMPLE;
830247a4 2946 nextchar(pRExC_state);
fac92740 2947 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2948 break;
2949 case 'B':
830247a4
IZ
2950 RExC_seen_zerolen++;
2951 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2952 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2953 *flagp |= SIMPLE;
830247a4 2954 nextchar(pRExC_state);
fac92740 2955 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2956 break;
2957 case 's':
eb160463 2958 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2959 *flagp |= HASWIDTH|SIMPLE;
830247a4 2960 nextchar(pRExC_state);
fac92740 2961 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2962 break;
2963 case 'S':
eb160463 2964 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2965 *flagp |= HASWIDTH|SIMPLE;
830247a4 2966 nextchar(pRExC_state);
fac92740 2967 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2968 break;
2969 case 'd':
ffc61ed2 2970 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2971 *flagp |= HASWIDTH|SIMPLE;
830247a4 2972 nextchar(pRExC_state);
fac92740 2973 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2974 break;
2975 case 'D':
ffc61ed2 2976 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2977 *flagp |= HASWIDTH|SIMPLE;
830247a4 2978 nextchar(pRExC_state);
fac92740 2979 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2980 break;
a14b48bc
LW
2981 case 'p':
2982 case 'P':
3568d838 2983 {
830247a4 2984 char* oldregxend = RExC_end;
ccb2c380 2985 char* parse_start = RExC_parse - 2;
a14b48bc 2986
830247a4 2987 if (RExC_parse[1] == '{') {
3568d838 2988 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2989 RExC_end = strchr(RExC_parse, '}');
2990 if (!RExC_end) {
0da60cf5 2991 U8 c = (U8)*RExC_parse;
830247a4
IZ
2992 RExC_parse += 2;
2993 RExC_end = oldregxend;
0da60cf5 2994 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2995 }
830247a4 2996 RExC_end++;
a14b48bc 2997 }
af6f566e 2998 else {
830247a4 2999 RExC_end = RExC_parse + 2;
af6f566e
HS
3000 if (RExC_end > oldregxend)
3001 RExC_end = oldregxend;
3002 }
830247a4 3003 RExC_parse--;
a14b48bc 3004
ffc61ed2 3005 ret = regclass(pRExC_state);
a14b48bc 3006
830247a4
IZ
3007 RExC_end = oldregxend;
3008 RExC_parse--;
ccb2c380
MP
3009
3010 Set_Node_Offset(ret, parse_start + 2);
3011 Set_Node_Cur_Length(ret);
830247a4 3012 nextchar(pRExC_state);
a14b48bc
LW
3013 *flagp |= HASWIDTH|SIMPLE;
3014 }
3015 break;
a0d0e21e
LW
3016 case 'n':
3017 case 'r':
3018 case 't':
3019 case 'f':
3020 case 'e':
3021 case 'a':
3022 case 'x':
3023 case 'c':
3024 case '0':
3025 goto defchar;
3026 case '1': case '2': case '3': case '4':
3027 case '5': case '6': case '7': case '8': case '9':
3028 {
830247a4 3029 I32 num = atoi(RExC_parse);
a0d0e21e 3030
830247a4 3031 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3032 goto defchar;
3033 else {
fac92740 3034 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3035 while (isDIGIT(*RExC_parse))
3036 RExC_parse++;
b45f050a 3037
eb160463 3038 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3039 vFAIL("Reference to nonexistent group");
830247a4 3040 RExC_sawback = 1;
eb160463
GS
3041 ret = reganode(pRExC_state,
3042 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3043 num);
a0d0e21e 3044 *flagp |= HASWIDTH;
fac92740
MJD
3045
3046 /* override incorrect value set in reganode MJD */
3047 Set_Node_Offset(ret, parse_start+1);
3048 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3049 RExC_parse--;
3050 nextchar(pRExC_state);
a0d0e21e
LW
3051 }
3052 }
3053 break;
3054 case '\0':
830247a4 3055 if (RExC_parse >= RExC_end)
b45f050a 3056 FAIL("Trailing \\");
a0d0e21e
LW
3057 /* FALL THROUGH */
3058 default:
c9f97d15
IZ
3059 /* Do not generate `unrecognized' warnings here, we fall
3060 back into the quick-grab loop below */
45948336 3061 parse_start--;
a0d0e21e
LW
3062 goto defchar;
3063 }
3064 break;
4633a7c4
LW
3065
3066 case '#':
e2509266 3067 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3068 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3069 if (RExC_parse < RExC_end)
4633a7c4
LW
3070 goto tryagain;
3071 }
3072 /* FALL THROUGH */
3073
a0d0e21e 3074 default: {
ba210ebe 3075 register STRLEN len;
58ae7d3f 3076 register UV ender;
a0d0e21e 3077 register char *p;
c277df42 3078 char *oldp, *s;
ba210ebe 3079 STRLEN numlen;
80aecb99 3080 STRLEN foldlen;
60a8b682 3081 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3082
3083 parse_start = RExC_parse - 1;
a0d0e21e 3084
830247a4 3085 RExC_parse++;
a0d0e21e
LW
3086
3087 defchar:
58ae7d3f 3088 ender = 0;
eb160463
GS
3089 ret = reg_node(pRExC_state,
3090 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3091 s = STRING(ret);
830247a4
IZ
3092 for (len = 0, p = RExC_parse - 1;
3093 len < 127 && p < RExC_end;
a0d0e21e
LW
3094 len++)
3095 {
3096 oldp = p;
5b5a24f7 3097
e2509266 3098 if (RExC_flags & PMf_EXTENDED)
830247a4 3099 p = regwhite(p, RExC_end);
a0d0e21e
LW
3100 switch (*p) {
3101 case '^':
3102 case '$':
3103 case '.':
3104 case '[':
3105 case '(':
3106 case ')':
3107 case '|':
3108 goto loopdone;
3109 case '\\':
3110 switch (*++p) {
3111 case 'A':
1ed8eac0
JF
3112 case 'C':
3113 case 'X':
a0d0e21e
LW
3114 case 'G':
3115 case 'Z':
b85d18e9 3116 case 'z':
a0d0e21e
LW
3117 case 'w':
3118 case 'W':
3119 case 'b':
3120 case 'B':
3121 case 's':
3122 case 'S':
3123 case 'd':
3124 case 'D':
a14b48bc
LW
3125 case 'p':
3126 case 'P':
a0d0e21e
LW
3127 --p;
3128 goto loopdone;
3129 case 'n':
3130 ender = '\n';
3131 p++;
a687059c 3132 break;
a0d0e21e
LW
3133 case 'r':
3134 ender = '\r';
3135 p++;
a687059c 3136 break;
a0d0e21e
LW
3137 case 't':
3138 ender = '\t';
3139 p++;
a687059c 3140 break;
a0d0e21e
LW
3141 case 'f':
3142 ender = '\f';
3143 p++;
a687059c 3144 break;
a0d0e21e 3145 case 'e':
c7f1f016 3146 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3147 p++;
a687059c 3148 break;
a0d0e21e 3149 case 'a':
c7f1f016 3150 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3151 p++;
a687059c 3152 break;
a0d0e21e 3153 case 'x':
a0ed51b3
LW
3154 if (*++p == '{') {
3155 char* e = strchr(p, '}');
b81d288d 3156
b45f050a 3157 if (!e) {
830247a4 3158 RExC_parse = p + 1;
b45f050a
JF
3159 vFAIL("Missing right brace on \\x{}");
3160 }
de5f0749 3161 else {
a4c04bdc
NC
3162 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3163 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3164 numlen = e - p - 1;
3165 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3166 if (ender > 0xff)
3167 RExC_utf8 = 1;
a0ed51b3
LW
3168 p = e + 1;
3169 }
a0ed51b3
LW
3170 }
3171 else {
a4c04bdc 3172 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3173 numlen = 2;
3174 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3175 p += numlen;
3176 }
a687059c 3177 break;
a0d0e21e
LW
3178 case 'c':
3179 p++;
bbce6d69 3180 ender = UCHARAT(p++);
3181 ender = toCTRL(ender);
a687059c 3182 break;
a0d0e21e
LW
3183 case '0': case '1': case '2': case '3':case '4':
3184 case '5': case '6': case '7': case '8':case '9':
3185 if (*p == '0' ||
830247a4 3186 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3187 I32 flags = 0;
3188 numlen = 3;
3189 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3190 p += numlen;
3191 }
3192 else {
3193 --p;
3194 goto loopdone;
a687059c
LW
3195 }
3196 break;
a0d0e21e 3197 case '\0':
830247a4 3198 if (p >= RExC_end)
b45f050a 3199 FAIL("Trailing \\");
a687059c 3200 /* FALL THROUGH */
a0d0e21e 3201 default:
e476b1b5 3202 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3203 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3204 goto normal_default;
a0d0e21e
LW
3205 }
3206 break;
a687059c 3207 default:
a0ed51b3 3208 normal_default:
fd400ab9 3209 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3210 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3211 &numlen, 0);
a0ed51b3
LW
3212 p += numlen;
3213 }
3214 else
3215 ender = *p++;
a0d0e21e 3216 break;
a687059c 3217 }
e2509266 3218 if (RExC_flags & PMf_EXTENDED)
830247a4 3219 p = regwhite(p, RExC_end);
60a8b682
JH
3220 if (UTF && FOLD) {
3221 /* Prime the casefolded buffer. */
ac7e0132 3222 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3223 }
a0d0e21e
LW
3224 if (ISMULT2(p)) { /* Back off on ?+*. */
3225 if (len)
3226 p = oldp;
16ea2a2e 3227 else if (UTF) {
0ebc6274
JH
3228 STRLEN unilen;
3229
80aecb99 3230 if (FOLD) {
60a8b682 3231 /* Emit all the Unicode characters. */
80aecb99
JH
3232 for (foldbuf = tmpbuf;
3233 foldlen;
3234 foldlen -= numlen) {
3235 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3236 if (numlen > 0) {
0ebc6274
JH
3237 reguni(pRExC_state, ender, s, &unilen);
3238 s += unilen;
3239 len += unilen;
3240 /* In EBCDIC the numlen
3241 * and unilen can differ. */
9dc45d57 3242 foldbuf += numlen;
47654450
JH
3243 if (numlen >= foldlen)
3244 break;
9dc45d57
JH
3245 }
3246 else
3247 break; /* "Can't happen." */
80aecb99
JH
3248 }
3249 }
3250 else {
0ebc6274 3251 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3252 if (unilen > 0) {
0ebc6274
JH
3253 s += unilen;
3254 len += unilen;
9dc45d57 3255 }
80aecb99 3256 }
a0ed51b3 3257 }
a0d0e21e
LW
3258 else {
3259 len++;
eb160463 3260 REGC((char)ender, s++);
a0d0e21e
LW
3261 }
3262 break;
a687059c 3263 }
16ea2a2e 3264 if (UTF) {
0ebc6274
JH
3265 STRLEN unilen;
3266
80aecb99 3267 if (FOLD) {
60a8b682 3268 /* Emit all the Unicode characters. */
80aecb99
JH
3269 for (foldbuf = tmpbuf;
3270 foldlen;
3271 foldlen -= numlen) {
3272 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3273 if (numlen > 0) {
0ebc6274
JH
3274 reguni(pRExC_state, ender, s, &unilen);
3275 len += unilen;
3276 s += unilen;
3277 /* In EBCDIC the numlen
3278 * and unilen can differ. */
9dc45d57 3279 foldbuf += numlen;
47654450
JH
3280 if (numlen >= foldlen)
3281 break;
9dc45d57
JH
3282 }
3283 else
3284 break;
80aecb99
JH
3285 }
3286 }
3287 else {
0ebc6274 3288 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3289 if (unilen > 0) {
0ebc6274
JH
3290 s += unilen;
3291 len += unilen;
9dc45d57 3292 }
80aecb99
JH
3293 }
3294 len--;
a0ed51b3
LW
3295 }
3296 else
eb160463 3297 REGC((char)ender, s++);
a0d0e21e
LW
3298 }
3299 loopdone:
830247a4 3300 RExC_parse = p - 1;
fac92740 3301 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3302 nextchar(pRExC_state);
793db0cb
JH
3303 {
3304 /* len is STRLEN which is unsigned, need to copy to signed */
3305 IV iv = len;
3306 if (iv < 0)
3307 vFAIL("Internal disaster");
3308 }
a0d0e21e
LW
3309 if (len > 0)
3310 *flagp |= HASWIDTH;
090f7165 3311 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 3312 *flagp |= SIMPLE;
c277df42 3313 if (!SIZE_ONLY)
cd439c50
IZ
3314 STR_LEN(ret) = len;
3315 if (SIZE_ONLY)
830247a4 3316 RExC_size += STR_SZ(len);
cd439c50 3317 else
830247a4 3318 RExC_emit += STR_SZ(len);
a687059c 3319 }
a0d0e21e
LW
3320 break;
3321 }
a687059c 3322
60a8b682
JH
3323 /* If the encoding pragma is in effect recode the text of
3324 * any EXACT-kind nodes. */
22c54be3 3325 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3326 STRLEN oldlen = STR_LEN(ret);
3327 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3328
3329 if (RExC_utf8)
3330 SvUTF8_on(sv);
3331 if (sv_utf8_downgrade(sv, TRUE)) {
3332 char *s = sv_recode_to_utf8(sv, PL_encoding);
3333 STRLEN newlen = SvCUR(sv);
3334
3335 if (SvUTF8(sv))
3336 RExC_utf8 = 1;
3337 if (!SIZE_ONLY) {
3338 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3339 (int)oldlen, STRING(ret),
3340 (int)newlen, s));
3341 Copy(s, STRING(ret), newlen, char);
3342 STR_LEN(ret) += newlen - oldlen;
3343 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3344 } else
3345 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3346 }
a72c7584
JH
3347 }
3348
a0d0e21e 3349 return(ret);
a687059c
LW
3350}
3351
873ef191 3352STATIC char *
cea2e8a9 3353S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3354{
3355 while (p < e) {
3356 if (isSPACE(*p))
3357 ++p;
3358 else if (*p == '#') {
3359 do {
3360 p++;
3361 } while (p < e && *p != '\n');
3362 }
3363 else
3364 break;
3365 }
3366 return p;
3367}
3368
b8c5462f
JH
3369/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3370 Character classes ([:foo:]) can also be negated ([:^foo:]).
3371 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3372 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3373 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3374
3375#define POSIXCC_DONE(c) ((c) == ':')
3376#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3377#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3378
b8c5462f 3379STATIC I32
830247a4 3380S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3381{
3382 char *posixcc = 0;
936ed897 3383 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3384
830247a4 3385 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3386 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3387 POSIXCC(UCHARAT(RExC_parse))) {
3388 char c = UCHARAT(RExC_parse);
830247a4 3389 char* s = RExC_parse++;
b81d288d 3390
9a86a77b 3391 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3392 RExC_parse++;
3393 if (RExC_parse == RExC_end)
620e46c5 3394 /* Grandfather lone [:, [=, [. */
830247a4 3395 RExC_parse = s;
620e46c5 3396 else {
830247a4 3397 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3398
9a86a77b 3399 if (UCHARAT(RExC_parse) == ']') {
830247a4 3400 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3401 posixcc = s + 1;
3402 if (*s == ':') {
3403 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3404 I32 skip = 5; /* the most common skip */
3405
3406 switch (*posixcc) {
3407 case 'a':
3408 if (strnEQ(posixcc, "alnum", 5))
3409 namedclass =
3410 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3411 else if (strnEQ(posixcc, "alpha", 5))
3412 namedclass =
3413 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3414 else if (strnEQ(posixcc, "ascii", 5))
3415 namedclass =
3416 complement ? ANYOF_NASCII : ANYOF_ASCII;
3417 break;
aaa51d5e
JF
3418 case 'b':
3419 if (strnEQ(posixcc, "blank", 5))
3420 namedclass =
3421 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3422 break;
b8c5462f
JH
3423 case 'c':
3424 if (strnEQ(posixcc, "cntrl", 5))
3425 namedclass =
3426 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3427 break;
3428 case 'd':
3429 if (strnEQ(posixcc, "digit", 5))
3430 namedclass =
3431 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3432 break;
3433 case 'g':
3434 if (strnEQ(posixcc, "graph", 5))
3435 namedclass =
3436 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3437 break;
3438 case 'l':
3439 if (strnEQ(posixcc, "lower", 5))
3440 namedclass =
3441 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3442 break;
3443 case 'p':
3444 if (strnEQ(posixcc, "print", 5))
3445 namedclass =
3446 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3447 else if (strnEQ(posixcc, "punct", 5))
3448 namedclass =
3449 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3450 break;
3451 case 's':
3452 if (strnEQ(posixcc, "space", 5))
3453 namedclass =
aaa51d5e 3454 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3455 break;
b8c5462f
JH
3456 case 'u':
3457 if (strnEQ(posixcc, "upper", 5))
3458 namedclass =
3459 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3460 break;
3461 case 'w': /* this is not POSIX, this is the Perl \w */
3462 if (strnEQ(posixcc, "word", 4)) {
3463 namedclass =
3464 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3465 skip = 4;
3466 }
3467 break;
3468 case 'x':
3469 if (strnEQ(posixcc, "xdigit", 6)) {
3470 namedclass =
3471 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3472 skip = 6;
3473 }
3474 break;
3475 }
ac561586
JH
3476 if (namedclass == OOB_NAMEDCLASS ||
3477 posixcc[skip] != ':' ||
3478 posixcc[skip+1] != ']')
b45f050a
JF
3479 {
3480 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3481 t - s - 1, s + 1);
3482 }
3483 } else if (!SIZE_ONLY) {
b8c5462f 3484 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3485
830247a4 3486 /* adjust RExC_parse so the warning shows after
b45f050a 3487 the class closes */
9a86a77b 3488 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3489 RExC_parse++;
b45f050a
JF
3490 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3491 }
b8c5462f
JH
3492 } else {
3493 /* Maternal grandfather:
3494 * "[:" ending in ":" but not in ":]" */
830247a4 3495 RExC_parse = s;
767d463e 3496 }
620e46c5
JH
3497 }
3498 }
3499
b8c5462f
JH
3500 return namedclass;
3501}
3502
3503STATIC void
830247a4 3504S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3505{
b938889d 3506 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3507 char *s = RExC_parse;
93733859 3508 char c = *s++;
b8c5462f
JH
3509
3510 while(*s && isALNUM(*s))
3511 s++;
3512 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3513 if (ckWARN(WARN_REGEXP))
3514 vWARN3(s+2,
3515 "POSIX syntax [%c %c] belongs inside character classes",
3516 c, c);
b45f050a
JF
3517
3518 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3519 if (POSIXCC_NOTYET(c)) {
830247a4 3520 /* adjust RExC_parse so the error shows after
b45f050a 3521 the class closes */
9a86a77b 3522 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3523 ;
3524 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3525 }
b8c5462f
JH
3526 }
3527 }
620e46c5
JH
3528}
3529
76e3520e 3530STATIC regnode *
830247a4 3531S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3532{
ffc61ed2 3533 register UV value;
9a86a77b 3534 register UV nextvalue;
3568d838 3535 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3536 register IV range = 0;
c277df42 3537 register regnode *ret;
ba210ebe 3538 STRLEN numlen;
ffc61ed2 3539 IV namedclass;
9c5ffd7c 3540 char *rangebegin = 0;
936ed897 3541 bool need_class = 0;
9c5ffd7c 3542 SV *listsv = Nullsv;
ffc61ed2
JH
3543 register char *e;
3544 UV n;
9e55ce06
JH
3545 bool optimize_invert = TRUE;
3546 AV* unicode_alternate = 0;
1b2d223b
JH
3547#ifdef EBCDIC
3548 UV literal_endpoint = 0;
3549#endif
ffc61ed2
JH
3550
3551 ret = reganode(pRExC_state, ANYOF, 0);
3552
3553 if (!SIZE_ONLY)
3554 ANYOF_FLAGS(ret) = 0;
3555
9a86a77b 3556 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3557 RExC_naughty++;
3558 RExC_parse++;
3559 if (!SIZE_ONLY)
3560 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3561 }
a0d0e21e 3562
936ed897 3563 if (SIZE_ONLY)
830247a4 3564 RExC_size += ANYOF_SKIP;
936ed897 3565 else {
830247a4 3566 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3567 if (FOLD)
3568 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3569 if (LOC)
3570 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3571 ANYOF_BITMAP_ZERO(ret);
3572 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3573 }
b8c5462f 3574
9a86a77b
JH
3575 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3576
b938889d 3577 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3578 checkposixcc(pRExC_state);
b8c5462f 3579
f064b6ad
HS
3580 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3581 if (UCHARAT(RExC_parse) == ']')
3582 goto charclassloop;
ffc61ed2 3583
9a86a77b 3584 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3585
3586 charclassloop:
3587
3588 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3589
73b437c8 3590 if (!range)
830247a4 3591 rangebegin = RExC_parse;
ffc61ed2 3592 if (UTF) {
5e12f4fb 3593 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3594 RExC_end - RExC_parse,
3595 &numlen, 0);
ffc61ed2
JH
3596 RExC_parse += numlen;
3597 }
3598 else
3599 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3600 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3601 if (value == '[' && POSIXCC(nextvalue))
830247a4 3602 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3603 else if (value == '\\') {
ffc61ed2 3604 if (UTF) {
5e12f4fb 3605 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3606 RExC_end - RExC_parse,
3607 &numlen, 0);
3608 RExC_parse += numlen;
3609 }
3610 else
3611 value = UCHARAT(RExC_parse++);
470c3474 3612 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3613 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3614 * be a problem later if we want switch on Unicode.
3615 * A similar issue a little bit later when switching on
3616 * namedclass. --jhi */
ffc61ed2 3617 switch ((I32)value) {
b8c5462f
JH
3618 case 'w': namedclass = ANYOF_ALNUM; break;
3619 case 'W': namedclass = ANYOF_NALNUM; break;
3620 case 's': namedclass = ANYOF_SPACE; break;
3621 case 'S': namedclass = ANYOF_NSPACE; break;
3622 case 'd': namedclass = ANYOF_DIGIT; break;
3623 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3624 case 'p':
3625 case 'P':
af6f566e 3626 if (RExC_parse >= RExC_end)
2a4859cd 3627 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3628 if (*RExC_parse == '{') {
0da60cf5 3629 U8 c = (U8)value;
ffc61ed2
JH
3630 e = strchr(RExC_parse++, '}');
3631 if (!e)
0da60cf5 3632 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3633 while (isSPACE(UCHARAT(RExC_parse)))
3634 RExC_parse++;
3635 if (e == RExC_parse)
0da60cf5 3636 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3637 n = e - RExC_parse;
ab13f0c7
JH
3638 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3639 n--;
ffc61ed2
JH
3640 }
3641 else {
3642 e = RExC_parse;
3643 n = 1;
3644 }
3645 if (!SIZE_ONLY) {
ab13f0c7
JH
3646 if (UCHARAT(RExC_parse) == '^') {
3647 RExC_parse++;
3648 n--;
3649 value = value == 'p' ? 'P' : 'p'; /* toggle */
3650 while (isSPACE(UCHARAT(RExC_parse))) {
3651 RExC_parse++;
3652 n--;
3653 }
3654 }
ffc61ed2 3655 if (value == 'p')
ab13f0c7
JH
3656 Perl_sv_catpvf(aTHX_ listsv,
3657 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3658 else
ab13f0c7
JH
3659 Perl_sv_catpvf(aTHX_ listsv,
3660 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3661 }
3662 RExC_parse = e + 1;
3663 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
3664 namedclass = ANYOF_MAX; /* no official name, but it's named */
3665 break;
b8c5462f
JH
3666 case 'n': value = '\n'; break;
3667 case 'r': value = '\r'; break;
3668 case 't': value = '\t'; break;
3669 case 'f': value = '\f'; break;
3670 case 'b': value = '\b'; break;
c7f1f016
NIS
3671 case 'e': value = ASCII_TO_NATIVE('\033');break;
3672 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3673 case 'x':
ffc61ed2 3674 if (*RExC_parse == '{') {
a4c04bdc
NC
3675 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3676 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3677 e = strchr(RExC_parse++, '}');
b81d288d 3678 if (!e)
ffc61ed2 3679 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3680
3681 numlen = e - RExC_parse;
3682 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3683 RExC_parse = e + 1;
3684 }
3685 else {
a4c04bdc 3686 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3687 numlen = 2;
3688 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3689 RExC_parse += numlen;
3690 }
b8c5462f
JH
3691 break;
3692 case 'c':
830247a4 3693 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3694 value = toCTRL(value);
3695 break;
3696 case '0': case '1': case '2': case '3': case '4':
3697 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3698 {
3699 I32 flags = 0;
3700 numlen = 3;
3701 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3702 RExC_parse += numlen;
b8c5462f 3703 break;
53305cf1 3704 }
1028017a 3705 default:
e476b1b5 3706 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3707 vWARN2(RExC_parse,
3708 "Unrecognized escape \\%c in character class passed through",
3709 (int)value);
1028017a 3710 break;
b8c5462f 3711 }
ffc61ed2 3712 } /* end of \blah */
1b2d223b
JH
3713#ifdef EBCDIC
3714 else
3715 literal_endpoint++;
3716#endif
ffc61ed2
JH
3717
3718 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3719
3720 if (!SIZE_ONLY && !need_class)
936ed897 3721 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3722
936ed897 3723 need_class = 1;
ffc61ed2
JH
3724
3725 /* a bad range like a-\d, a-[:digit:] ? */
3726 if (range) {
73b437c8 3727 if (!SIZE_ONLY) {
e476b1b5 3728 if (ckWARN(WARN_REGEXP))
830247a4 3729 vWARN4(RExC_parse,
b45f050a 3730 "False [] range \"%*.*s\"",
830247a4
IZ
3731 RExC_parse - rangebegin,
3732 RExC_parse - rangebegin,
b45f050a 3733 rangebegin);
3568d838
JH
3734 if (prevvalue < 256) {
3735 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3736 ANYOF_BITMAP_SET(ret, '-');
3737 }
3738 else {
3739 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3740 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3741 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3742 }
b8c5462f 3743 }
ffc61ed2
JH
3744
3745 range = 0; /* this was not a true range */
73b437c8 3746 }
ffc61ed2 3747
73b437c8 3748 if (!SIZE_ONLY) {
3568d838
JH
3749 if (namedclass > OOB_NAMEDCLASS)
3750 optimize_invert = FALSE;
e2962f66
JH
3751 /* Possible truncation here but in some 64-bit environments
3752 * the compiler gets heartburn about switch on 64-bit values.
3753 * A similar issue a little earlier when switching on value.
98f323fa 3754 * --jhi */
e2962f66 3755 switch ((I32)namedclass) {
73b437c8
JH
3756 case ANYOF_ALNUM:
3757 if (LOC)
936ed897 3758 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3759 else {
3760 for (value = 0; value < 256; value++)
3761 if (isALNUM(value))
936ed897 3762 ANYOF_BITMAP_SET(ret, value);
73b437c8 3763 }
ffc61ed2 3764 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3765 break;
3766 case ANYOF_NALNUM:
3767 if (LOC)
936ed897 3768 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3769 else {
3770 for (value = 0; value < 256; value++)
3771 if (!isALNUM(value))
936ed897 3772 ANYOF_BITMAP_SET(ret, value);
73b437c8 3773 }
ffc61ed2 3774 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3775 break;
ffc61ed2 3776 case ANYOF_ALNUMC:
73b437c8 3777 if (LOC)
ffc61ed2 3778 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3779 else {
3780 for (value = 0; value < 256; value++)
ffc61ed2 3781 if (isALNUMC(value))
936ed897 3782 ANYOF_BITMAP_SET(ret, value);
73b437c8 3783 }
ffc61ed2 3784 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3785 break;
3786 case ANYOF_NALNUMC:
3787 if (LOC)
936ed897 3788 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3789 else {
3790 for (value = 0; value < 256; value++)
3791 if (!isALNUMC(value))
936ed897 3792 ANYOF_BITMAP_SET(ret, value);
73b437c8 3793 }
ffc61ed2 3794 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3795 break;
3796 case ANYOF_ALPHA:
3797 if (LOC)
936ed897 3798 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3799 else {
3800 for (value = 0; value < 256; value++)
3801 if (isALPHA(value))
936ed897 3802 ANYOF_BITMAP_SET(ret, value);
73b437c8 3803 }
ffc61ed2 3804 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3805 break;
3806 case ANYOF_NALPHA:
3807 if (LOC)
936ed897 3808 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3809 else {
3810 for (value = 0; value < 256; value++)
3811 if (!isALPHA(value))
936ed897 3812 ANYOF_BITMAP_SET(ret, value);
73b437c8 3813 }
ffc61ed2 3814 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3815 break;
3816 case ANYOF_ASCII:
3817 if (LOC)
936ed897 3818 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3819 else {
c7f1f016 3820#ifndef EBCDIC
1ba5c669
JH
3821 for (value = 0; value < 128; value++)
3822 ANYOF_BITMAP_SET(ret, value);
3823#else /* EBCDIC */
ffbc6a93 3824 for (value = 0; value < 256; value++) {
3a3c4447
JH
3825 if (isASCII(value))
3826 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3827 }
1ba5c669 3828#endif /* EBCDIC */
73b437c8 3829 }
ffc61ed2 3830 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3831 break;
3832 case ANYOF_NASCII:
3833 if (LOC)
936ed897 3834 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3835 else {
c7f1f016 3836#ifndef EBCDIC
1ba5c669
JH
3837 for (value = 128; value < 256; value++)
3838 ANYOF_BITMAP_SET(ret, value);
3839#else /* EBCDIC */
ffbc6a93 3840 for (value = 0; value < 256; value++) {
3a3c4447
JH
3841 if (!isASCII(value))
3842 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3843 }
1ba5c669 3844#endif /* EBCDIC */
73b437c8 3845 }
ffc61ed2 3846 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3847 break;
aaa51d5e
JF
3848 case ANYOF_BLANK:
3849 if (LOC)
3850 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3851 else {
3852 for (value = 0; value < 256; value++)
3853 if (isBLANK(value))
3854 ANYOF_BITMAP_SET(ret, value);
3855 }
ffc61ed2 3856 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3857 break;
3858 case ANYOF_NBLANK:
3859 if (LOC)
3860 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3861 else {
3862 for (value = 0; value < 256; value++)
3863 if (!isBLANK(value))
3864 ANYOF_BITMAP_SET(ret, value);
3865 }
ffc61ed2 3866 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3867 break;
73b437c8
JH
3868 case ANYOF_CNTRL:
3869 if (LOC)
936ed897 3870 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3871 else {
3872 for (value = 0; value < 256; value++)
3873 if (isCNTRL(value))
936ed897 3874 ANYOF_BITMAP_SET(ret, value);
73b437c8 3875 }
ffc61ed2 3876 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3877 break;
3878 case ANYOF_NCNTRL:
3879 if (LOC)
936ed897 3880 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3881 else {
3882 for (value = 0; value < 256; value++)
3883 if (!isCNTRL(value))
936ed897 3884 ANYOF_BITMAP_SET(ret, value);
73b437c8 3885 }
ffc61ed2
JH
3886 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3887 break;
3888 case ANYOF_DIGIT:
3889 if (LOC)
3890 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3891 else {
3892 /* consecutive digits assumed */
3893 for (value = '0'; value <= '9'; value++)
3894 ANYOF_BITMAP_SET(ret, value);
3895 }
3896 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3897 break;
3898 case ANYOF_NDIGIT:
3899 if (LOC)
3900 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3901 else {
3902 /* consecutive digits assumed */
3903 for (value = 0; value < '0'; value++)
3904 ANYOF_BITMAP_SET(ret, value);
3905 for (value = '9' + 1; value < 256; value++)
3906 ANYOF_BITMAP_SET(ret, value);
3907 }
3908 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3909 break;
3910 case ANYOF_GRAPH:
3911 if (LOC)
936ed897 3912 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3913 else {
3914 for (value = 0; value < 256; value++)
3915 if (isGRAPH(value))
936ed897 3916 ANYOF_BITMAP_SET(ret, value);
73b437c8 3917 }
ffc61ed2 3918 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3919 break;
3920 case ANYOF_NGRAPH:
3921 if (LOC)
936ed897 3922 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3923 else {
3924 for (value = 0; value < 256; value++)
3925 if (!isGRAPH(value))
936ed897 3926 ANYOF_BITMAP_SET(ret, value);
73b437c8 3927 }
ffc61ed2 3928 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3929 break;
3930 case ANYOF_LOWER:
3931 if (LOC)
936ed897 3932 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3933 else {
3934 for (value = 0; value < 256; value++)
3935 if (isLOWER(value))
936ed897 3936 ANYOF_BITMAP_SET(ret, value);
73b437c8 3937 }
ffc61ed2 3938 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3939 break;
3940 case ANYOF_NLOWER:
3941 if (LOC)
936ed897 3942 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3943 else {
3944 for (value = 0; value < 256; value++)
3945 if (!isLOWER(value))
936ed897 3946 ANYOF_BITMAP_SET(ret, value);
73b437c8 3947 }
ffc61ed2 3948 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3949 break;
3950 case ANYOF_PRINT:
3951 if (LOC)
936ed897 3952 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3953 else {
3954 for (value = 0; value < 256; value++)
3955 if (isPRINT(value))
936ed897 3956 ANYOF_BITMAP_SET(ret, value);
73b437c8 3957 }
ffc61ed2 3958 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3959 break;
3960 case ANYOF_NPRINT:
3961 if (LOC)
936ed897 3962 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3963 else {
3964 for (value = 0; value < 256; value++)
3965 if (!isPRINT(value))
936ed897 3966 ANYOF_BITMAP_SET(ret, value);
73b437c8 3967 }
ffc61ed2 3968 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3969 break;
aaa51d5e
JF
3970 case ANYOF_PSXSPC:
3971 if (LOC)
3972 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3973 else {
3974 for (value = 0; value < 256; value++)
3975 if (isPSXSPC(value))
3976 ANYOF_BITMAP_SET(ret, value);
3977 }
ffc61ed2 3978 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3979 break;
3980 case ANYOF_NPSXSPC:
3981 if (LOC)
3982 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3983 else {
3984 for (value = 0; value < 256; value++)
3985 if (!isPSXSPC(value))
3986 ANYOF_BITMAP_SET(ret, value);
3987 }
ffc61ed2 3988 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3989 break;
73b437c8
JH
3990 case ANYOF_PUNCT:
3991 if (LOC)
936ed897 3992 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3993 else {
3994 for (value = 0; value < 256; value++)
3995 if (isPUNCT(value))
936ed897 3996 ANYOF_BITMAP_SET(ret, value);
73b437c8 3997 }
ffc61ed2 3998 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3999 break;
4000 case ANYOF_NPUNCT:
4001 if (LOC)
936ed897 4002 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
4003 else {
4004 for (value = 0; value < 256; value++)
4005 if (!isPUNCT(value))
936ed897 4006 ANYOF_BITMAP_SET(ret, value);
73b437c8 4007 }
ffc61ed2
JH
4008 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4009 break;
4010 case ANYOF_SPACE:
4011 if (LOC)
4012 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4013 else {
4014 for (value = 0; value < 256; value++)
4015 if (isSPACE(value))
4016 ANYOF_BITMAP_SET(ret, value);
4017 }
4018 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4019 break;
4020 case ANYOF_NSPACE:
4021 if (LOC)
4022 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4023 else {
4024 for (value = 0; value < 256; value++)
4025 if (!isSPACE(value))
4026 ANYOF_BITMAP_SET(ret, value);
4027 }
4028 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4029 break;
4030 case ANYOF_UPPER:
4031 if (LOC)
936ed897 4032 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4033 else {
4034 for (value = 0; value < 256; value++)
4035 if (isUPPER(value))
936ed897 4036 ANYOF_BITMAP_SET(ret, value);
73b437c8 4037 }
ffc61ed2 4038 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4039 break;
4040 case ANYOF_NUPPER:
4041 if (LOC)
936ed897 4042 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4043 else {
4044 for (value = 0; value < 256; value++)
4045 if (!isUPPER(value))
936ed897 4046 ANYOF_BITMAP_SET(ret, value);
73b437c8 4047 }
ffc61ed2 4048 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4049 break;
4050 case ANYOF_XDIGIT:
4051 if (LOC)
936ed897 4052 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4053 else {
4054 for (value = 0; value < 256; value++)
4055 if (isXDIGIT(value))
936ed897 4056 ANYOF_BITMAP_SET(ret, value);
73b437c8 4057 }
ffc61ed2 4058 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4059 break;
4060 case ANYOF_NXDIGIT:
4061 if (LOC)
936ed897 4062 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4063 else {
4064 for (value = 0; value < 256; value++)
4065 if (!isXDIGIT(value))
936ed897 4066 ANYOF_BITMAP_SET(ret, value);
73b437c8 4067 }
ffc61ed2 4068 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8 4069 break;
f81125e2
JP
4070 case ANYOF_MAX:
4071 /* this is to handle \p and \P */
4072 break;
73b437c8 4073 default:
b45f050a 4074 vFAIL("Invalid [::] class");
73b437c8 4075 break;
b8c5462f 4076 }
b8c5462f 4077 if (LOC)
936ed897 4078 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4079 continue;
a0d0e21e 4080 }
ffc61ed2
JH
4081 } /* end of namedclass \blah */
4082
a0d0e21e 4083 if (range) {
eb160463 4084 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4085 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4086 RExC_parse - rangebegin,
4087 RExC_parse - rangebegin,
b45f050a 4088 rangebegin);
3568d838 4089 range = 0; /* not a valid range */
73b437c8 4090 }
a0d0e21e
LW
4091 }
4092 else {
3568d838 4093 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4094 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4095 RExC_parse[1] != ']') {
4096 RExC_parse++;
ffc61ed2
JH
4097
4098 /* a bad range like \w-, [:word:]- ? */
4099 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4100 if (ckWARN(WARN_REGEXP))
830247a4 4101 vWARN4(RExC_parse,
b45f050a 4102 "False [] range \"%*.*s\"",
830247a4
IZ
4103 RExC_parse - rangebegin,
4104 RExC_parse - rangebegin,
b45f050a 4105 rangebegin);
73b437c8 4106 if (!SIZE_ONLY)
936ed897 4107 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4108 } else
ffc61ed2
JH
4109 range = 1; /* yeah, it's a range! */
4110 continue; /* but do it the next time */
a0d0e21e 4111 }
a687059c 4112 }
ffc61ed2 4113
93733859 4114 /* now is the next time */
ae5c130c 4115 if (!SIZE_ONLY) {
3568d838
JH
4116 IV i;
4117
4118 if (prevvalue < 256) {
4119 IV ceilvalue = value < 256 ? value : 255;
4120
4121#ifdef EBCDIC
1b2d223b
JH
4122 /* In EBCDIC [\x89-\x91] should include
4123 * the \x8e but [i-j] should not. */
4124 if (literal_endpoint == 2 &&
4125 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4126 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4127 {
3568d838
JH
4128 if (isLOWER(prevvalue)) {
4129 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4130 if (isLOWER(i))
4131 ANYOF_BITMAP_SET(ret, i);
4132 } else {
3568d838 4133 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4134 if (isUPPER(i))
4135 ANYOF_BITMAP_SET(ret, i);
4136 }
8ada0baa 4137 }
ffc61ed2 4138 else
8ada0baa 4139#endif
a5961de5
JH
4140 for (i = prevvalue; i <= ceilvalue; i++)
4141 ANYOF_BITMAP_SET(ret, i);
3568d838 4142 }
a5961de5 4143 if (value > 255 || UTF) {
b08decb7
JH
4144 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4145 UV natvalue = NATIVE_TO_UNI(value);
4146
ffc61ed2 4147 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4148 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4149 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4150 prevnatvalue, natvalue);
4151 }
4152 else if (prevnatvalue == natvalue) {
4153 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4154 if (FOLD) {
254ba52a
JH
4155 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4156 STRLEN foldlen;
2f3bf011 4157 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4158
c840d2a2
JH
4159 /* If folding and foldable and a single
4160 * character, insert also the folded version
4161 * to the charclass. */
9e55ce06 4162 if (f != value) {
eb160463 4163 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4164 Perl_sv_catpvf(aTHX_ listsv,
4165 "%04"UVxf"\n", f);
4166 else {
4167 /* Any multicharacter foldings
4168 * require the following transform:
4169 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4170 * where E folds into "pq" and F folds
4171 * into "rst", all other characters
4172 * fold to single characters. We save
4173 * away these multicharacter foldings,
4174 * to be later saved as part of the
4175 * additional "s" data. */
4176 SV *sv;
4177
4178 if (!unicode_alternate)
4179 unicode_alternate = newAV();
4180 sv = newSVpvn((char*)foldbuf, foldlen);
4181 SvUTF8_on(sv);
4182 av_push(unicode_alternate, sv);
4183 }
4184 }
254ba52a 4185
60a8b682
JH
4186 /* If folding and the value is one of the Greek
4187 * sigmas insert a few more sigmas to make the
4188 * folding rules of the sigmas to work right.
4189 * Note that not all the possible combinations
4190 * are handled here: some of them are handled
9e55ce06
JH
4191 * by the standard folding rules, and some of
4192 * them (literal or EXACTF cases) are handled
4193 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4194 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4195 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4196 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4197 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4198 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4199 }
4200 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4201 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4202 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4203 }
4204 }
ffc61ed2 4205 }
1b2d223b
JH
4206#ifdef EBCDIC
4207 literal_endpoint = 0;
4208#endif
8ada0baa 4209 }
ffc61ed2
JH
4210
4211 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4212 }
ffc61ed2 4213
936ed897 4214 if (need_class) {
4f66b38d 4215 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4216 if (SIZE_ONLY)
830247a4 4217 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4218 else
830247a4 4219 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4220 }
ffc61ed2 4221
ae5c130c 4222 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4223 if (!SIZE_ONLY &&
ffc61ed2 4224 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4225 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4226 ) {
a0ed51b3 4227 for (value = 0; value < 256; ++value) {
936ed897 4228 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4229 UV fold = PL_fold[value];
ffc61ed2
JH
4230
4231 if (fold != value)
4232 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4233 }
4234 }
936ed897 4235 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4236 }
ffc61ed2 4237
ae5c130c 4238 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4239 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4240 /* If the only flag is inversion. */
4241 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4242 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4243 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4244 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4245 }
a0d0e21e 4246
b81d288d 4247 if (!SIZE_ONLY) {
fde631ed 4248 AV *av = newAV();
ffc61ed2
JH
4249 SV *rv;
4250
9e55ce06 4251 /* The 0th element stores the character class description
6a0407ee 4252 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4253 * to initialize the appropriate swash (which gets stored in
4254 * the 1st element), and also useful for dumping the regnode.
4255 * The 2nd element stores the multicharacter foldings,
6a0407ee 4256 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4257 av_store(av, 0, listsv);
4258 av_store(av, 1, NULL);
9e55ce06 4259 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4260 rv = newRV_noinc((SV*)av);
19860706 4261 n = add_data(pRExC_state, 1, "s");
830247a4 4262 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4263 ARG_SET(ret, n);
a0ed51b3
LW
4264 }
4265
4266 return ret;
4267}
4268
76e3520e 4269STATIC char*
830247a4 4270S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4271{
830247a4 4272 char* retval = RExC_parse++;
a0d0e21e 4273
4633a7c4 4274 for (;;) {
830247a4
IZ
4275 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4276 RExC_parse[2] == '#') {
e994fd66
AE
4277 while (*RExC_parse != ')') {
4278 if (RExC_parse == RExC_end)
4279 FAIL("Sequence (?#... not terminated");
830247a4 4280 RExC_parse++;
e994fd66 4281 }
830247a4 4282 RExC_parse++;
4633a7c4
LW
4283 continue;
4284 }
e2509266 4285 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4286 if (isSPACE(*RExC_parse)) {
4287 RExC_parse++;
748a9306
LW
4288 continue;
4289 }
830247a4 4290 else if (*RExC_parse == '#') {
e994fd66
AE
4291 while (RExC_parse < RExC_end)
4292 if (*RExC_parse++ == '\n') break;
748a9306
LW
4293 continue;
4294 }
748a9306 4295 }
4633a7c4 4296 return retval;
a0d0e21e 4297 }
a687059c
LW
4298}
4299
4300/*
c277df42 4301- reg_node - emit a node
a0d0e21e 4302*/
76e3520e 4303STATIC regnode * /* Location. */
830247a4 4304S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4305{
c277df42
IZ
4306 register regnode *ret;
4307 register regnode *ptr;
a687059c 4308
830247a4 4309 ret = RExC_emit;
c277df42 4310 if (SIZE_ONLY) {
830247a4
IZ
4311 SIZE_ALIGN(RExC_size);
4312 RExC_size += 1;
a0d0e21e
LW
4313 return(ret);
4314 }
a687059c 4315
c277df42 4316 NODE_ALIGN_FILL(ret);
a0d0e21e 4317 ptr = ret;
c277df42 4318 FILL_ADVANCE_NODE(ptr, op);
fac92740 4319 if (RExC_offsets) { /* MJD */
ccb2c380 4320 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4321 "reg_node", __LINE__,
4322 reg_name[op],
4323 RExC_emit - RExC_emit_start > RExC_offsets[0]
4324 ? "Overwriting end of array!\n" : "OK",
4325 RExC_emit - RExC_emit_start,
4326 RExC_parse - RExC_start,
4327 RExC_offsets[0]));
ccb2c380 4328 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4329 }
4330
830247a4 4331 RExC_emit = ptr;
a687059c 4332
a0d0e21e 4333 return(ret);
a687059c
LW
4334}
4335
4336/*
a0d0e21e
LW
4337- reganode - emit a node with an argument
4338*/
76e3520e 4339STATIC regnode * /* Location. */
830247a4 4340S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4341{
c277df42
IZ
4342 register regnode *ret;
4343 register regnode *ptr;
fe14fcc3 4344
830247a4 4345 ret = RExC_emit;
c277df42 4346 if (SIZE_ONLY) {
830247a4
IZ
4347 SIZE_ALIGN(RExC_size);
4348 RExC_size += 2;
a0d0e21e
LW
4349 return(ret);
4350 }
fe14fcc3 4351
c277df42 4352 NODE_ALIGN_FILL(ret);
a0d0e21e 4353 ptr = ret;
c277df42 4354 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4355 if (RExC_offsets) { /* MJD */
ccb2c380 4356 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4357 "reganode",
ccb2c380
MP
4358 __LINE__,
4359 reg_name[op],
fac92740
MJD
4360 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4361 "Overwriting end of array!\n" : "OK",
4362 RExC_emit - RExC_emit_start,
4363 RExC_parse - RExC_start,
4364 RExC_offsets[0]));
ccb2c380 4365 Set_Cur_Node_Offset;
fac92740
MJD
4366 }
4367
830247a4 4368 RExC_emit = ptr;
fe14fcc3 4369
a0d0e21e 4370 return(ret);
fe14fcc3
LW
4371}
4372
4373/*
cd439c50 4374- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4375*/
4376STATIC void
830247a4 4377S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4378{
5e12f4fb 4379 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4380}
4381
4382/*
a0d0e21e
LW
4383- reginsert - insert an operator in front of already-emitted operand
4384*
4385* Means relocating the operand.
4386*/
76e3520e 4387STATIC void
830247a4 4388S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4389{
c277df42
IZ
4390 register regnode *src;
4391 register regnode *dst;
4392 register regnode *place;
4393 register int offset = regarglen[(U8)op];
b81d288d 4394
22c35a8c 4395/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4396
4397 if (SIZE_ONLY) {
830247a4 4398 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4399 return;
4400 }
a687059c 4401
830247a4
IZ
4402 src = RExC_emit;
4403 RExC_emit += NODE_STEP_REGNODE + offset;
4404 dst = RExC_emit;
fac92740 4405 while (src > opnd) {
c277df42 4406 StructCopy(--src, --dst, regnode);
fac92740 4407 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4408 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4409 "reg_insert",
ccb2c380
MP
4410 __LINE__,
4411 reg_name[op],
fac92740
MJD
4412 dst - RExC_emit_start > RExC_offsets[0]
4413 ? "Overwriting end of array!\n" : "OK",
4414 src - RExC_emit_start,
4415 dst - RExC_emit_start,
4416 RExC_offsets[0]));
ccb2c380
MP
4417 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4418 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4419 }
4420 }
4421
a0d0e21e
LW
4422
4423 place = opnd; /* Op node, where operand used to be. */
fac92740 4424 if (RExC_offsets) { /* MJD */
ccb2c380 4425 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4426 "reginsert",
ccb2c380
MP
4427 __LINE__,
4428 reg_name[op],
fac92740
MJD
4429 place - RExC_emit_start > RExC_offsets[0]
4430 ? "Overwriting end of array!\n" : "OK",
4431 place - RExC_emit_start,
4432 RExC_parse - RExC_start,
4433 RExC_offsets[0]));
ccb2c380 4434 Set_Node_Offset(place, RExC_parse);
45948336 4435 Set_Node_Length(place, 1);
fac92740 4436 }
c277df42
IZ
4437 src = NEXTOPER(place);
4438 FILL_ADVANCE_NODE(place, op);
4439 Zero(src, offset, regnode);
a687059c
LW
4440}
4441
4442/*
c277df42 4443- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4444*/
76e3520e 4445STATIC void
830247a4 4446S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4447{
c277df42
IZ
4448 register regnode *scan;
4449 register regnode *temp;
a0d0e21e 4450
c277df42 4451 if (SIZE_ONLY)
a0d0e21e
LW
4452 return;
4453
4454 /* Find last node. */
4455 scan = p;
4456 for (;;) {
4457 temp = regnext(scan);
4458 if (temp == NULL)
4459 break;
4460 scan = temp;
4461 }
a687059c 4462
c277df42
IZ
4463 if (reg_off_by_arg[OP(scan)]) {
4464 ARG_SET(scan, val - scan);
a0ed51b3
LW
4465 }
4466 else {
c277df42
IZ
4467 NEXT_OFF(scan) = val - scan;
4468 }
a687059c
LW
4469}
4470
4471/*
a0d0e21e
LW
4472- regoptail - regtail on operand of first argument; nop if operandless
4473*/
76e3520e 4474STATIC void
830247a4 4475S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4476{
a0d0e21e 4477 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4478 if (p == NULL || SIZE_ONLY)
4479 return;
22c35a8c 4480 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4481 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4482 }
22c35a8c 4483 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4484 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4485 }
4486 else
a0d0e21e 4487 return;
a687059c
LW
4488}
4489
4490/*
4491 - regcurly - a little FSA that accepts {\d+,?\d*}
4492 */
79072805 4493STATIC I32
cea2e8a9 4494S_regcurly(pTHX_ register char *s)
a687059c
LW
4495{
4496 if (*s++ != '{')
4497 return FALSE;
f0fcb552 4498 if (!isDIGIT(*s))
a687059c 4499 return FALSE;
f0fcb552 4500 while (isDIGIT(*s))
a687059c
LW
4501 s++;
4502 if (*s == ',')
4503 s++;
f0fcb552 4504 while (isDIGIT(*s))
a687059c
LW
4505 s++;
4506 if (*s != '}')
4507 return FALSE;
4508 return TRUE;
4509}
4510
a687059c 4511
8fa7f367
JH
4512#ifdef DEBUGGING
4513
76e3520e 4514STATIC regnode *
cea2e8a9 4515S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4516{
f248d071 4517 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4518 register regnode *next;
c277df42
IZ
4519
4520 while (op != END && (!last || node < last)) {
4521 /* While that wasn't END last time... */
4522
4523 NODE_ALIGN(node);
4524 op = OP(node);
4525 if (op == CLOSE)
4526 l--;
4527 next = regnext(node);
4528 /* Where, what. */
4529 if (OP(node) == OPTIMIZED)
4530 goto after_print;
4531 regprop(sv, node);
b900a521 4532 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4533 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4534 if (next == NULL) /* Next ptr. */
4535 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4536 else
b900a521 4537 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4538 (void)PerlIO_putc(Perl_debug_log, '\n');
4539 after_print:
22c35a8c 4540 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4541 register regnode *nnode = (OP(next) == LONGJMP
4542 ? regnext(next)
c277df42
IZ
4543 : next);
4544 if (last && nnode > last)
4545 nnode = last;
4546 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4547 }
22c35a8c 4548 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4549 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4550 }
4551 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4552 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4553 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4554 }
22c35a8c 4555 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4556 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4557 next, sv, l + 1);
a0ed51b3
LW
4558 }
4559 else if ( op == PLUS || op == STAR) {
c277df42 4560 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4561 }
4562 else if (op == ANYOF) {
4f66b38d
HS
4563 /* arglen 1 + class block */
4564 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4565 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4566 node = NEXTOPER(node);
a0ed51b3 4567 }
22c35a8c 4568 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4569 /* Literal string, where present. */
cd439c50 4570 node += NODE_SZ_STR(node) - 1;
c277df42 4571 node = NEXTOPER(node);
a0ed51b3
LW
4572 }
4573 else {
c277df42
IZ
4574 node = NEXTOPER(node);
4575 node += regarglen[(U8)op];
4576 }
4577 if (op == CURLYX || op == OPEN)
4578 l++;
4579 else if (op == WHILEM)
4580 l--;
4581 }
4582 return node;
4583}
4584
8fa7f367
JH
4585#endif /* DEBUGGING */
4586
a687059c 4587/*
fd181c75 4588 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4589 */
4590void
864dbfa3 4591Perl_regdump(pTHX_ regexp *r)
a687059c 4592{
35ff7856 4593#ifdef DEBUGGING
46fc3d4c 4594 SV *sv = sv_newmortal();
a687059c 4595
c277df42 4596 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4597
4598 /* Header fields of interest. */
c277df42 4599 if (r->anchored_substr)
7b0972df 4600 PerlIO_printf(Perl_debug_log,
b81d288d 4601 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4602 PL_colors[0],
7b0972df 4603 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4604 SvPVX(r->anchored_substr),
3280af22 4605 PL_colors[1],
c277df42 4606 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4607 (IV)r->anchored_offset);
33b8afdf
JH
4608 else if (r->anchored_utf8)
4609 PerlIO_printf(Perl_debug_log,
4610 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4611 PL_colors[0],
4612 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4613 SvPVX(r->anchored_utf8),
4614 PL_colors[1],
4615 SvTAIL(r->anchored_utf8) ? "$" : "",
4616 (IV)r->anchored_offset);
c277df42 4617 if (r->float_substr)
7b0972df 4618 PerlIO_printf(Perl_debug_log,
b81d288d 4619 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4620 PL_colors[0],
b81d288d 4621 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4622 SvPVX(r->float_substr),
3280af22 4623 PL_colors[1],
c277df42 4624 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4625 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4626 else if (r->float_utf8)
4627 PerlIO_printf(Perl_debug_log,
4628 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4629 PL_colors[0],
4630 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4631 SvPVX(r->float_utf8),
4632 PL_colors[1],
4633 SvTAIL(r->float_utf8) ? "$" : "",
4634 (IV)r->float_min_offset, (UV)r->float_max_offset);
4635 if (r->check_substr || r->check_utf8)
b81d288d
AB
4636 PerlIO_printf(Perl_debug_log,
4637 r->check_substr == r->float_substr
33b8afdf 4638 && r->check_utf8 == r->float_utf8
c277df42
IZ
4639 ? "(checking floating" : "(checking anchored");
4640 if (r->reganch & ROPT_NOSCAN)
4641 PerlIO_printf(Perl_debug_log, " noscan");
4642 if (r->reganch & ROPT_CHECK_ALL)
4643 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4644 if (r->check_substr || r->check_utf8)
c277df42
IZ
4645 PerlIO_printf(Perl_debug_log, ") ");
4646
46fc3d4c 4647 if (r->regstclass) {
4648 regprop(sv, r->regstclass);
4649 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4650 }
774d564b 4651 if (r->reganch & ROPT_ANCH) {
4652 PerlIO_printf(Perl_debug_log, "anchored");
4653 if (r->reganch & ROPT_ANCH_BOL)
4654 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4655 if (r->reganch & ROPT_ANCH_MBOL)
4656 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4657 if (r->reganch & ROPT_ANCH_SBOL)
4658 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4659 if (r->reganch & ROPT_ANCH_GPOS)
4660 PerlIO_printf(Perl_debug_log, "(GPOS)");
4661 PerlIO_putc(Perl_debug_log, ' ');
4662 }
c277df42
IZ
4663 if (r->reganch & ROPT_GPOS_SEEN)
4664 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4665 if (r->reganch & ROPT_SKIP)
760ac839 4666 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4667 if (r->reganch & ROPT_IMPLICIT)
760ac839 4668 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4669 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4670 if (r->reganch & ROPT_EVAL_SEEN)
4671 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4672 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4673 if (r->offsets) {
4674 U32 i;
4675 U32 len = r->offsets[0];
392fbf5d 4676 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4677 for (i = 1; i <= len; i++)
392fbf5d
RB
4678 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4679 (UV)r->offsets[i*2-1],
4680 (UV)r->offsets[i*2]);
fac92740
MJD
4681 PerlIO_printf(Perl_debug_log, "\n");
4682 }
17c3b450 4683#endif /* DEBUGGING */
a687059c
LW
4684}
4685
8fa7f367
JH
4686#ifdef DEBUGGING
4687
653099ff
GS
4688STATIC void
4689S_put_byte(pTHX_ SV *sv, int c)
4690{
7be5a6cf 4691 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4692 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4693 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4694 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4695 else
4696 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4697}
4698
8fa7f367
JH
4699#endif /* DEBUGGING */
4700
a687059c 4701/*
a0d0e21e
LW
4702- regprop - printable representation of opcode
4703*/
46fc3d4c 4704void
864dbfa3 4705Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4706{
35ff7856 4707#ifdef DEBUGGING
9b155405 4708 register int k;
a0d0e21e 4709
54dc92de 4710 sv_setpvn(sv, "", 0);
9b155405 4711 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4712 /* It would be nice to FAIL() here, but this may be called from
4713 regexec.c, and it would be hard to supply pRExC_state. */
4714 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4715 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4716
4717 k = PL_regkind[(U8)OP(o)];
4718
2a782b5b
JH
4719 if (k == EXACT) {
4720 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4721 /* Using is_utf8_string() is a crude hack but it may
4722 * be the best for now since we have no flag "this EXACTish
4723 * node was UTF-8" --jhi */
4724 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4725 char *s = do_utf8 ?
c728cb41
JH
4726 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4727 UNI_DISPLAY_REGEX) :
2a782b5b 4728 STRING(o);
40eddc46 4729 int len = do_utf8 ?
2a782b5b
JH
4730 strlen(s) :
4731 STR_LEN(o);
4732 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4733 PL_colors[0],
4734 len, s,
4735 PL_colors[1]);
4736 }
9b155405 4737 else if (k == CURLY) {
cb434fcc 4738 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4739 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4740 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4741 }
2c2d71f5
JH
4742 else if (k == WHILEM && o->flags) /* Ordinal/of */
4743 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4744 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4745 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4746 else if (k == LOGICAL)
04ebc1ab 4747 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4748 else if (k == ANYOF) {
4749 int i, rangestart = -1;
ffc61ed2 4750 U8 flags = ANYOF_FLAGS(o);
a6d05634 4751 const char * const anyofs[] = { /* Should be synchronized with
19860706 4752 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4753 "\\w",
4754 "\\W",
4755 "\\s",
4756 "\\S",
4757 "\\d",
4758 "\\D",
4759 "[:alnum:]",
4760 "[:^alnum:]",
4761 "[:alpha:]",
4762 "[:^alpha:]",
4763 "[:ascii:]",
4764 "[:^ascii:]",
4765 "[:ctrl:]",
4766 "[:^ctrl:]",
4767 "[:graph:]",
4768 "[:^graph:]",
4769 "[:lower:]",
4770 "[:^lower:]",
4771 "[:print:]",
4772 "[:^print:]",
4773 "[:punct:]",
4774 "[:^punct:]",
4775 "[:upper:]",
aaa51d5e 4776 "[:^upper:]",
653099ff 4777 "[:xdigit:]",
aaa51d5e
JF
4778 "[:^xdigit:]",
4779 "[:space:]",
4780 "[:^space:]",
4781 "[:blank:]",
4782 "[:^blank:]"
653099ff
GS
4783 };
4784
19860706 4785 if (flags & ANYOF_LOCALE)
653099ff 4786 sv_catpv(sv, "{loc}");
19860706 4787 if (flags & ANYOF_FOLD)
653099ff
GS
4788 sv_catpv(sv, "{i}");
4789 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4790 if (flags & ANYOF_INVERT)
653099ff 4791 sv_catpv(sv, "^");
ffc61ed2
JH
4792 for (i = 0; i <= 256; i++) {
4793 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4794 if (rangestart == -1)
4795 rangestart = i;
4796 } else if (rangestart != -1) {
4797 if (i <= rangestart + 3)
4798 for (; rangestart < i; rangestart++)
653099ff 4799 put_byte(sv, rangestart);
ffc61ed2
JH
4800 else {
4801 put_byte(sv, rangestart);
4802 sv_catpv(sv, "-");
4803 put_byte(sv, i - 1);
653099ff 4804 }
ffc61ed2 4805 rangestart = -1;
653099ff 4806 }
847a199f 4807 }
ffc61ed2
JH
4808
4809 if (o->flags & ANYOF_CLASS)
4810 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4811 if (ANYOF_CLASS_TEST(o,i))
4812 sv_catpv(sv, anyofs[i]);
4813
4814 if (flags & ANYOF_UNICODE)
4815 sv_catpv(sv, "{unicode}");
1aa99e6b 4816 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4817 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4818
4819 {
4820 SV *lv;
9e55ce06 4821 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4822
ffc61ed2
JH
4823 if (lv) {
4824 if (sw) {
ffc61ed2 4825 U8 s[UTF8_MAXLEN+1];
b81d288d 4826
ffc61ed2 4827 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4828 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4829
3568d838 4830 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4831 if (rangestart == -1)
4832 rangestart = i;
4833 } else if (rangestart != -1) {
4834 U8 *p;
b81d288d 4835
ffc61ed2
JH
4836 if (i <= rangestart + 3)
4837 for (; rangestart < i; rangestart++) {
2b9d42f0 4838 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4839 put_byte(sv, *p);
4840 }
4841 else {
2b9d42f0 4842 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4843 put_byte(sv, *p);
4844 sv_catpv(sv, "-");
2b9d42f0 4845 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4846 put_byte(sv, *p);
4847 }
4848 rangestart = -1;
4849 }
19860706 4850 }
ffc61ed2
JH
4851
4852 sv_catpv(sv, "..."); /* et cetera */
19860706 4853 }
fde631ed 4854
ffc61ed2
JH
4855 {
4856 char *s = savepv(SvPVX(lv));
4857 char *origs = s;
b81d288d 4858
ffc61ed2 4859 while(*s && *s != '\n') s++;
b81d288d 4860
ffc61ed2
JH
4861 if (*s == '\n') {
4862 char *t = ++s;
4863
4864 while (*s) {
4865 if (*s == '\n')
4866 *s = ' ';
4867 s++;
4868 }
4869 if (s[-1] == ' ')
4870 s[-1] = 0;
4871
4872 sv_catpv(sv, t);
fde631ed 4873 }
b81d288d 4874
ffc61ed2 4875 Safefree(origs);
fde631ed
JH
4876 }
4877 }
653099ff 4878 }
ffc61ed2 4879
653099ff
GS
4880 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4881 }
9b155405 4882 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4883 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4884#endif /* DEBUGGING */
35ff7856 4885}
a687059c 4886
cad2e5aa
JH
4887SV *
4888Perl_re_intuit_string(pTHX_ regexp *prog)
4889{ /* Assume that RE_INTUIT is set */
4890 DEBUG_r(
4891 { STRLEN n_a;
33b8afdf
JH
4892 char *s = SvPV(prog->check_substr
4893 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4894
4895 if (!PL_colorset) reginitcolors();
4896 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4897 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4898 PL_colors[4],
4899 prog->check_substr ? "" : "utf8 ",
4900 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4901 s,
4902 PL_colors[1],
4903 (strlen(s) > 60 ? "..." : ""));
4904 } );
4905
33b8afdf 4906 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4907}
4908
2b69d0c2 4909void
864dbfa3 4910Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4911{
9e55ce06
JH
4912#ifdef DEBUGGING
4913 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4914#endif
7821416a
IZ
4915
4916 if (!r || (--r->refcnt > 0))
4917 return;
9e55ce06 4918 DEBUG_r({
d103360b
HS
4919 int len;
4920 char *s;
4921
4922 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4923 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4924 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4925 len = SvCUR(dsv);
9e55ce06
JH
4926 if (!PL_colorset)
4927 reginitcolors();
4928 PerlIO_printf(Perl_debug_log,
4929 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4930 PL_colors[4],PL_colors[5],PL_colors[0],
4931 len, len, s,
4932 PL_colors[1],
4933 len > 60 ? "..." : "");
4934 });
cad2e5aa 4935
c277df42 4936 if (r->precomp)
a0d0e21e 4937 Safefree(r->precomp);
fac92740
MJD
4938 if (r->offsets) /* 20010421 MJD */
4939 Safefree(r->offsets);
ed252734
NC
4940 RX_MATCH_COPY_FREE(r);
4941#ifdef PERL_COPY_ON_WRITE
4942 if (r->saved_copy)
4943 SvREFCNT_dec(r->saved_copy);
4944#endif
a193d654
GS
4945 if (r->substrs) {
4946 if (r->anchored_substr)
4947 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4948 if (r->anchored_utf8)
4949 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4950 if (r->float_substr)
4951 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4952 if (r->float_utf8)
4953 SvREFCNT_dec(r->float_utf8);
2779dcf1 4954 Safefree(r->substrs);
a193d654 4955 }
c277df42
IZ
4956 if (r->data) {
4957 int n = r->data->count;
f3548bdc
DM
4958 PAD* new_comppad = NULL;
4959 PAD* old_comppad;
dfad63ad 4960
c277df42 4961 while (--n >= 0) {
261faec3 4962 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4963 switch (r->data->what[n]) {
4964 case 's':
4965 SvREFCNT_dec((SV*)r->data->data[n]);
4966 break;
653099ff
GS
4967 case 'f':
4968 Safefree(r->data->data[n]);
4969 break;
dfad63ad
HS
4970 case 'p':
4971 new_comppad = (AV*)r->data->data[n];
4972 break;
c277df42 4973 case 'o':
dfad63ad 4974 if (new_comppad == NULL)
cea2e8a9 4975 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4976 PAD_SAVE_LOCAL(old_comppad,
4977 /* Watch out for global destruction's random ordering. */
4978 (SvTYPE(new_comppad) == SVt_PVAV) ?
4979 new_comppad : Null(PAD *)
4980 );
9b978d73
DM
4981 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4982 op_free((OP_4tree*)r->data->data[n]);
4983 }
4984
f3548bdc 4985 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4986 SvREFCNT_dec((SV*)new_comppad);
4987 new_comppad = NULL;
c277df42
IZ
4988 break;
4989 case 'n':
9e55ce06 4990 break;
c277df42 4991 default:
830247a4 4992 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4993 }
4994 }
4995 Safefree(r->data->what);
4996 Safefree(r->data);
a0d0e21e
LW
4997 }
4998 Safefree(r->startp);
4999 Safefree(r->endp);
5000 Safefree(r);
a687059c 5001}
c277df42
IZ
5002
5003/*
5004 - regnext - dig the "next" pointer out of a node
5005 *
5006 * [Note, when REGALIGN is defined there are two places in regmatch()
5007 * that bypass this code for speed.]
5008 */
5009regnode *
864dbfa3 5010Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
5011{
5012 register I32 offset;
5013
3280af22 5014 if (p == &PL_regdummy)
c277df42
IZ
5015 return(NULL);
5016
5017 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5018 if (offset == 0)
5019 return(NULL);
5020
c277df42 5021 return(p+offset);
c277df42
IZ
5022}
5023
01f988be 5024STATIC void
cea2e8a9 5025S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
5026{
5027 va_list args;
5028 STRLEN l1 = strlen(pat1);
5029 STRLEN l2 = strlen(pat2);
5030 char buf[512];
06bf62c7 5031 SV *msv;
c277df42
IZ
5032 char *message;
5033
5034 if (l1 > 510)
5035 l1 = 510;
5036 if (l1 + l2 > 510)
5037 l2 = 510 - l1;
5038 Copy(pat1, buf, l1 , char);
5039 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5040 buf[l1 + l2] = '\n';
5041 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5042#ifdef I_STDARG
5043 /* ANSI variant takes additional second argument */
c277df42 5044 va_start(args, pat2);
8736538c
AS
5045#else
5046 va_start(args);
5047#endif
5a844595 5048 msv = vmess(buf, &args);
c277df42 5049 va_end(args);
06bf62c7 5050 message = SvPV(msv,l1);
c277df42
IZ
5051 if (l1 > 512)
5052 l1 = 512;
5053 Copy(message, buf, l1 , char);
197cf9b9 5054 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 5055 Perl_croak(aTHX_ "%s", buf);
c277df42 5056}
a0ed51b3
LW
5057
5058/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5059
5060void
864dbfa3 5061Perl_save_re_context(pTHX)
b81d288d 5062{
830247a4 5063 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5064 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5065 SAVEPPTR(PL_reginput); /* String-input pointer. */
5066 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5067 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5068 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5069 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5070 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5071 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5072 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5073 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5074 PL_reg_start_tmp = 0;
a0ed51b3
LW
5075 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5076 PL_reg_start_tmpl = 0;
7766f137 5077 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5078 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5079 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5080 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5081 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5082 SAVEVPTR(PL_regcc); /* from regexec.c */
5083 SAVEVPTR(PL_curcop);
7766f137
GS
5084 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5085 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5086 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5087 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5088 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5089 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5090 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5091 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5092 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5093 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5094 PL_reg_oldsaved = Nullch;
5095 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5096 PL_reg_oldsavedlen = 0;
ed252734
NC
5097#ifdef PERL_COPY_ON_WRITE
5098 SAVESPTR(PL_nrs);
5099 PL_nrs = Nullsv;
5100#endif
a5db57d6
GS
5101 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5102 PL_reg_maxiter = 0;
5103 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5104 PL_reg_leftiter = 0;
5105 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5106 PL_reg_poscache = Nullch;
5107 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5108 PL_reg_poscache_size = 0;
5109 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5110 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5111 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5112
5113 {
5114 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5115 U32 i;
ada6e8a9
AMS
5116 GV *mgv;
5117 REGEXP *rx;
5118 char digits[16];
5119
5120 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5121 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5122 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5123 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5124 save_scalar(mgv);
5125 }
5126 }
5127 }
5128
54b6e2fa 5129#ifdef DEBUGGING
b81d288d 5130 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5131#endif
a0ed51b3 5132}
51371543 5133
51371543 5134static void
acfe0abc 5135clear_re(pTHX_ void *r)
51371543
GS
5136{
5137 ReREFCNT_dec((regexp *)r);
5138}
ffbc6a93 5139