This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: ../lib/ExtUtils/t/Embed.t failure on Win32/GCC
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
a687059c
LW
8/* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
10 */
11
12/* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
15 */
16
e50aee73
AD
17/* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
20*/
21
b9d5759e
AD
22#ifdef PERL_EXT_RE_BUILD
23/* need to replace pregcomp et al, so enable that */
24# ifndef PERL_IN_XSUB_RE
25# define PERL_IN_XSUB_RE
26# endif
27/* need access to debugger hooks */
cad2e5aa 28# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
29# define DEBUGGING
30# endif
31#endif
32
33#ifdef PERL_IN_XSUB_RE
d06ea78c 34/* We *really* need to overwrite these symbols: */
56953603
IZ
35# define Perl_pregcomp my_regcomp
36# define Perl_regdump my_regdump
37# define Perl_regprop my_regprop
d06ea78c 38# define Perl_pregfree my_regfree
cad2e5aa
JH
39# define Perl_re_intuit_string my_re_intuit_string
40/* *These* symbols are masked to allow static link. */
d06ea78c 41# define Perl_regnext my_regnext
f0b8d043 42# define Perl_save_re_context my_save_re_context
b81d288d 43# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
44
45# define PERL_NO_GET_CONTEXT
b81d288d 46#endif
56953603 47
f0fcb552 48/*SUPPRESS 112*/
a687059c 49/*
e50aee73 50 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
51 *
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
54 *
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
58 *
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
61 * from defects in it.
62 *
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
65 *
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
68 *
69 *
70 **** Alterations to Henry's code are...
71 ****
4bb101f2
JH
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
a687059c 74 ****
9ef589d8
LW
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
77
a687059c
LW
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
864dbfa3 84#define PERL_IN_REGCOMP_C
a687059c 85#include "perl.h"
d06ea78c 86
acfe0abc 87#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
88# include "INTERN.h"
89#endif
c277df42
IZ
90
91#define REG_COMP_C
a687059c
LW
92#include "regcomp.h"
93
d4cce5f1 94#ifdef op
11343788 95#undef op
d4cce5f1 96#endif /* op */
11343788 97
fe14fcc3 98#ifdef MSDOS
7e4e8c89 99# if defined(BUGGY_MSC6)
fe14fcc3 100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 101# pragma optimize("a",off)
fe14fcc3 102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
103# pragma optimize("w",on )
104# endif /* BUGGY_MSC6 */
fe14fcc3
LW
105#endif /* MSDOS */
106
a687059c
LW
107#ifndef STATIC
108#define STATIC static
109#endif
110
830247a4 111typedef struct RExC_state_t {
e2509266 112 U32 flags; /* are we folding, multilining? */
830247a4
IZ
113 char *precomp; /* uncompiled string. */
114 regexp *rx;
fac92740 115 char *start; /* Start of input for compile */
830247a4
IZ
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 119 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 120 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
126 I32 extralen;
127 I32 seen_zerolen;
128 I32 seen_evals;
1aa99e6b 129 I32 utf8;
830247a4
IZ
130#if ADD_TO_REGEXEC
131 char *starttry; /* -Dr: where regtry was called. */
132#define RExC_starttry (pRExC_state->starttry)
133#endif
134} RExC_state_t;
135
e2509266 136#define RExC_flags (pRExC_state->flags)
830247a4
IZ
137#define RExC_precomp (pRExC_state->precomp)
138#define RExC_rx (pRExC_state->rx)
fac92740 139#define RExC_start (pRExC_state->start)
830247a4
IZ
140#define RExC_end (pRExC_state->end)
141#define RExC_parse (pRExC_state->parse)
142#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 143#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 144#define RExC_emit (pRExC_state->emit)
fac92740 145#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
146#define RExC_naughty (pRExC_state->naughty)
147#define RExC_sawback (pRExC_state->sawback)
148#define RExC_seen (pRExC_state->seen)
149#define RExC_size (pRExC_state->size)
150#define RExC_npar (pRExC_state->npar)
151#define RExC_extralen (pRExC_state->extralen)
152#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 154#define RExC_utf8 (pRExC_state->utf8)
830247a4 155
a687059c
LW
156#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
a687059c 159
35c8bce7
LW
160#ifdef SPSTART
161#undef SPSTART /* dratted cpp namespace... */
162#endif
a687059c
LW
163/*
164 * Flags to be passed up and down.
165 */
a687059c 166#define WORST 0 /* Worst case. */
821b33a5 167#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
168#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169#define SPSTART 0x4 /* Starts with * or +. */
170#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 171
2c2d71f5
JH
172/* Length of a variant. */
173
174typedef struct scan_data_t {
175 I32 len_min;
176 I32 len_delta;
177 I32 pos_min;
178 I32 pos_delta;
179 SV *last_found;
180 I32 last_end; /* min value, <0 unless valid. */
181 I32 last_start_min;
182 I32 last_start_max;
183 SV **longest; /* Either &l_fixed, or &l_float. */
184 SV *longest_fixed;
185 I32 offset_fixed;
186 SV *longest_float;
187 I32 offset_float_min;
188 I32 offset_float_max;
189 I32 flags;
190 I32 whilem_c;
cb434fcc 191 I32 *last_closep;
653099ff 192 struct regnode_charclass_class *start_class;
2c2d71f5
JH
193} scan_data_t;
194
a687059c 195/*
e50aee73 196 * Forward declarations for pregcomp()'s friends.
a687059c 197 */
a0d0e21e 198
b81d288d 199static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
cb434fcc 200 0, 0, 0, 0, 0, 0};
c277df42
IZ
201
202#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203#define SF_BEFORE_SEOL 0x1
204#define SF_BEFORE_MEOL 0x2
205#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
207
09b7f37c
CB
208#ifdef NO_UNARY_PLUS
209# define SF_FIX_SHIFT_EOL (0+2)
210# define SF_FL_SHIFT_EOL (0+4)
211#else
212# define SF_FIX_SHIFT_EOL (+2)
213# define SF_FL_SHIFT_EOL (+4)
214#endif
c277df42
IZ
215
216#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218
219#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221#define SF_IS_INF 0x40
222#define SF_HAS_PAR 0x80
223#define SF_IN_PAR 0x100
224#define SF_HAS_EVAL 0x200
4bfe0158 225#define SCF_DO_SUBSTR 0x400
653099ff
GS
226#define SCF_DO_STCLASS_AND 0x0800
227#define SCF_DO_STCLASS_OR 0x1000
228#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 229#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 230
eb160463 231#define UTF (RExC_utf8 != 0)
e2509266
JH
232#define LOC ((RExC_flags & PMf_LOCALE) != 0)
233#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 234
ffc61ed2 235#define OOB_UNICODE 12345678
93733859 236#define OOB_NAMEDCLASS -1
b8c5462f 237
a0ed51b3
LW
238#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
240
8615cb43 241
b45f050a
JF
242/* length of regex to show in messages that don't mark a position within */
243#define RegexLengthToShowInErrorMessages 127
244
245/*
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
249 */
7253e4e3
RK
250#define MARKER1 "<-- HERE" /* marker as it appears in the description */
251#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 252
7253e4e3 253#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
254
255/*
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
258 * "...".
259 */
ccb2c380
MP
260#define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
263 \
264 if (!SIZE_ONLY) \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
269 ellipses = "..."; \
270 } \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
273} STMT_END
8615cb43 274
b45f050a
JF
275/*
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
278 * "...".
279 */
ccb2c380
MP
280#define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
283 \
284 if (!SIZE_ONLY) \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
289 ellipses = "..."; \
290 } \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
293} STMT_END
b45f050a
JF
294
295
296/*
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 */
ccb2c380
MP
299#define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
303} STMT_END
b45f050a
JF
304
305/*
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 */
ccb2c380
MP
308#define vFAIL(m) STMT_START { \
309 if (!SIZE_ONLY) \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
311 Simple_vFAIL(m); \
312} STMT_END
b45f050a
JF
313
314/*
315 * Like Simple_vFAIL(), but accepts two arguments.
316 */
ccb2c380
MP
317#define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
321} STMT_END
b45f050a
JF
322
323/*
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 */
ccb2c380
MP
326#define vFAIL2(m,a1) STMT_START { \
327 if (!SIZE_ONLY) \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
330} STMT_END
b45f050a
JF
331
332
333/*
334 * Like Simple_vFAIL(), but accepts three arguments.
335 */
ccb2c380
MP
336#define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
340} STMT_END
b45f050a
JF
341
342/*
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 */
ccb2c380
MP
345#define vFAIL3(m,a1,a2) STMT_START { \
346 if (!SIZE_ONLY) \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
349} STMT_END
b45f050a
JF
350
351/*
352 * Like Simple_vFAIL(), but accepts four arguments.
353 */
ccb2c380
MP
354#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
358} STMT_END
b45f050a
JF
359
360/*
361 * Like Simple_vFAIL(), but accepts five arguments.
362 */
ccb2c380
MP
363#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
367} STMT_END
368
369
370#define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
374} STMT_END
375
376#define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
381} STMT_END
382
383
384#define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
388} STMT_END
389
390#define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
394} STMT_END
395
396#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
400} STMT_END
401
402#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
406} STMT_END
9d1d55b5 407
8615cb43 408
cd439c50 409/* Allow for side effects in s */
ccb2c380
MP
410#define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
412} STMT_END
cd439c50 413
fac92740
MJD
414/* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
418 */
419
420#define MJD_OFFSET_DEBUG(x)
ccb2c380
MP
421/* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
422
423
424#define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 if (! SIZE_ONLY) { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
428 if((node) < 0) { \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 } else { \
431 RExC_offsets[2*(node)-1] = (byte); \
432 } \
433 } \
434} STMT_END
435
436#define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439
440#define Set_Node_Length_To_R(node,len) STMT_START { \
441 if (! SIZE_ONLY) { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
444 if((node) < 0) { \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 } else { \
447 RExC_offsets[2*(node)] = (len); \
448 } \
449 } \
450} STMT_END
451
452#define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455#define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
457
458/* Get offsets and lengths */
459#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461
acfe0abc 462static void clear_re(pTHX_ void *r);
4327152a 463
653099ff
GS
464/* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
467
4327152a 468STATIC void
830247a4 469S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 470{
a0ed51b3
LW
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 473
c277df42 474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 475 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
b81d288d 479 data->flags
c277df42
IZ
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 else
482 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
483 }
484 else {
c277df42 485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
486 data->offset_float_max = (l
487 ? data->last_start_max
c277df42 488 : data->pos_min + data->pos_delta);
9051bda5
HS
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
c277df42 491 if (data->flags & SF_BEFORE_EOL)
b81d288d 492 data->flags
c277df42
IZ
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 else
495 data->flags &= ~SF_FL_BEFORE_EOL;
496 }
497 }
498 SvCUR_set(data->last_found, 0);
0eda9292
JH
499 {
500 SV * sv = data->last_found;
501 MAGIC *mg =
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
504 mg->mg_len = 0;
505 }
c277df42
IZ
506 data->last_end = -1;
507 data->flags &= ~SF_BEFORE_EOL;
508}
509
653099ff
GS
510/* Can match anything (initialization) */
511STATIC void
830247a4 512S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 513{
653099ff 514 ANYOF_CLASS_ZERO(cl);
f8bef550 515 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
517 if (LOC)
518 cl->flags |= ANYOF_LOCALE;
519}
520
521/* Can match anything (initialization) */
522STATIC int
523S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
524{
525 int value;
526
aaa51d5e 527 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 return 1;
1aa99e6b
IH
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 return 0;
f8bef550
NC
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
533 return 0;
653099ff
GS
534 return 1;
535}
536
537/* Can match anything (initialization) */
538STATIC void
830247a4 539S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 540{
8ecf7187 541 Zero(cl, 1, struct regnode_charclass_class);
653099ff 542 cl->type = ANYOF;
830247a4 543 cl_anything(pRExC_state, cl);
653099ff
GS
544}
545
546STATIC void
830247a4 547S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 548{
8ecf7187 549 Zero(cl, 1, struct regnode_charclass_class);
653099ff 550 cl->type = ANYOF;
830247a4 551 cl_anything(pRExC_state, cl);
653099ff
GS
552 if (LOC)
553 cl->flags |= ANYOF_LOCALE;
554}
555
556/* 'And' a given class with another one. Can create false positives */
557/* We assume that cl is not inverted */
558STATIC void
559S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
561{
653099ff
GS
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
567 int i;
568
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
572 else
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
1aa99e6b 578
14ebb1a2
JH
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
581 cl->flags &= ~ANYOF_UNICODE_ALL;
582 cl->flags |= ANYOF_UNICODE;
583 ARG_SET(cl, ARG(and_with));
584 }
14ebb1a2
JH
585 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 587 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 590 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
591}
592
593/* 'OR' a given class with another one. Can create false positives */
594/* We assume that cl is not inverted */
595STATIC void
830247a4 596S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 597{
653099ff
GS
598 if (or_with->flags & ANYOF_INVERT) {
599 /* We do not use
600 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601 * <= (B1 | !B2) | (CL1 | !CL2)
602 * which is wasteful if CL2 is small, but we ignore CL2:
603 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604 * XXXX Can we handle case-fold? Unclear:
605 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 */
608 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609 && !(or_with->flags & ANYOF_FOLD)
610 && !(cl->flags & ANYOF_FOLD) ) {
611 int i;
612
613 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614 cl->bitmap[i] |= ~or_with->bitmap[i];
615 } /* XXXX: logic is complicated otherwise */
616 else {
830247a4 617 cl_anything(pRExC_state, cl);
653099ff
GS
618 }
619 } else {
620 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 622 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
623 || (cl->flags & ANYOF_FOLD)) ) {
624 int i;
625
626 /* OR char bitmap and class bitmap separately */
627 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628 cl->bitmap[i] |= or_with->bitmap[i];
629 if (or_with->flags & ANYOF_CLASS) {
630 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631 cl->classflags[i] |= or_with->classflags[i];
632 cl->flags |= ANYOF_CLASS;
633 }
634 }
635 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 636 cl_anything(pRExC_state, cl);
653099ff
GS
637 }
638 }
639 if (or_with->flags & ANYOF_EOS)
640 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
641
642 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643 ARG(cl) != ARG(or_with)) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
646 }
647 if (or_with->flags & ANYOF_UNICODE_ALL) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
650 }
653099ff
GS
651}
652
5d1c421c
JH
653/*
654 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655 * These need to be revisited when a newer toolchain becomes available.
656 */
657#if defined(__sparc64__) && defined(__GNUC__)
658# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659# undef SPARC64_GCC_WORKAROUND
660# define SPARC64_GCC_WORKAROUND 1
661# endif
662#endif
663
653099ff
GS
664/* REx optimizer. Converts nodes into quickier variants "in place".
665 Finds fixed substrings. */
666
c277df42
IZ
667/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668 to the position after last scanned or to NULL. */
669
76e3520e 670STATIC I32
830247a4 671S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
c277df42
IZ
672 /* scanp: Start here (read-write). */
673 /* deltap: Write maxlen-minlen here. */
674 /* last: Stop before this one. */
675{
676 I32 min = 0, pars = 0, code;
677 regnode *scan = *scanp, *next;
678 I32 delta = 0;
679 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 680 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
681 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682 scan_data_t data_fake;
653099ff 683 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
b81d288d 684
c277df42
IZ
685 while (scan && OP(scan) != END && scan < last) {
686 /* Peephole optimizer: */
687
22c35a8c 688 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 689 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
690 regnode *n = regnext(scan);
691 U32 stringok = 1;
692#ifdef DEBUGGING
693 regnode *stop = scan;
b81d288d 694#endif
c277df42 695
cd439c50 696 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
697 /* Skip NOTHING, merge EXACT*. */
698 while (n &&
b81d288d 699 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
700 (stringok && (OP(n) == OP(scan))))
701 && NEXT_OFF(n)
702 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703 if (OP(n) == TAIL || n > next)
704 stringok = 0;
22c35a8c 705 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
706 NEXT_OFF(scan) += NEXT_OFF(n);
707 next = n + NODE_STEP_REGNODE;
708#ifdef DEBUGGING
709 if (stringok)
710 stop = n;
b81d288d 711#endif
c277df42 712 n = regnext(n);
a0ed51b3 713 }
f49d4d0f 714 else if (stringok) {
cd439c50 715 int oldl = STR_LEN(scan);
c277df42 716 regnode *nnext = regnext(n);
f49d4d0f 717
b81d288d 718 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
719 break;
720 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
721 STR_LEN(scan) += STR_LEN(n);
722 next = n + NODE_SZ_STR(n);
c277df42 723 /* Now we can overwrite *n : */
f49d4d0f 724 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 725#ifdef DEBUGGING
f49d4d0f 726 stop = next - 1;
b81d288d 727#endif
c277df42
IZ
728 n = nnext;
729 }
730 }
61a36c01 731
d65e4eab 732 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
61a36c01
JH
733/*
734 Two problematic code points in Unicode casefolding of EXACT nodes:
735
736 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
738
739 which casefold to
740
741 Unicode UTF-8
742
743 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
744 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
745
746 This means that in case-insensitive matching (or "loose matching",
747 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748 length of the above casefolded versions) can match a target string
749 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750 This would rather mess up the minimum length computation.
751
752 What we'll do is to look for the tail four bytes, and then peek
753 at the preceding two bytes to see whether we need to decrease
754 the minimum length by four (six minus two).
755
756 Thanks to the design of UTF-8, there cannot be false matches:
757 A sequence of valid UTF-8 bytes cannot be a subsequence of
758 another valid sequence of UTF-8 bytes.
759
760*/
761 char *s0 = STRING(scan), *s, *t;
762 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763 char *t0 = "\xcc\x88\xcc\x81";
764 char *t1 = t0 + 3;
765
766 for (s = s0 + 2;
767 s < s2 && (t = ninstr(s, s1, t0, t1));
768 s = t + 4) {
769 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
771 min -= 4;
772 }
773 }
774
c277df42
IZ
775#ifdef DEBUGGING
776 /* Allow dumping */
cd439c50 777 n = scan + NODE_SZ_STR(scan);
c277df42 778 while (n <= stop) {
22c35a8c 779 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
780 OP(n) = OPTIMIZED;
781 NEXT_OFF(n) = 0;
782 }
783 n++;
784 }
653099ff 785#endif
c277df42 786 }
653099ff
GS
787 /* Follow the next-chain of the current node and optimize
788 away all the NOTHINGs from it. */
c277df42 789 if (OP(scan) != CURLYX) {
048cfca1
GS
790 int max = (reg_off_by_arg[OP(scan)]
791 ? I32_MAX
792 /* I32 may be smaller than U16 on CRAYs! */
793 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
794 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
795 int noff;
796 regnode *n = scan;
b81d288d 797
c277df42
IZ
798 /* Skip NOTHING and LONGJMP. */
799 while ((n = regnext(n))
22c35a8c 800 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
801 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
802 && off + noff < max)
803 off += noff;
804 if (reg_off_by_arg[OP(scan)])
805 ARG(scan) = off;
b81d288d 806 else
c277df42
IZ
807 NEXT_OFF(scan) = off;
808 }
653099ff
GS
809 /* The principal pseudo-switch. Cannot be a switch, since we
810 look into several different things. */
b81d288d 811 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
812 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813 next = regnext(scan);
814 code = OP(scan);
b81d288d
AB
815
816 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 817 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 818 struct regnode_charclass_class accum;
c277df42 819
653099ff 820 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 821 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 822 if (flags & SCF_DO_STCLASS)
830247a4 823 cl_init_zero(pRExC_state, &accum);
c277df42 824 while (OP(scan) == code) {
830247a4 825 I32 deltanext, minnext, f = 0, fake;
653099ff 826 struct regnode_charclass_class this_class;
c277df42
IZ
827
828 num++;
829 data_fake.flags = 0;
b81d288d 830 if (data) {
2c2d71f5 831 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
832 data_fake.last_closep = data->last_closep;
833 }
834 else
835 data_fake.last_closep = &fake;
c277df42
IZ
836 next = regnext(scan);
837 scan = NEXTOPER(scan);
838 if (code != BRANCH)
839 scan = NEXTOPER(scan);
653099ff 840 if (flags & SCF_DO_STCLASS) {
830247a4 841 cl_init(pRExC_state, &this_class);
653099ff
GS
842 data_fake.start_class = &this_class;
843 f = SCF_DO_STCLASS_AND;
b81d288d 844 }
e1901655
IZ
845 if (flags & SCF_WHILEM_VISITED_POS)
846 f |= SCF_WHILEM_VISITED_POS;
653099ff 847 /* we suppose the run is continuous, last=next...*/
830247a4
IZ
848 minnext = study_chunk(pRExC_state, &scan, &deltanext,
849 next, &data_fake, f);
b81d288d 850 if (min1 > minnext)
c277df42
IZ
851 min1 = minnext;
852 if (max1 < minnext + deltanext)
853 max1 = minnext + deltanext;
854 if (deltanext == I32_MAX)
aca2d497 855 is_inf = is_inf_internal = 1;
c277df42
IZ
856 scan = next;
857 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
858 pars++;
405ff068 859 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 860 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
861 if (data)
862 data->whilem_c = data_fake.whilem_c;
653099ff 863 if (flags & SCF_DO_STCLASS)
830247a4 864 cl_or(pRExC_state, &accum, &this_class);
b81d288d 865 if (code == SUSPEND)
c277df42
IZ
866 break;
867 }
868 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
869 min1 = 0;
870 if (flags & SCF_DO_SUBSTR) {
871 data->pos_min += min1;
872 data->pos_delta += max1 - min1;
873 if (max1 != min1 || is_inf)
874 data->longest = &(data->longest_float);
875 }
876 min += min1;
877 delta += max1 - min1;
653099ff 878 if (flags & SCF_DO_STCLASS_OR) {
830247a4 879 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
880 if (min1) {
881 cl_and(data->start_class, &and_with);
882 flags &= ~SCF_DO_STCLASS;
883 }
884 }
885 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
886 if (min1) {
887 cl_and(data->start_class, &accum);
653099ff 888 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
889 }
890 else {
b81d288d 891 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
892 * data->start_class */
893 StructCopy(data->start_class, &and_with,
894 struct regnode_charclass_class);
895 flags &= ~SCF_DO_STCLASS_AND;
896 StructCopy(&accum, data->start_class,
897 struct regnode_charclass_class);
898 flags |= SCF_DO_STCLASS_OR;
899 data->start_class->flags |= ANYOF_EOS;
900 }
653099ff 901 }
a0ed51b3
LW
902 }
903 else if (code == BRANCHJ) /* single branch is optimized. */
c277df42
IZ
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
907 continue;
a0ed51b3
LW
908 }
909 else if (OP(scan) == EXACT) {
cd439c50 910 I32 l = STR_LEN(scan);
1aa99e6b 911 UV uc = *((U8*)STRING(scan));
a0ed51b3 912 if (UTF) {
1aa99e6b
IH
913 U8 *s = (U8*)STRING(scan);
914 l = utf8_length(s, s + l);
9041c2e3 915 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
916 }
917 min += l;
c277df42 918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
b81d288d 924 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 925 }
cd439c50 926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
927 {
928 SV * sv = data->last_found;
929 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
5e43f467
JH
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 934 }
33b8afdf
JH
935 if (UTF)
936 SvUTF8_on(data->last_found);
c277df42
IZ
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
940 }
653099ff
GS
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
943 int compat = 1;
944
1aa99e6b 945 if (uc >= 0x100 ||
516a5887 946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 948 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 950 )
653099ff
GS
951 compat = 0;
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
954 if (compat)
1aa99e6b 955 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 956 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
957 if (uc < 0x100)
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
959 }
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
1aa99e6b 962 if (uc < 0x100)
9b877dbb
IH
963 ANYOF_BITMAP_SET(data->start_class, uc);
964 else
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
968 }
969 flags &= ~SCF_DO_STCLASS;
a0ed51b3 970 }
653099ff 971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 972 I32 l = STR_LEN(scan);
1aa99e6b 973 UV uc = *((U8*)STRING(scan));
653099ff
GS
974
975 /* Search for fixed substrings supports EXACT only. */
b81d288d 976 if (flags & SCF_DO_SUBSTR)
830247a4 977 scan_commit(pRExC_state, data);
a0ed51b3 978 if (UTF) {
1aa99e6b
IH
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
9041c2e3 981 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
982 }
983 min += l;
c277df42 984 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 985 data->pos_min += l;
653099ff
GS
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
988 int compat = 1;
989
1aa99e6b 990 if (uc >= 0x100 ||
516a5887 991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
994 compat = 0;
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
997 if (compat) {
1aa99e6b 998 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1003 }
1004 }
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1aa99e6b
IH
1009 if (uc < 0x100)
1010 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
1011 data->start_class->flags &= ~ANYOF_EOS;
1012 }
1013 cl_and(data->start_class, &and_with);
1014 }
1015 flags &= ~SCF_DO_STCLASS;
a0ed51b3 1016 }
4d61ec05 1017 else if (strchr((char*)PL_varies,OP(scan))) {
9c5ffd7c 1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 1019 I32 f = flags, pos_before = 0;
c277df42 1020 regnode *oscan = scan;
653099ff
GS
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
727f22e3 1023 I32 next_is_eval = 0;
653099ff 1024
22c35a8c 1025 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1026 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
1027 scan = NEXTOPER(scan);
1028 goto finish;
1029 case PLUS:
653099ff 1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 1031 next = NEXTOPER(scan);
653099ff 1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
1033 mincount = 1;
1034 maxcount = REG_INFTY;
c277df42
IZ
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1037 goto do_curly;
1038 }
1039 }
1040 if (flags & SCF_DO_SUBSTR)
1041 data->pos_min++;
1042 min++;
1043 /* Fall through. */
1044 case STAR:
653099ff
GS
1045 if (flags & SCF_DO_STCLASS) {
1046 mincount = 0;
b81d288d 1047 maxcount = REG_INFTY;
653099ff
GS
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1050 goto do_curly;
1051 }
b81d288d 1052 is_inf = is_inf_internal = 1;
c277df42
IZ
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
830247a4 1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
1056 data->longest = &(data->longest_float);
1057 }
1058 goto optimize_curly_tail;
1059 case CURLY:
b81d288d 1060 mincount = ARG1(scan);
c277df42
IZ
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
cb434fcc
IZ
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1065
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1067 }
c277df42 1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 1069 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
1070 do_curly:
1071 if (flags & SCF_DO_SUBSTR) {
830247a4 1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
1073 pos_before = data->pos_min;
1074 }
1075 if (data) {
1076 fl = data->flags;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1078 if (is_inf)
1079 data->flags |= SF_IS_INF;
1080 }
653099ff 1081 if (flags & SCF_DO_STCLASS) {
830247a4 1082 cl_init(pRExC_state, &this_class);
653099ff
GS
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1087 }
e1901655
IZ
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
b81d288d 1091
e1901655
IZ
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 1096
c277df42 1097 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d
AB
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1099 mincount == 0
653099ff
GS
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1101
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1106 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1107 }
1108 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 1109 /* Switch to OR mode: cache the old value of
653099ff
GS
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1118 }
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1121 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
1122 cl_and(data->start_class, &and_with);
1123 }
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1127 }
c277df42
IZ
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1129 scan = next;
84037bb0 1130 if (ckWARN(WARN_REGEXP)
727f22e3
JP
1131 /* ? quantifier ok, except for (?{ ... }) */
1132 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 1133 && (minnext == 0) && (deltanext == 0)
99799961 1134 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 1135 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 1136 {
830247a4 1137 vWARN(RExC_parse,
b45f050a
JF
1138 "Quantifier unexpected on zero-length expression");
1139 }
1140
c277df42 1141 min += minnext * mincount;
b81d288d 1142 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
aca2d497 1145 is_inf |= is_inf_internal;
c277df42
IZ
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1147
1148 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 1149 if ( OP(oscan) == CURLYX && data
c277df42
IZ
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
1155 regnode *nxt1 = nxt;
1156#ifdef DEBUGGING
1157 regnode *nxt2;
1158#endif
c277df42
IZ
1159
1160 /* Skip open. */
1161 nxt = regnext(nxt);
4d61ec05 1162 if (!strchr((char*)PL_simple,OP(nxt))
22c35a8c 1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 1164 && STR_LEN(nxt) == 1))
c277df42 1165 goto nogo;
497b47a8 1166#ifdef DEBUGGING
c277df42 1167 nxt2 = nxt;
497b47a8 1168#endif
c277df42 1169 nxt = regnext(nxt);
b81d288d 1170 if (OP(nxt) != CLOSE)
c277df42
IZ
1171 goto nogo;
1172 /* Now we know that nxt2 is the only contents: */
eb160463 1173 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1174 OP(oscan) = CURLYN;
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1176#ifdef DEBUGGING
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 1183#endif
c277df42 1184 }
c277df42
IZ
1185 nogo:
1186
1187 /* Try optimization CURLYX => CURLYM. */
b81d288d 1188 if ( OP(oscan) == CURLYX && data
c277df42 1189 && !(data->flags & SF_HAS_PAR)
c277df42
IZ
1190 && !(data->flags & SF_HAS_EVAL)
1191 && !deltanext ) {
1192 /* XXXX How to optimize if data == 0? */
1193 /* Optimize to a simpler form. */
1194 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1195 regnode *nxt2;
1196
1197 OP(oscan) = CURLYM;
1198 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 1199 && (OP(nxt2) != WHILEM))
c277df42
IZ
1200 nxt = nxt2;
1201 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
1202 /* Need to optimize away parenths. */
1203 if (data->flags & SF_IN_PAR) {
1204 /* Set the parenth number. */
1205 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1206
b81d288d 1207 if (OP(nxt) != CLOSE)
b45f050a 1208 FAIL("Panic opt close");
eb160463 1209 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
1210 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1211 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1212#ifdef DEBUGGING
1213 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1214 OP(nxt + 1) = OPTIMIZED; /* was count. */
1215 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1216 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 1217#endif
c277df42
IZ
1218#if 0
1219 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1220 regnode *nnxt = regnext(nxt1);
b81d288d 1221
c277df42
IZ
1222 if (nnxt == nxt) {
1223 if (reg_off_by_arg[OP(nxt1)])
1224 ARG_SET(nxt1, nxt2 - nxt1);
1225 else if (nxt2 - nxt1 < U16_MAX)
1226 NEXT_OFF(nxt1) = nxt2 - nxt1;
1227 else
1228 OP(nxt) = NOTHING; /* Cannot beautify */
1229 }
1230 nxt1 = nnxt;
1231 }
1232#endif
1233 /* Optimize again: */
b81d288d 1234 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
e1901655 1235 NULL, 0);
a0ed51b3
LW
1236 }
1237 else
c277df42 1238 oscan->flags = 0;
c277df42 1239 }
e1901655
IZ
1240 else if ((OP(oscan) == CURLYX)
1241 && (flags & SCF_WHILEM_VISITED_POS)
1242 /* See the comment on a similar expression above.
1243 However, this time it not a subexpression
1244 we care about, but the expression itself. */
1245 && (maxcount == REG_INFTY)
1246 && data && ++data->whilem_c < 16) {
1247 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
1248 /* Find WHILEM (as in regexec.c) */
1249 regnode *nxt = oscan + NEXT_OFF(oscan);
1250
1251 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1252 nxt += ARG(nxt);
eb160463
GS
1253 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1254 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 1255 }
b81d288d 1256 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
1257 pars++;
1258 if (flags & SCF_DO_SUBSTR) {
1259 SV *last_str = Nullsv;
1260 int counted = mincount != 0;
1261
1262 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
1263#if defined(SPARC64_GCC_WORKAROUND)
1264 I32 b = 0;
1265 STRLEN l = 0;
1266 char *s = NULL;
1267 I32 old = 0;
1268
1269 if (pos_before >= data->last_start_min)
1270 b = pos_before;
1271 else
1272 b = data->last_start_min;
1273
1274 l = 0;
1275 s = SvPV(data->last_found, l);
1276 old = b - data->last_start_min;
1277
1278#else
b81d288d 1279 I32 b = pos_before >= data->last_start_min
c277df42
IZ
1280 ? pos_before : data->last_start_min;
1281 STRLEN l;
1282 char *s = SvPV(data->last_found, l);
a0ed51b3 1283 I32 old = b - data->last_start_min;
5d1c421c 1284#endif
a0ed51b3
LW
1285
1286 if (UTF)
1287 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 1288
a0ed51b3 1289 l -= old;
c277df42 1290 /* Get the added string: */
79cb57f6 1291 last_str = newSVpvn(s + old, l);
0e933229
IH
1292 if (UTF)
1293 SvUTF8_on(last_str);
c277df42
IZ
1294 if (deltanext == 0 && pos_before == b) {
1295 /* What was added is a constant string */
1296 if (mincount > 1) {
1297 SvGROW(last_str, (mincount * l) + 1);
b81d288d 1298 repeatcpy(SvPVX(last_str) + l,
c277df42
IZ
1299 SvPVX(last_str), l, mincount - 1);
1300 SvCUR(last_str) *= mincount;
1301 /* Add additional parts. */
b81d288d 1302 SvCUR_set(data->last_found,
c277df42
IZ
1303 SvCUR(data->last_found) - l);
1304 sv_catsv(data->last_found, last_str);
0eda9292
JH
1305 {
1306 SV * sv = data->last_found;
1307 MAGIC *mg =
1308 SvUTF8(sv) && SvMAGICAL(sv) ?
1309 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1310 if (mg && mg->mg_len >= 0)
1311 mg->mg_len += CHR_SVLEN(last_str);
1312 }
c277df42
IZ
1313 data->last_end += l * (mincount - 1);
1314 }
2a8d9689
HS
1315 } else {
1316 /* start offset must point into the last copy */
1317 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
1318 data->last_start_max += is_inf ? I32_MAX
1319 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
1320 }
1321 }
1322 /* It is counted once already... */
1323 data->pos_min += minnext * (mincount - counted);
1324 data->pos_delta += - counted * deltanext +
1325 (minnext + deltanext) * maxcount - minnext * mincount;
1326 if (mincount != maxcount) {
653099ff
GS
1327 /* Cannot extend fixed substrings found inside
1328 the group. */
830247a4 1329 scan_commit(pRExC_state,data);
c277df42
IZ
1330 if (mincount && last_str) {
1331 sv_setsv(data->last_found, last_str);
1332 data->last_end = data->pos_min;
b81d288d 1333 data->last_start_min =
a0ed51b3 1334 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
1335 data->last_start_max = is_inf
1336 ? I32_MAX
c277df42 1337 : data->pos_min + data->pos_delta
a0ed51b3 1338 - CHR_SVLEN(last_str);
c277df42
IZ
1339 }
1340 data->longest = &(data->longest_float);
1341 }
aca2d497 1342 SvREFCNT_dec(last_str);
c277df42 1343 }
405ff068 1344 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
1345 data->flags |= SF_HAS_EVAL;
1346 optimize_curly_tail:
c277df42 1347 if (OP(oscan) != CURLYX) {
22c35a8c 1348 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
1349 && NEXT_OFF(next))
1350 NEXT_OFF(oscan) += NEXT_OFF(next);
1351 }
c277df42 1352 continue;
653099ff 1353 default: /* REF and CLUMP only? */
c277df42 1354 if (flags & SCF_DO_SUBSTR) {
830247a4 1355 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
1356 data->longest = &(data->longest_float);
1357 }
aca2d497 1358 is_inf = is_inf_internal = 1;
653099ff 1359 if (flags & SCF_DO_STCLASS_OR)
830247a4 1360 cl_anything(pRExC_state, data->start_class);
653099ff 1361 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
1362 break;
1363 }
a0ed51b3 1364 }
ffc61ed2 1365 else if (strchr((char*)PL_simple,OP(scan))) {
9c5ffd7c 1366 int value = 0;
653099ff 1367
c277df42 1368 if (flags & SCF_DO_SUBSTR) {
830247a4 1369 scan_commit(pRExC_state,data);
c277df42
IZ
1370 data->pos_min++;
1371 }
1372 min++;
653099ff
GS
1373 if (flags & SCF_DO_STCLASS) {
1374 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1375
1376 /* Some of the logic below assumes that switching
1377 locale on will only add false positives. */
1378 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 1379 case SANY:
653099ff
GS
1380 default:
1381 do_default:
1382 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1383 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1384 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1385 break;
1386 case REG_ANY:
1387 if (OP(scan) == SANY)
1388 goto do_default;
1389 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1390 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1391 || (data->start_class->flags & ANYOF_CLASS));
830247a4 1392 cl_anything(pRExC_state, data->start_class);
653099ff
GS
1393 }
1394 if (flags & SCF_DO_STCLASS_AND || !value)
1395 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1396 break;
1397 case ANYOF:
1398 if (flags & SCF_DO_STCLASS_AND)
1399 cl_and(data->start_class,
1400 (struct regnode_charclass_class*)scan);
1401 else
830247a4 1402 cl_or(pRExC_state, data->start_class,
653099ff
GS
1403 (struct regnode_charclass_class*)scan);
1404 break;
1405 case ALNUM:
1406 if (flags & SCF_DO_STCLASS_AND) {
1407 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1409 for (value = 0; value < 256; value++)
1410 if (!isALNUM(value))
1411 ANYOF_BITMAP_CLEAR(data->start_class, value);
1412 }
1413 }
1414 else {
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1417 else {
1418 for (value = 0; value < 256; value++)
1419 if (isALNUM(value))
b81d288d 1420 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1421 }
1422 }
1423 break;
1424 case ALNUML:
1425 if (flags & SCF_DO_STCLASS_AND) {
1426 if (data->start_class->flags & ANYOF_LOCALE)
1427 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1428 }
1429 else {
1430 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1431 data->start_class->flags |= ANYOF_LOCALE;
1432 }
1433 break;
1434 case NALNUM:
1435 if (flags & SCF_DO_STCLASS_AND) {
1436 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1438 for (value = 0; value < 256; value++)
1439 if (isALNUM(value))
1440 ANYOF_BITMAP_CLEAR(data->start_class, value);
1441 }
1442 }
1443 else {
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1446 else {
1447 for (value = 0; value < 256; value++)
1448 if (!isALNUM(value))
b81d288d 1449 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1450 }
1451 }
1452 break;
1453 case NALNUML:
1454 if (flags & SCF_DO_STCLASS_AND) {
1455 if (data->start_class->flags & ANYOF_LOCALE)
1456 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1457 }
1458 else {
1459 data->start_class->flags |= ANYOF_LOCALE;
1460 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1461 }
1462 break;
1463 case SPACE:
1464 if (flags & SCF_DO_STCLASS_AND) {
1465 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1467 for (value = 0; value < 256; value++)
1468 if (!isSPACE(value))
1469 ANYOF_BITMAP_CLEAR(data->start_class, value);
1470 }
1471 }
1472 else {
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1475 else {
1476 for (value = 0; value < 256; value++)
1477 if (isSPACE(value))
b81d288d 1478 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1479 }
1480 }
1481 break;
1482 case SPACEL:
1483 if (flags & SCF_DO_STCLASS_AND) {
1484 if (data->start_class->flags & ANYOF_LOCALE)
1485 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1486 }
1487 else {
1488 data->start_class->flags |= ANYOF_LOCALE;
1489 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1490 }
1491 break;
1492 case NSPACE:
1493 if (flags & SCF_DO_STCLASS_AND) {
1494 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1495 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1496 for (value = 0; value < 256; value++)
1497 if (isSPACE(value))
1498 ANYOF_BITMAP_CLEAR(data->start_class, value);
1499 }
1500 }
1501 else {
1502 if (data->start_class->flags & ANYOF_LOCALE)
1503 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1504 else {
1505 for (value = 0; value < 256; value++)
1506 if (!isSPACE(value))
b81d288d 1507 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1508 }
1509 }
1510 break;
1511 case NSPACEL:
1512 if (flags & SCF_DO_STCLASS_AND) {
1513 if (data->start_class->flags & ANYOF_LOCALE) {
1514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1515 for (value = 0; value < 256; value++)
1516 if (!isSPACE(value))
1517 ANYOF_BITMAP_CLEAR(data->start_class, value);
1518 }
1519 }
1520 else {
1521 data->start_class->flags |= ANYOF_LOCALE;
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1523 }
1524 break;
1525 case DIGIT:
1526 if (flags & SCF_DO_STCLASS_AND) {
1527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1528 for (value = 0; value < 256; value++)
1529 if (!isDIGIT(value))
1530 ANYOF_BITMAP_CLEAR(data->start_class, value);
1531 }
1532 else {
1533 if (data->start_class->flags & ANYOF_LOCALE)
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1535 else {
1536 for (value = 0; value < 256; value++)
1537 if (isDIGIT(value))
b81d288d 1538 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1539 }
1540 }
1541 break;
1542 case NDIGIT:
1543 if (flags & SCF_DO_STCLASS_AND) {
1544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1545 for (value = 0; value < 256; value++)
1546 if (isDIGIT(value))
1547 ANYOF_BITMAP_CLEAR(data->start_class, value);
1548 }
1549 else {
1550 if (data->start_class->flags & ANYOF_LOCALE)
1551 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1552 else {
1553 for (value = 0; value < 256; value++)
1554 if (!isDIGIT(value))
b81d288d 1555 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
1556 }
1557 }
1558 break;
1559 }
1560 if (flags & SCF_DO_STCLASS_OR)
1561 cl_and(data->start_class, &and_with);
1562 flags &= ~SCF_DO_STCLASS;
1563 }
a0ed51b3 1564 }
22c35a8c 1565 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
1566 data->flags |= (OP(scan) == MEOL
1567 ? SF_BEFORE_MEOL
1568 : SF_BEFORE_SEOL);
a0ed51b3 1569 }
653099ff
GS
1570 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1571 /* Lookbehind, or need to calculate parens/evals/stclass: */
1572 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 1573 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 1574 /* Lookahead/lookbehind */
cb434fcc 1575 I32 deltanext, minnext, fake = 0;
c277df42 1576 regnode *nscan;
653099ff
GS
1577 struct regnode_charclass_class intrnl;
1578 int f = 0;
c277df42
IZ
1579
1580 data_fake.flags = 0;
b81d288d 1581 if (data) {
2c2d71f5 1582 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1583 data_fake.last_closep = data->last_closep;
1584 }
1585 else
1586 data_fake.last_closep = &fake;
653099ff
GS
1587 if ( flags & SCF_DO_STCLASS && !scan->flags
1588 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 1589 cl_init(pRExC_state, &intrnl);
653099ff 1590 data_fake.start_class = &intrnl;
e1901655 1591 f |= SCF_DO_STCLASS_AND;
653099ff 1592 }
e1901655
IZ
1593 if (flags & SCF_WHILEM_VISITED_POS)
1594 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
1595 next = regnext(scan);
1596 nscan = NEXTOPER(NEXTOPER(scan));
830247a4 1597 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
c277df42
IZ
1598 if (scan->flags) {
1599 if (deltanext) {
9baa0206 1600 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
1601 }
1602 else if (minnext > U8_MAX) {
9baa0206 1603 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 1604 }
eb160463 1605 scan->flags = (U8)minnext;
c277df42
IZ
1606 }
1607 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1608 pars++;
405ff068 1609 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1610 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1611 if (data)
1612 data->whilem_c = data_fake.whilem_c;
e1901655 1613 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
1614 int was = (data->start_class->flags & ANYOF_EOS);
1615
1616 cl_and(data->start_class, &intrnl);
1617 if (was)
1618 data->start_class->flags |= ANYOF_EOS;
1619 }
a0ed51b3
LW
1620 }
1621 else if (OP(scan) == OPEN) {
c277df42 1622 pars++;
a0ed51b3 1623 }
cb434fcc 1624 else if (OP(scan) == CLOSE) {
eb160463 1625 if ((I32)ARG(scan) == is_par) {
cb434fcc 1626 next = regnext(scan);
c277df42 1627
cb434fcc
IZ
1628 if ( next && (OP(next) != WHILEM) && next < last)
1629 is_par = 0; /* Disable optimization */
1630 }
1631 if (data)
1632 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
1633 }
1634 else if (OP(scan) == EVAL) {
c277df42
IZ
1635 if (data)
1636 data->flags |= SF_HAS_EVAL;
1637 }
96776eda 1638 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 1639 if (flags & SCF_DO_SUBSTR) {
830247a4 1640 scan_commit(pRExC_state,data);
0f5d15d6
IZ
1641 data->longest = &(data->longest_float);
1642 }
1643 is_inf = is_inf_internal = 1;
653099ff 1644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 1645 cl_anything(pRExC_state, data->start_class);
96776eda 1646 flags &= ~SCF_DO_STCLASS;
0f5d15d6 1647 }
c277df42
IZ
1648 /* Else: zero-length, ignore. */
1649 scan = regnext(scan);
1650 }
1651
1652 finish:
1653 *scanp = scan;
aca2d497 1654 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 1655 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
1656 data->pos_delta = I32_MAX - data->pos_min;
1657 if (is_par > U8_MAX)
1658 is_par = 0;
1659 if (is_par && pars==1 && data) {
1660 data->flags |= SF_IN_PAR;
1661 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
1662 }
1663 else if (pars && data) {
c277df42
IZ
1664 data->flags |= SF_HAS_PAR;
1665 data->flags &= ~SF_IN_PAR;
1666 }
653099ff
GS
1667 if (flags & SCF_DO_STCLASS_OR)
1668 cl_and(data->start_class, &and_with);
c277df42
IZ
1669 return min;
1670}
1671
76e3520e 1672STATIC I32
830247a4 1673S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
c277df42 1674{
830247a4 1675 if (RExC_rx->data) {
b81d288d
AB
1676 Renewc(RExC_rx->data,
1677 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 1678 char, struct reg_data);
830247a4
IZ
1679 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1680 RExC_rx->data->count += n;
a0ed51b3
LW
1681 }
1682 else {
830247a4 1683 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 1684 char, struct reg_data);
830247a4
IZ
1685 New(1208, RExC_rx->data->what, n, U8);
1686 RExC_rx->data->count = n;
c277df42 1687 }
830247a4
IZ
1688 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1689 return RExC_rx->data->count - n;
c277df42
IZ
1690}
1691
d88dccdf 1692void
864dbfa3 1693Perl_reginitcolors(pTHX)
d88dccdf 1694{
d88dccdf
IZ
1695 int i = 0;
1696 char *s = PerlEnv_getenv("PERL_RE_COLORS");
b81d288d 1697
d88dccdf
IZ
1698 if (s) {
1699 PL_colors[0] = s = savepv(s);
1700 while (++i < 6) {
1701 s = strchr(s, '\t');
1702 if (s) {
1703 *s = '\0';
1704 PL_colors[i] = ++s;
1705 }
1706 else
c712d376 1707 PL_colors[i] = s = "";
d88dccdf
IZ
1708 }
1709 } else {
b81d288d 1710 while (i < 6)
d88dccdf
IZ
1711 PL_colors[i++] = "";
1712 }
1713 PL_colorset = 1;
1714}
1715
8615cb43 1716
a687059c 1717/*
e50aee73 1718 - pregcomp - compile a regular expression into internal code
a687059c
LW
1719 *
1720 * We can't allocate space until we know how big the compiled form will be,
1721 * but we can't compile it (and thus know how big it is) until we've got a
1722 * place to put the code. So we cheat: we compile it twice, once with code
1723 * generation turned off and size counting turned on, and once "for real".
1724 * This also means that we don't allocate space until we are sure that the
1725 * thing really will compile successfully, and we never have to move the
1726 * code and thus invalidate pointers into it. (Note that it has to be in
1727 * one piece because free() must be able to free it all.) [NB: not true in perl]
1728 *
1729 * Beware that the optimization-preparation code in here knows about some
1730 * of the structure of the compiled regexp. [I'll say.]
1731 */
1732regexp *
864dbfa3 1733Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 1734{
a0d0e21e 1735 register regexp *r;
c277df42 1736 regnode *scan;
c277df42 1737 regnode *first;
a0d0e21e 1738 I32 flags;
a0d0e21e
LW
1739 I32 minlen = 0;
1740 I32 sawplus = 0;
1741 I32 sawopen = 0;
2c2d71f5 1742 scan_data_t data;
830247a4
IZ
1743 RExC_state_t RExC_state;
1744 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e
LW
1745
1746 if (exp == NULL)
c277df42 1747 FAIL("NULL regexp argument");
a0d0e21e 1748
a5961de5 1749 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 1750
5cfc7842 1751 RExC_precomp = exp;
a5961de5
JH
1752 DEBUG_r({
1753 if (!PL_colorset) reginitcolors();
1754 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1755 PL_colors[4],PL_colors[5],PL_colors[0],
1756 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1757 });
e2509266 1758 RExC_flags = pm->op_pmflags;
830247a4 1759 RExC_sawback = 0;
bbce6d69 1760
830247a4
IZ
1761 RExC_seen = 0;
1762 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1763 RExC_seen_evals = 0;
1764 RExC_extralen = 0;
c277df42 1765
bbce6d69 1766 /* First pass: determine size, legality. */
830247a4 1767 RExC_parse = exp;
fac92740 1768 RExC_start = exp;
830247a4
IZ
1769 RExC_end = xend;
1770 RExC_naughty = 0;
1771 RExC_npar = 1;
1772 RExC_size = 0L;
1773 RExC_emit = &PL_regdummy;
1774 RExC_whilem_seen = 0;
85ddcde9
JH
1775#if 0 /* REGC() is (currently) a NOP at the first pass.
1776 * Clever compilers notice this and complain. --jhi */
830247a4 1777 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 1778#endif
830247a4 1779 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 1780 RExC_precomp = Nullch;
a0d0e21e
LW
1781 return(NULL);
1782 }
830247a4 1783 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 1784
c277df42
IZ
1785 /* Small enough for pointer-storage convention?
1786 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
1787 if (RExC_size >= 0x10000L && RExC_extralen)
1788 RExC_size += RExC_extralen;
c277df42 1789 else
830247a4
IZ
1790 RExC_extralen = 0;
1791 if (RExC_whilem_seen > 15)
1792 RExC_whilem_seen = 15;
a0d0e21e 1793
bbce6d69 1794 /* Allocate space and initialize. */
830247a4 1795 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 1796 char, regexp);
a0d0e21e 1797 if (r == NULL)
b45f050a
JF
1798 FAIL("Regexp out of space");
1799
0f79a09d
GS
1800#ifdef DEBUGGING
1801 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 1802 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 1803#endif
c277df42 1804 r->refcnt = 1;
bbce6d69 1805 r->prelen = xend - exp;
5cfc7842 1806 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 1807 r->subbeg = NULL;
ed252734
NC
1808#ifdef PERL_COPY_ON_WRITE
1809 r->saved_copy = Nullsv;
1810#endif
cf93c79d 1811 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 1812 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
1813
1814 r->substrs = 0; /* Useful during FAIL. */
1815 r->startp = 0; /* Useful during FAIL. */
1816 r->endp = 0; /* Useful during FAIL. */
1817
fac92740
MJD
1818 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1819 if (r->offsets) {
1820 r->offsets[0] = RExC_size;
1821 }
1822 DEBUG_r(PerlIO_printf(Perl_debug_log,
392fbf5d 1823 "%s %"UVuf" bytes for offset annotations.\n",
fac92740 1824 r->offsets ? "Got" : "Couldn't get",
392fbf5d 1825 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 1826
830247a4 1827 RExC_rx = r;
bbce6d69 1828
1829 /* Second pass: emit code. */
e2509266 1830 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
1831 RExC_parse = exp;
1832 RExC_end = xend;
1833 RExC_naughty = 0;
1834 RExC_npar = 1;
fac92740 1835 RExC_emit_start = r->program;
830247a4 1836 RExC_emit = r->program;
2cd61cdb 1837 /* Store the count of eval-groups for security checks: */
eb160463 1838 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 1839 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 1840 r->data = 0;
830247a4 1841 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
1842 return(NULL);
1843
1844 /* Dig out information for optimizations. */
cf93c79d 1845 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 1846 pm->op_pmflags = RExC_flags;
a0ed51b3 1847 if (UTF)
5ff6fc6d 1848 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 1849 r->regstclass = NULL;
830247a4 1850 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 1851 r->reganch |= ROPT_NAUGHTY;
c277df42 1852 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
1853
1854 /* XXXX To minimize changes to RE engine we always allocate
1855 3-units-long substrs field. */
1856 Newz(1004, r->substrs, 1, struct reg_substr_data);
1857
2c2d71f5 1858 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 1859 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 1860 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 1861 I32 fake;
c5254dd6 1862 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
1863 struct regnode_charclass_class ch_class;
1864 int stclass_flag;
cb434fcc 1865 I32 last_close = 0;
a0d0e21e
LW
1866
1867 first = scan;
c277df42 1868 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 1869 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 1870 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
1871 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1872 (OP(first) == PLUS) ||
1873 (OP(first) == MINMOD) ||
653099ff 1874 /* An {n,m} with n>0 */
22c35a8c 1875 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
1876 if (OP(first) == PLUS)
1877 sawplus = 1;
1878 else
1879 first += regarglen[(U8)OP(first)];
1880 first = NEXTOPER(first);
a687059c
LW
1881 }
1882
a0d0e21e
LW
1883 /* Starting-point info. */
1884 again:
653099ff 1885 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
1886 if (OP(first) == EXACT)
1887 ; /* Empty, get anchored substr later. */
1888 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
1889 r->regstclass = first;
1890 }
653099ff 1891 else if (strchr((char*)PL_simple,OP(first)))
a0d0e21e 1892 r->regstclass = first;
22c35a8c
GS
1893 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1894 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 1895 r->regstclass = first;
22c35a8c 1896 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
1897 r->reganch |= (OP(first) == MBOL
1898 ? ROPT_ANCH_MBOL
1899 : (OP(first) == SBOL
1900 ? ROPT_ANCH_SBOL
1901 : ROPT_ANCH_BOL));
a0d0e21e 1902 first = NEXTOPER(first);
774d564b 1903 goto again;
1904 }
1905 else if (OP(first) == GPOS) {
1906 r->reganch |= ROPT_ANCH_GPOS;
1907 first = NEXTOPER(first);
1908 goto again;
a0d0e21e 1909 }
e09294f4 1910 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 1911 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
1912 !(r->reganch & ROPT_ANCH) )
1913 {
1914 /* turn .* into ^.* with an implied $*=1 */
cad2e5aa
JH
1915 int type = OP(NEXTOPER(first));
1916
ffc61ed2 1917 if (type == REG_ANY)
cad2e5aa
JH
1918 type = ROPT_ANCH_MBOL;
1919 else
1920 type = ROPT_ANCH_SBOL;
1921
1922 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 1923 first = NEXTOPER(first);
774d564b 1924 goto again;
a0d0e21e 1925 }
b81d288d 1926 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 1927 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
1928 /* x+ must match at the 1st pos of run of x's */
1929 r->reganch |= ROPT_SKIP;
a0d0e21e 1930
c277df42 1931 /* Scan is after the zeroth branch, first is atomic matcher. */
b81d288d 1932 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 1933 (IV)(first - scan + 1)));
a0d0e21e
LW
1934 /*
1935 * If there's something expensive in the r.e., find the
1936 * longest literal string that must appear and make it the
1937 * regmust. Resolve ties in favor of later strings, since
1938 * the regstart check works with the beginning of the r.e.
1939 * and avoiding duplication strengthens checking. Not a
1940 * strong reason, but sufficient in the absence of others.
1941 * [Now we resolve ties in favor of the earlier string if
c277df42 1942 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
1943 * earlier string may buy us something the later one won't.]
1944 */
a0d0e21e 1945 minlen = 0;
a687059c 1946
79cb57f6
GS
1947 data.longest_fixed = newSVpvn("",0);
1948 data.longest_float = newSVpvn("",0);
1949 data.last_found = newSVpvn("",0);
c277df42
IZ
1950 data.longest = &(data.longest_fixed);
1951 first = scan;
653099ff 1952 if (!r->regstclass) {
830247a4 1953 cl_init(pRExC_state, &ch_class);
653099ff
GS
1954 data.start_class = &ch_class;
1955 stclass_flag = SCF_DO_STCLASS_AND;
1956 } else /* XXXX Check for BOUND? */
1957 stclass_flag = 0;
cb434fcc 1958 data.last_closep = &last_close;
653099ff 1959
830247a4 1960 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
e1901655 1961 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
830247a4 1962 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 1963 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
1964 && !RExC_seen_zerolen
1965 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 1966 r->reganch |= ROPT_CHECK_ALL;
830247a4 1967 scan_commit(pRExC_state, &data);
c277df42
IZ
1968 SvREFCNT_dec(data.last_found);
1969
a0ed51b3 1970 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 1971 if (longest_float_length
c277df42
IZ
1972 || (data.flags & SF_FL_BEFORE_EOL
1973 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1974 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
1975 int t;
1976
a0ed51b3 1977 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
1978 && data.offset_fixed == data.offset_float_min
1979 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1980 goto remove_float; /* As in (a)+. */
1981
33b8afdf
JH
1982 if (SvUTF8(data.longest_float)) {
1983 r->float_utf8 = data.longest_float;
1984 r->float_substr = Nullsv;
1985 } else {
1986 r->float_substr = data.longest_float;
1987 r->float_utf8 = Nullsv;
1988 }
c277df42
IZ
1989 r->float_min_offset = data.offset_float_min;
1990 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
1991 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1992 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 1993 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 1994 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
1995 }
1996 else {
aca2d497 1997 remove_float:
33b8afdf 1998 r->float_substr = r->float_utf8 = Nullsv;
c277df42 1999 SvREFCNT_dec(data.longest_float);
c5254dd6 2000 longest_float_length = 0;
a0d0e21e 2001 }
c277df42 2002
a0ed51b3 2003 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 2004 if (longest_fixed_length
c277df42
IZ
2005 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2006 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2007 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
2008 int t;
2009
33b8afdf
JH
2010 if (SvUTF8(data.longest_fixed)) {
2011 r->anchored_utf8 = data.longest_fixed;
2012 r->anchored_substr = Nullsv;
2013 } else {
2014 r->anchored_substr = data.longest_fixed;
2015 r->anchored_utf8 = Nullsv;
2016 }
c277df42 2017 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
2018 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2019 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 2020 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 2021 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
2022 }
2023 else {
33b8afdf 2024 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 2025 SvREFCNT_dec(data.longest_fixed);
c5254dd6 2026 longest_fixed_length = 0;
a0d0e21e 2027 }
b81d288d 2028 if (r->regstclass
ffc61ed2 2029 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 2030 r->regstclass = NULL;
33b8afdf
JH
2031 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2032 && stclass_flag
653099ff 2033 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2034 && !cl_is_anything(data.start_class))
2035 {
830247a4 2036 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2037
b81d288d 2038 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2039 struct regnode_charclass_class);
2040 StructCopy(data.start_class,
830247a4 2041 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2042 struct regnode_charclass_class);
830247a4 2043 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2044 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 2045 PL_regdata = r->data; /* for regprop() */
9c5ffd7c
JH
2046 DEBUG_r({ SV *sv = sv_newmortal();
2047 regprop(sv, (regnode*)data.start_class);
2048 PerlIO_printf(Perl_debug_log,
2049 "synthetic stclass `%s'.\n",
2050 SvPVX(sv));});
653099ff 2051 }
c277df42
IZ
2052
2053 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 2054 if (longest_fixed_length > longest_float_length) {
c277df42 2055 r->check_substr = r->anchored_substr;
33b8afdf 2056 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
2057 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2058 if (r->reganch & ROPT_ANCH_SINGLE)
2059 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
2060 }
2061 else {
c277df42 2062 r->check_substr = r->float_substr;
33b8afdf 2063 r->check_utf8 = r->float_utf8;
c277df42
IZ
2064 r->check_offset_min = data.offset_float_min;
2065 r->check_offset_max = data.offset_float_max;
a0d0e21e 2066 }
30382c73
IZ
2067 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2068 This should be changed ASAP! */
33b8afdf 2069 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 2070 r->reganch |= RE_USE_INTUIT;
33b8afdf 2071 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
2072 r->reganch |= RE_INTUIT_TAIL;
2073 }
a0ed51b3
LW
2074 }
2075 else {
c277df42
IZ
2076 /* Several toplevels. Best we can is to set minlen. */
2077 I32 fake;
653099ff 2078 struct regnode_charclass_class ch_class;
cb434fcc 2079 I32 last_close = 0;
c277df42
IZ
2080
2081 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2082 scan = r->program + 1;
830247a4 2083 cl_init(pRExC_state, &ch_class);
653099ff 2084 data.start_class = &ch_class;
cb434fcc 2085 data.last_closep = &last_close;
e1901655 2086 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
33b8afdf
JH
2087 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2088 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 2089 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
2090 && !cl_is_anything(data.start_class))
2091 {
830247a4 2092 I32 n = add_data(pRExC_state, 1, "f");
653099ff 2093
b81d288d 2094 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
2095 struct regnode_charclass_class);
2096 StructCopy(data.start_class,
830247a4 2097 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 2098 struct regnode_charclass_class);
830247a4 2099 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 2100 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
9c5ffd7c
JH
2101 DEBUG_r({ SV* sv = sv_newmortal();
2102 regprop(sv, (regnode*)data.start_class);
2103 PerlIO_printf(Perl_debug_log,
2104 "synthetic stclass `%s'.\n",
2105 SvPVX(sv));});
653099ff 2106 }
a0d0e21e
LW
2107 }
2108
a0d0e21e 2109 r->minlen = minlen;
b81d288d 2110 if (RExC_seen & REG_SEEN_GPOS)
c277df42 2111 r->reganch |= ROPT_GPOS_SEEN;
830247a4 2112 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 2113 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 2114 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 2115 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
2116 if (RExC_seen & REG_SEEN_CANY)
2117 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
2118 Newz(1002, r->startp, RExC_npar, I32);
2119 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 2120 PL_regdata = r->data; /* for regprop() */
a0d0e21e
LW
2121 DEBUG_r(regdump(r));
2122 return(r);
a687059c
LW
2123}
2124
2125/*
2126 - reg - regular expression, i.e. main body or parenthesized thing
2127 *
2128 * Caller must absorb opening parenthesis.
2129 *
2130 * Combining parenthesis handling with the base level of regular expression
2131 * is a trifle forced, but the need to tie the tails of the branches to what
2132 * follows makes it hard to avoid.
2133 */
76e3520e 2134STATIC regnode *
830247a4 2135S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 2136 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 2137{
c277df42
IZ
2138 register regnode *ret; /* Will be the head of the group. */
2139 register regnode *br;
2140 register regnode *lastbr;
2141 register regnode *ender = 0;
a0d0e21e 2142 register I32 parno = 0;
e2509266 2143 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
2144
2145 /* for (?g), (?gc), and (?o) warnings; warning
2146 about (?c) will warn about (?g) -- japhy */
2147
2148 I32 wastedflags = 0x00,
2149 wasted_o = 0x01,
2150 wasted_g = 0x02,
2151 wasted_gc = 0x02 | 0x04,
2152 wasted_c = 0x04;
2153
fac92740 2154 char * parse_start = RExC_parse; /* MJD */
830247a4 2155 char *oregcomp_parse = RExC_parse;
c277df42 2156 char c;
a0d0e21e 2157
821b33a5 2158 *flagp = 0; /* Tentatively. */
a0d0e21e 2159
9d1d55b5 2160
a0d0e21e
LW
2161 /* Make an OPEN node, if parenthesized. */
2162 if (paren) {
fac92740 2163 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
2164 U32 posflags = 0, negflags = 0;
2165 U32 *flagsp = &posflags;
0f5d15d6 2166 int logical = 0;
830247a4 2167 char *seqstart = RExC_parse;
ca9dfc88 2168
830247a4
IZ
2169 RExC_parse++;
2170 paren = *RExC_parse++;
c277df42 2171 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 2172 switch (paren) {
fac92740 2173 case '<': /* (?<...) */
830247a4 2174 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 2175 if (*RExC_parse == '!')
c277df42 2176 paren = ',';
b81d288d 2177 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 2178 goto unknown;
830247a4 2179 RExC_parse++;
fac92740
MJD
2180 case '=': /* (?=...) */
2181 case '!': /* (?!...) */
830247a4 2182 RExC_seen_zerolen++;
fac92740
MJD
2183 case ':': /* (?:...) */
2184 case '>': /* (?>...) */
a0d0e21e 2185 break;
fac92740
MJD
2186 case '$': /* (?$...) */
2187 case '@': /* (?@...) */
8615cb43 2188 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 2189 break;
fac92740 2190 case '#': /* (?#...) */
830247a4
IZ
2191 while (*RExC_parse && *RExC_parse != ')')
2192 RExC_parse++;
2193 if (*RExC_parse != ')')
c277df42 2194 FAIL("Sequence (?#... not terminated");
830247a4 2195 nextchar(pRExC_state);
a0d0e21e
LW
2196 *flagp = TRYAGAIN;
2197 return NULL;
fac92740 2198 case 'p': /* (?p...) */
9014280d 2199 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 2200 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 2201 /* FALL THROUGH*/
fac92740 2202 case '?': /* (??...) */
0f5d15d6 2203 logical = 1;
438a3801
YST
2204 if (*RExC_parse != '{')
2205 goto unknown;
830247a4 2206 paren = *RExC_parse++;
0f5d15d6 2207 /* FALL THROUGH */
fac92740 2208 case '{': /* (?{...}) */
c277df42 2209 {
c277df42
IZ
2210 I32 count = 1, n = 0;
2211 char c;
830247a4 2212 char *s = RExC_parse;
c277df42
IZ
2213 SV *sv;
2214 OP_4tree *sop, *rop;
2215
830247a4
IZ
2216 RExC_seen_zerolen++;
2217 RExC_seen |= REG_SEEN_EVAL;
2218 while (count && (c = *RExC_parse)) {
2219 if (c == '\\' && RExC_parse[1])
2220 RExC_parse++;
b81d288d 2221 else if (c == '{')
c277df42 2222 count++;
b81d288d 2223 else if (c == '}')
c277df42 2224 count--;
830247a4 2225 RExC_parse++;
c277df42 2226 }
830247a4 2227 if (*RExC_parse != ')')
b45f050a 2228 {
b81d288d 2229 RExC_parse = s;
b45f050a
JF
2230 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2231 }
c277df42 2232 if (!SIZE_ONLY) {
f3548bdc 2233 PAD *pad;
b81d288d
AB
2234
2235 if (RExC_parse - 1 - s)
830247a4 2236 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 2237 else
79cb57f6 2238 sv = newSVpvn("", 0);
c277df42 2239
569233ed
SB
2240 ENTER;
2241 Perl_save_re_context(aTHX);
f3548bdc 2242 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
2243 sop->op_private |= OPpREFCOUNTED;
2244 /* re_dup will OpREFCNT_inc */
2245 OpREFCNT_set(sop, 1);
569233ed 2246 LEAVE;
c277df42 2247
830247a4
IZ
2248 n = add_data(pRExC_state, 3, "nop");
2249 RExC_rx->data->data[n] = (void*)rop;
2250 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 2251 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 2252 SvREFCNT_dec(sv);
a0ed51b3 2253 }
e24b16f9 2254 else { /* First pass */
830247a4 2255 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 2256 && IN_PERL_RUNTIME)
2cd61cdb
IZ
2257 /* No compiled RE interpolated, has runtime
2258 components ===> unsafe. */
2259 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 2260 if (PL_tainting && PL_tainted)
cc6b7395 2261 FAIL("Eval-group in insecure regular expression");
923e4eb5 2262 if (IN_PERL_COMPILETIME)
b5c19bd7 2263 PL_cv_has_eval = 1;
c277df42 2264 }
b5c19bd7 2265
830247a4 2266 nextchar(pRExC_state);
0f5d15d6 2267 if (logical) {
830247a4 2268 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2269 if (!SIZE_ONLY)
2270 ret->flags = 2;
830247a4 2271 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 2272 /* deal with the length of this later - MJD */
0f5d15d6
IZ
2273 return ret;
2274 }
ccb2c380
MP
2275 ret = reganode(pRExC_state, EVAL, n);
2276 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2277 Set_Node_Offset(ret, parse_start);
2278 return ret;
c277df42 2279 }
fac92740 2280 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 2281 {
fac92740 2282 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
2283 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2284 || RExC_parse[1] == '<'
830247a4 2285 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
2286 I32 flag;
2287
830247a4 2288 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
2289 if (!SIZE_ONLY)
2290 ret->flags = 1;
830247a4 2291 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 2292 goto insert_if;
b81d288d 2293 }
a0ed51b3 2294 }
830247a4 2295 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 2296 /* (?(1)...) */
830247a4 2297 parno = atoi(RExC_parse++);
c277df42 2298
830247a4
IZ
2299 while (isDIGIT(*RExC_parse))
2300 RExC_parse++;
fac92740
MJD
2301 ret = reganode(pRExC_state, GROUPP, parno);
2302
830247a4 2303 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 2304 vFAIL("Switch condition not recognized");
c277df42 2305 insert_if:
830247a4
IZ
2306 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2307 br = regbranch(pRExC_state, &flags, 1);
c277df42 2308 if (br == NULL)
830247a4 2309 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 2310 else
830247a4
IZ
2311 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2312 c = *nextchar(pRExC_state);
d1b80229
IZ
2313 if (flags&HASWIDTH)
2314 *flagp |= HASWIDTH;
c277df42 2315 if (c == '|') {
830247a4
IZ
2316 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2317 regbranch(pRExC_state, &flags, 1);
2318 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
2319 if (flags&HASWIDTH)
2320 *flagp |= HASWIDTH;
830247a4 2321 c = *nextchar(pRExC_state);
a0ed51b3
LW
2322 }
2323 else
c277df42
IZ
2324 lastbr = NULL;
2325 if (c != ')')
8615cb43 2326 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
2327 ender = reg_node(pRExC_state, TAIL);
2328 regtail(pRExC_state, br, ender);
c277df42 2329 if (lastbr) {
830247a4
IZ
2330 regtail(pRExC_state, lastbr, ender);
2331 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
2332 }
2333 else
830247a4 2334 regtail(pRExC_state, ret, ender);
c277df42 2335 return ret;
a0ed51b3
LW
2336 }
2337 else {
830247a4 2338 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
2339 }
2340 }
1b1626e4 2341 case 0:
830247a4 2342 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 2343 vFAIL("Sequence (? incomplete");
1b1626e4 2344 break;
a0d0e21e 2345 default:
830247a4 2346 --RExC_parse;
fac92740 2347 parse_flags: /* (?i) */
830247a4 2348 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
2349 /* (?g), (?gc) and (?o) are useless here
2350 and must be globally applied -- japhy */
2351
2352 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2353 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2354 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2355 if (! (wastedflags & wflagbit) ) {
2356 wastedflags |= wflagbit;
2357 vWARN5(
2358 RExC_parse + 1,
2359 "Useless (%s%c) - %suse /%c modifier",
2360 flagsp == &negflags ? "?-" : "?",
2361 *RExC_parse,
2362 flagsp == &negflags ? "don't " : "",
2363 *RExC_parse
2364 );
2365 }
2366 }
2367 }
2368 else if (*RExC_parse == 'c') {
2369 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2370 if (! (wastedflags & wasted_c) ) {
2371 wastedflags |= wasted_gc;
2372 vWARN3(
2373 RExC_parse + 1,
2374 "Useless (%sc) - %suse /gc modifier",
2375 flagsp == &negflags ? "?-" : "?",
2376 flagsp == &negflags ? "don't " : ""
2377 );
2378 }
2379 }
2380 }
2381 else { pmflag(flagsp, *RExC_parse); }
2382
830247a4 2383 ++RExC_parse;
ca9dfc88 2384 }
830247a4 2385 if (*RExC_parse == '-') {
ca9dfc88 2386 flagsp = &negflags;
9d1d55b5 2387 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 2388 ++RExC_parse;
ca9dfc88 2389 goto parse_flags;
48c036b1 2390 }
e2509266
JH
2391 RExC_flags |= posflags;
2392 RExC_flags &= ~negflags;
830247a4
IZ
2393 if (*RExC_parse == ':') {
2394 RExC_parse++;
ca9dfc88
IZ
2395 paren = ':';
2396 break;
2397 }
c277df42 2398 unknown:
830247a4
IZ
2399 if (*RExC_parse != ')') {
2400 RExC_parse++;
2401 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 2402 }
830247a4 2403 nextchar(pRExC_state);
a0d0e21e
LW
2404 *flagp = TRYAGAIN;
2405 return NULL;
2406 }
2407 }
fac92740 2408 else { /* (...) */
830247a4
IZ
2409 parno = RExC_npar;
2410 RExC_npar++;
2411 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
2412 Set_Node_Length(ret, 1); /* MJD */
2413 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 2414 open = 1;
a0d0e21e 2415 }
a0ed51b3 2416 }
fac92740 2417 else /* ! paren */
a0d0e21e
LW
2418 ret = NULL;
2419
2420 /* Pick up the branches, linking them together. */
fac92740 2421 parse_start = RExC_parse; /* MJD */
830247a4 2422 br = regbranch(pRExC_state, &flags, 1);
fac92740
MJD
2423 /* branch_len = (paren != 0); */
2424
a0d0e21e
LW
2425 if (br == NULL)
2426 return(NULL);
830247a4
IZ
2427 if (*RExC_parse == '|') {
2428 if (!SIZE_ONLY && RExC_extralen) {
2429 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 2430 }
fac92740 2431 else { /* MJD */
830247a4 2432 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
2433 Set_Node_Length(br, paren != 0);
2434 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2435 }
c277df42
IZ
2436 have_branch = 1;
2437 if (SIZE_ONLY)
830247a4 2438 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
2439 }
2440 else if (paren == ':') {
c277df42
IZ
2441 *flagp |= flags&SIMPLE;
2442 }
2443 if (open) { /* Starts with OPEN. */
830247a4 2444 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
2445 }
2446 else if (paren != '?') /* Not Conditional */
a0d0e21e 2447 ret = br;
32a0ca98 2448 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 2449 lastbr = br;
830247a4
IZ
2450 while (*RExC_parse == '|') {
2451 if (!SIZE_ONLY && RExC_extralen) {
2452 ender = reganode(pRExC_state, LONGJMP,0);
2453 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
2454 }
2455 if (SIZE_ONLY)
830247a4
IZ
2456 RExC_extralen += 2; /* Account for LONGJMP. */
2457 nextchar(pRExC_state);
2458 br = regbranch(pRExC_state, &flags, 0);
fac92740 2459
a687059c 2460 if (br == NULL)
a0d0e21e 2461 return(NULL);
830247a4 2462 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 2463 lastbr = br;
821b33a5
IZ
2464 if (flags&HASWIDTH)
2465 *flagp |= HASWIDTH;
a687059c 2466 *flagp |= flags&SPSTART;
a0d0e21e
LW
2467 }
2468
c277df42
IZ
2469 if (have_branch || paren != ':') {
2470 /* Make a closing node, and hook it on the end. */
2471 switch (paren) {
2472 case ':':
830247a4 2473 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
2474 break;
2475 case 1:
830247a4 2476 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
2477 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2478 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
2479 break;
2480 case '<':
c277df42
IZ
2481 case ',':
2482 case '=':
2483 case '!':
c277df42 2484 *flagp &= ~HASWIDTH;
821b33a5
IZ
2485 /* FALL THROUGH */
2486 case '>':
830247a4 2487 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
2488 break;
2489 case 0:
830247a4 2490 ender = reg_node(pRExC_state, END);
c277df42
IZ
2491 break;
2492 }
830247a4 2493 regtail(pRExC_state, lastbr, ender);
a0d0e21e 2494
c277df42
IZ
2495 if (have_branch) {
2496 /* Hook the tails of the branches to the closing node. */
2497 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 2498 regoptail(pRExC_state, br, ender);
c277df42
IZ
2499 }
2500 }
a0d0e21e 2501 }
c277df42
IZ
2502
2503 {
2504 char *p;
2505 static char parens[] = "=!<,>";
2506
2507 if (paren && (p = strchr(parens, paren))) {
eb160463 2508 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
2509 int flag = (p - parens) > 1;
2510
2511 if (paren == '>')
2512 node = SUSPEND, flag = 0;
830247a4 2513 reginsert(pRExC_state, node,ret);
45948336
EP
2514 Set_Node_Cur_Length(ret);
2515 Set_Node_Offset(ret, parse_start + 1);
c277df42 2516 ret->flags = flag;
830247a4 2517 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 2518 }
a0d0e21e
LW
2519 }
2520
2521 /* Check for proper termination. */
ce3e6498 2522 if (paren) {
e2509266 2523 RExC_flags = oregflags;
830247a4
IZ
2524 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2525 RExC_parse = oregcomp_parse;
380a0633 2526 vFAIL("Unmatched (");
ce3e6498 2527 }
a0ed51b3 2528 }
830247a4
IZ
2529 else if (!paren && RExC_parse < RExC_end) {
2530 if (*RExC_parse == ')') {
2531 RExC_parse++;
380a0633 2532 vFAIL("Unmatched )");
a0ed51b3
LW
2533 }
2534 else
b45f050a 2535 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
2536 /* NOTREACHED */
2537 }
a687059c 2538
a0d0e21e 2539 return(ret);
a687059c
LW
2540}
2541
2542/*
2543 - regbranch - one alternative of an | operator
2544 *
2545 * Implements the concatenation operator.
2546 */
76e3520e 2547STATIC regnode *
830247a4 2548S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 2549{
c277df42
IZ
2550 register regnode *ret;
2551 register regnode *chain = NULL;
2552 register regnode *latest;
2553 I32 flags = 0, c = 0;
a0d0e21e 2554
b81d288d 2555 if (first)
c277df42
IZ
2556 ret = NULL;
2557 else {
b81d288d 2558 if (!SIZE_ONLY && RExC_extralen)
830247a4 2559 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 2560 else {
830247a4 2561 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
2562 Set_Node_Length(ret, 1);
2563 }
c277df42
IZ
2564 }
2565
b81d288d 2566 if (!first && SIZE_ONLY)
830247a4 2567 RExC_extralen += 1; /* BRANCHJ */
b81d288d 2568
c277df42 2569 *flagp = WORST; /* Tentatively. */
a0d0e21e 2570
830247a4
IZ
2571 RExC_parse--;
2572 nextchar(pRExC_state);
2573 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 2574 flags &= ~TRYAGAIN;
830247a4 2575 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
2576 if (latest == NULL) {
2577 if (flags & TRYAGAIN)
2578 continue;
2579 return(NULL);
a0ed51b3
LW
2580 }
2581 else if (ret == NULL)
c277df42 2582 ret = latest;
a0d0e21e 2583 *flagp |= flags&HASWIDTH;
c277df42 2584 if (chain == NULL) /* First piece. */
a0d0e21e
LW
2585 *flagp |= flags&SPSTART;
2586 else {
830247a4
IZ
2587 RExC_naughty++;
2588 regtail(pRExC_state, chain, latest);
a687059c 2589 }
a0d0e21e 2590 chain = latest;
c277df42
IZ
2591 c++;
2592 }
2593 if (chain == NULL) { /* Loop ran zero times. */
830247a4 2594 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
2595 if (ret == NULL)
2596 ret = chain;
2597 }
2598 if (c == 1) {
2599 *flagp |= flags&SIMPLE;
a0d0e21e 2600 }
a687059c 2601
a0d0e21e 2602 return(ret);
a687059c
LW
2603}
2604
2605/*
2606 - regpiece - something followed by possible [*+?]
2607 *
2608 * Note that the branching code sequences used for ? and the general cases
2609 * of * and + are somewhat optimized: they use the same NOTHING node as
2610 * both the endmarker for their branch list and the body of the last branch.
2611 * It might seem that this node could be dispensed with entirely, but the
2612 * endmarker role is not redundant.
2613 */
76e3520e 2614STATIC regnode *
830247a4 2615S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2616{
c277df42 2617 register regnode *ret;
a0d0e21e
LW
2618 register char op;
2619 register char *next;
2620 I32 flags;
830247a4 2621 char *origparse = RExC_parse;
a0d0e21e
LW
2622 char *maxpos;
2623 I32 min;
c277df42 2624 I32 max = REG_INFTY;
fac92740 2625 char *parse_start;
a0d0e21e 2626
830247a4 2627 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
2628 if (ret == NULL) {
2629 if (flags & TRYAGAIN)
2630 *flagp |= TRYAGAIN;
2631 return(NULL);
2632 }
2633
830247a4 2634 op = *RExC_parse;
a0d0e21e 2635
830247a4 2636 if (op == '{' && regcurly(RExC_parse)) {
fac92740 2637 parse_start = RExC_parse; /* MJD */
830247a4 2638 next = RExC_parse + 1;
a0d0e21e
LW
2639 maxpos = Nullch;
2640 while (isDIGIT(*next) || *next == ',') {
2641 if (*next == ',') {
2642 if (maxpos)
2643 break;
2644 else
2645 maxpos = next;
a687059c 2646 }
a0d0e21e
LW
2647 next++;
2648 }
2649 if (*next == '}') { /* got one */
2650 if (!maxpos)
2651 maxpos = next;
830247a4
IZ
2652 RExC_parse++;
2653 min = atoi(RExC_parse);
a0d0e21e
LW
2654 if (*maxpos == ',')
2655 maxpos++;
2656 else
830247a4 2657 maxpos = RExC_parse;
a0d0e21e
LW
2658 max = atoi(maxpos);
2659 if (!max && *maxpos != '0')
c277df42
IZ
2660 max = REG_INFTY; /* meaning "infinity" */
2661 else if (max >= REG_INFTY)
8615cb43 2662 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
2663 RExC_parse = next;
2664 nextchar(pRExC_state);
a0d0e21e
LW
2665
2666 do_curly:
2667 if ((flags&SIMPLE)) {
830247a4
IZ
2668 RExC_naughty += 2 + RExC_naughty / 2;
2669 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
2670 Set_Node_Offset(ret, parse_start+1); /* MJD */
2671 Set_Node_Cur_Length(ret);
a0d0e21e
LW
2672 }
2673 else {
830247a4 2674 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
2675
2676 w->flags = 0;
830247a4
IZ
2677 regtail(pRExC_state, ret, w);
2678 if (!SIZE_ONLY && RExC_extralen) {
2679 reginsert(pRExC_state, LONGJMP,ret);
2680 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
2681 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2682 }
830247a4 2683 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
2684 /* MJD hk */
2685 Set_Node_Offset(ret, parse_start+1);
2686 Set_Node_Length(ret,
2687 op == '{' ? (RExC_parse - parse_start) : 1);
2688
830247a4 2689 if (!SIZE_ONLY && RExC_extralen)
c277df42 2690 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 2691 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 2692 if (SIZE_ONLY)
830247a4
IZ
2693 RExC_whilem_seen++, RExC_extralen += 3;
2694 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 2695 }
c277df42 2696 ret->flags = 0;
a0d0e21e
LW
2697
2698 if (min > 0)
821b33a5
IZ
2699 *flagp = WORST;
2700 if (max > 0)
2701 *flagp |= HASWIDTH;
a0d0e21e 2702 if (max && max < min)
8615cb43 2703 vFAIL("Can't do {n,m} with n > m");
c277df42 2704 if (!SIZE_ONLY) {
eb160463
GS
2705 ARG1_SET(ret, (U16)min);
2706 ARG2_SET(ret, (U16)max);
a687059c 2707 }
a687059c 2708
a0d0e21e 2709 goto nest_check;
a687059c 2710 }
a0d0e21e 2711 }
a687059c 2712
a0d0e21e
LW
2713 if (!ISMULT1(op)) {
2714 *flagp = flags;
a687059c 2715 return(ret);
a0d0e21e 2716 }
bb20fd44 2717
c277df42 2718#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
2719
2720 /* if this is reinstated, don't forget to put this back into perldiag:
2721
2722 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2723
2724 (F) The part of the regexp subject to either the * or + quantifier
2725 could match an empty string. The {#} shows in the regular
2726 expression about where the problem was discovered.
2727
2728 */
2729
bb20fd44 2730 if (!(flags&HASWIDTH) && op != '?')
b45f050a 2731 vFAIL("Regexp *+ operand could be empty");
b81d288d 2732#endif
bb20fd44 2733
fac92740 2734 parse_start = RExC_parse;
830247a4 2735 nextchar(pRExC_state);
a0d0e21e 2736
821b33a5 2737 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
2738
2739 if (op == '*' && (flags&SIMPLE)) {
830247a4 2740 reginsert(pRExC_state, STAR, ret);
c277df42 2741 ret->flags = 0;
830247a4 2742 RExC_naughty += 4;
a0d0e21e
LW
2743 }
2744 else if (op == '*') {
2745 min = 0;
2746 goto do_curly;
a0ed51b3
LW
2747 }
2748 else if (op == '+' && (flags&SIMPLE)) {
830247a4 2749 reginsert(pRExC_state, PLUS, ret);
c277df42 2750 ret->flags = 0;
830247a4 2751 RExC_naughty += 3;
a0d0e21e
LW
2752 }
2753 else if (op == '+') {
2754 min = 1;
2755 goto do_curly;
a0ed51b3
LW
2756 }
2757 else if (op == '?') {
a0d0e21e
LW
2758 min = 0; max = 1;
2759 goto do_curly;
2760 }
2761 nest_check:
e476b1b5 2762 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 2763 vWARN3(RExC_parse,
b45f050a 2764 "%.*s matches null string many times",
830247a4 2765 RExC_parse - origparse,
b45f050a 2766 origparse);
a0d0e21e
LW
2767 }
2768
830247a4
IZ
2769 if (*RExC_parse == '?') {
2770 nextchar(pRExC_state);
2771 reginsert(pRExC_state, MINMOD, ret);
2772 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 2773 }
830247a4
IZ
2774 if (ISMULT2(RExC_parse)) {
2775 RExC_parse++;
b45f050a
JF
2776 vFAIL("Nested quantifiers");
2777 }
a0d0e21e
LW
2778
2779 return(ret);
a687059c
LW
2780}
2781
2782/*
2783 - regatom - the lowest level
2784 *
2785 * Optimization: gobbles an entire sequence of ordinary characters so that
2786 * it can turn them into a single node, which is smaller to store and
2787 * faster to run. Backslashed characters are exceptions, each becoming a
2788 * separate node; the code is simpler that way and it's not worth fixing.
2789 *
b45f050a 2790 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 2791STATIC regnode *
830247a4 2792S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 2793{
c277df42 2794 register regnode *ret = 0;
a0d0e21e 2795 I32 flags;
45948336 2796 char *parse_start = RExC_parse;
a0d0e21e
LW
2797
2798 *flagp = WORST; /* Tentatively. */
2799
2800tryagain:
830247a4 2801 switch (*RExC_parse) {
a0d0e21e 2802 case '^':
830247a4
IZ
2803 RExC_seen_zerolen++;
2804 nextchar(pRExC_state);
e2509266 2805 if (RExC_flags & PMf_MULTILINE)
830247a4 2806 ret = reg_node(pRExC_state, MBOL);
e2509266 2807 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2808 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2809 else
830247a4 2810 ret = reg_node(pRExC_state, BOL);
fac92740 2811 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2812 break;
2813 case '$':
830247a4 2814 nextchar(pRExC_state);
b81d288d 2815 if (*RExC_parse)
830247a4 2816 RExC_seen_zerolen++;
e2509266 2817 if (RExC_flags & PMf_MULTILINE)
830247a4 2818 ret = reg_node(pRExC_state, MEOL);
e2509266 2819 else if (RExC_flags & PMf_SINGLELINE)
830247a4 2820 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2821 else
830247a4 2822 ret = reg_node(pRExC_state, EOL);
fac92740 2823 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2824 break;
2825 case '.':
830247a4 2826 nextchar(pRExC_state);
e2509266 2827 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
2828 ret = reg_node(pRExC_state, SANY);
2829 else
2830 ret = reg_node(pRExC_state, REG_ANY);
2831 *flagp |= HASWIDTH|SIMPLE;
830247a4 2832 RExC_naughty++;
fac92740 2833 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
2834 break;
2835 case '[':
b45f050a 2836 {
830247a4 2837 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 2838 ret = regclass(pRExC_state);
830247a4
IZ
2839 if (*RExC_parse != ']') {
2840 RExC_parse = oregcomp_parse;
b45f050a
JF
2841 vFAIL("Unmatched [");
2842 }
830247a4 2843 nextchar(pRExC_state);
a0d0e21e 2844 *flagp |= HASWIDTH|SIMPLE;
fac92740 2845 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 2846 break;
b45f050a 2847 }
a0d0e21e 2848 case '(':
830247a4
IZ
2849 nextchar(pRExC_state);
2850 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 2851 if (ret == NULL) {
bf93d4cc 2852 if (flags & TRYAGAIN) {
830247a4 2853 if (RExC_parse == RExC_end) {
bf93d4cc
GS
2854 /* Make parent create an empty node if needed. */
2855 *flagp |= TRYAGAIN;
2856 return(NULL);
2857 }
a0d0e21e 2858 goto tryagain;
bf93d4cc 2859 }
a0d0e21e
LW
2860 return(NULL);
2861 }
c277df42 2862 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
2863 break;
2864 case '|':
2865 case ')':
2866 if (flags & TRYAGAIN) {
2867 *flagp |= TRYAGAIN;
2868 return NULL;
2869 }
b45f050a 2870 vFAIL("Internal urp");
a0d0e21e
LW
2871 /* Supposed to be caught earlier. */
2872 break;
85afd4ae 2873 case '{':
830247a4
IZ
2874 if (!regcurly(RExC_parse)) {
2875 RExC_parse++;
85afd4ae
CS
2876 goto defchar;
2877 }
2878 /* FALL THROUGH */
a0d0e21e
LW
2879 case '?':
2880 case '+':
2881 case '*':
830247a4 2882 RExC_parse++;
b45f050a 2883 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
2884 break;
2885 case '\\':
830247a4 2886 switch (*++RExC_parse) {
a0d0e21e 2887 case 'A':
830247a4
IZ
2888 RExC_seen_zerolen++;
2889 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 2890 *flagp |= SIMPLE;
830247a4 2891 nextchar(pRExC_state);
fac92740 2892 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2893 break;
2894 case 'G':
830247a4
IZ
2895 ret = reg_node(pRExC_state, GPOS);
2896 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 2897 *flagp |= SIMPLE;
830247a4 2898 nextchar(pRExC_state);
fac92740 2899 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2900 break;
2901 case 'Z':
830247a4 2902 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 2903 *flagp |= SIMPLE;
a1917ab9 2904 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 2905 nextchar(pRExC_state);
a0d0e21e 2906 break;
b85d18e9 2907 case 'z':
830247a4 2908 ret = reg_node(pRExC_state, EOS);
b85d18e9 2909 *flagp |= SIMPLE;
830247a4
IZ
2910 RExC_seen_zerolen++; /* Do not optimize RE away */
2911 nextchar(pRExC_state);
fac92740 2912 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 2913 break;
4a2d328f 2914 case 'C':
f33976b4
DB
2915 ret = reg_node(pRExC_state, CANY);
2916 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 2917 *flagp |= HASWIDTH|SIMPLE;
830247a4 2918 nextchar(pRExC_state);
fac92740 2919 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
2920 break;
2921 case 'X':
830247a4 2922 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 2923 *flagp |= HASWIDTH;
830247a4 2924 nextchar(pRExC_state);
fac92740 2925 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 2926 break;
a0d0e21e 2927 case 'w':
eb160463 2928 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 2929 *flagp |= HASWIDTH|SIMPLE;
830247a4 2930 nextchar(pRExC_state);
fac92740 2931 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2932 break;
2933 case 'W':
eb160463 2934 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 2935 *flagp |= HASWIDTH|SIMPLE;
830247a4 2936 nextchar(pRExC_state);
fac92740 2937 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2938 break;
2939 case 'b':
830247a4
IZ
2940 RExC_seen_zerolen++;
2941 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2942 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 2943 *flagp |= SIMPLE;
830247a4 2944 nextchar(pRExC_state);
fac92740 2945 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2946 break;
2947 case 'B':
830247a4
IZ
2948 RExC_seen_zerolen++;
2949 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 2950 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 2951 *flagp |= SIMPLE;
830247a4 2952 nextchar(pRExC_state);
fac92740 2953 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2954 break;
2955 case 's':
eb160463 2956 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 2957 *flagp |= HASWIDTH|SIMPLE;
830247a4 2958 nextchar(pRExC_state);
fac92740 2959 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2960 break;
2961 case 'S':
eb160463 2962 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 2963 *flagp |= HASWIDTH|SIMPLE;
830247a4 2964 nextchar(pRExC_state);
fac92740 2965 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2966 break;
2967 case 'd':
ffc61ed2 2968 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 2969 *flagp |= HASWIDTH|SIMPLE;
830247a4 2970 nextchar(pRExC_state);
fac92740 2971 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
2972 break;
2973 case 'D':
ffc61ed2 2974 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 2975 *flagp |= HASWIDTH|SIMPLE;
830247a4 2976 nextchar(pRExC_state);
fac92740 2977 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 2978 break;
a14b48bc
LW
2979 case 'p':
2980 case 'P':
3568d838 2981 {
830247a4 2982 char* oldregxend = RExC_end;
ccb2c380 2983 char* parse_start = RExC_parse - 2;
a14b48bc 2984
830247a4 2985 if (RExC_parse[1] == '{') {
3568d838 2986 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
2987 RExC_end = strchr(RExC_parse, '}');
2988 if (!RExC_end) {
0da60cf5 2989 U8 c = (U8)*RExC_parse;
830247a4
IZ
2990 RExC_parse += 2;
2991 RExC_end = oldregxend;
0da60cf5 2992 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 2993 }
830247a4 2994 RExC_end++;
a14b48bc 2995 }
af6f566e 2996 else {
830247a4 2997 RExC_end = RExC_parse + 2;
af6f566e
HS
2998 if (RExC_end > oldregxend)
2999 RExC_end = oldregxend;
3000 }
830247a4 3001 RExC_parse--;
a14b48bc 3002
ffc61ed2 3003 ret = regclass(pRExC_state);
a14b48bc 3004
830247a4
IZ
3005 RExC_end = oldregxend;
3006 RExC_parse--;
ccb2c380
MP
3007
3008 Set_Node_Offset(ret, parse_start + 2);
3009 Set_Node_Cur_Length(ret);
830247a4 3010 nextchar(pRExC_state);
a14b48bc
LW
3011 *flagp |= HASWIDTH|SIMPLE;
3012 }
3013 break;
a0d0e21e
LW
3014 case 'n':
3015 case 'r':
3016 case 't':
3017 case 'f':
3018 case 'e':
3019 case 'a':
3020 case 'x':
3021 case 'c':
3022 case '0':
3023 goto defchar;
3024 case '1': case '2': case '3': case '4':
3025 case '5': case '6': case '7': case '8': case '9':
3026 {
830247a4 3027 I32 num = atoi(RExC_parse);
a0d0e21e 3028
830247a4 3029 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
3030 goto defchar;
3031 else {
fac92740 3032 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
3033 while (isDIGIT(*RExC_parse))
3034 RExC_parse++;
b45f050a 3035
eb160463 3036 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 3037 vFAIL("Reference to nonexistent group");
830247a4 3038 RExC_sawback = 1;
eb160463
GS
3039 ret = reganode(pRExC_state,
3040 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3041 num);
a0d0e21e 3042 *flagp |= HASWIDTH;
fac92740
MJD
3043
3044 /* override incorrect value set in reganode MJD */
3045 Set_Node_Offset(ret, parse_start+1);
3046 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
3047 RExC_parse--;
3048 nextchar(pRExC_state);
a0d0e21e
LW
3049 }
3050 }
3051 break;
3052 case '\0':
830247a4 3053 if (RExC_parse >= RExC_end)
b45f050a 3054 FAIL("Trailing \\");
a0d0e21e
LW
3055 /* FALL THROUGH */
3056 default:
c9f97d15
IZ
3057 /* Do not generate `unrecognized' warnings here, we fall
3058 back into the quick-grab loop below */
45948336 3059 parse_start--;
a0d0e21e
LW
3060 goto defchar;
3061 }
3062 break;
4633a7c4
LW
3063
3064 case '#':
e2509266 3065 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
3066 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3067 if (RExC_parse < RExC_end)
4633a7c4
LW
3068 goto tryagain;
3069 }
3070 /* FALL THROUGH */
3071
a0d0e21e 3072 default: {
ba210ebe 3073 register STRLEN len;
58ae7d3f 3074 register UV ender;
a0d0e21e 3075 register char *p;
c277df42 3076 char *oldp, *s;
ba210ebe 3077 STRLEN numlen;
80aecb99 3078 STRLEN foldlen;
60a8b682 3079 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
f06dbbb7
JH
3080
3081 parse_start = RExC_parse - 1;
a0d0e21e 3082
830247a4 3083 RExC_parse++;
a0d0e21e
LW
3084
3085 defchar:
58ae7d3f 3086 ender = 0;
eb160463
GS
3087 ret = reg_node(pRExC_state,
3088 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 3089 s = STRING(ret);
830247a4
IZ
3090 for (len = 0, p = RExC_parse - 1;
3091 len < 127 && p < RExC_end;
a0d0e21e
LW
3092 len++)
3093 {
3094 oldp = p;
5b5a24f7 3095
e2509266 3096 if (RExC_flags & PMf_EXTENDED)
830247a4 3097 p = regwhite(p, RExC_end);
a0d0e21e
LW
3098 switch (*p) {
3099 case '^':
3100 case '$':
3101 case '.':
3102 case '[':
3103 case '(':
3104 case ')':
3105 case '|':
3106 goto loopdone;
3107 case '\\':
3108 switch (*++p) {
3109 case 'A':
1ed8eac0
JF
3110 case 'C':
3111 case 'X':
a0d0e21e
LW
3112 case 'G':
3113 case 'Z':
b85d18e9 3114 case 'z':
a0d0e21e
LW
3115 case 'w':
3116 case 'W':
3117 case 'b':
3118 case 'B':
3119 case 's':
3120 case 'S':
3121 case 'd':
3122 case 'D':
a14b48bc
LW
3123 case 'p':
3124 case 'P':
a0d0e21e
LW
3125 --p;
3126 goto loopdone;
3127 case 'n':
3128 ender = '\n';
3129 p++;
a687059c 3130 break;
a0d0e21e
LW
3131 case 'r':
3132 ender = '\r';
3133 p++;
a687059c 3134 break;
a0d0e21e
LW
3135 case 't':
3136 ender = '\t';
3137 p++;
a687059c 3138 break;
a0d0e21e
LW
3139 case 'f':
3140 ender = '\f';
3141 p++;
a687059c 3142 break;
a0d0e21e 3143 case 'e':
c7f1f016 3144 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 3145 p++;
a687059c 3146 break;
a0d0e21e 3147 case 'a':
c7f1f016 3148 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 3149 p++;
a687059c 3150 break;
a0d0e21e 3151 case 'x':
a0ed51b3
LW
3152 if (*++p == '{') {
3153 char* e = strchr(p, '}');
b81d288d 3154
b45f050a 3155 if (!e) {
830247a4 3156 RExC_parse = p + 1;
b45f050a
JF
3157 vFAIL("Missing right brace on \\x{}");
3158 }
de5f0749 3159 else {
a4c04bdc
NC
3160 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3161 | PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3162 numlen = e - p - 1;
3163 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
3164 if (ender > 0xff)
3165 RExC_utf8 = 1;
a0ed51b3
LW
3166 p = e + 1;
3167 }
a0ed51b3
LW
3168 }
3169 else {
a4c04bdc 3170 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3171 numlen = 2;
3172 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
3173 p += numlen;
3174 }
a687059c 3175 break;
a0d0e21e
LW
3176 case 'c':
3177 p++;
bbce6d69 3178 ender = UCHARAT(p++);
3179 ender = toCTRL(ender);
a687059c 3180 break;
a0d0e21e
LW
3181 case '0': case '1': case '2': case '3':case '4':
3182 case '5': case '6': case '7': case '8':case '9':
3183 if (*p == '0' ||
830247a4 3184 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1
NC
3185 I32 flags = 0;
3186 numlen = 3;
3187 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
3188 p += numlen;
3189 }
3190 else {
3191 --p;
3192 goto loopdone;
a687059c
LW
3193 }
3194 break;
a0d0e21e 3195 case '\0':
830247a4 3196 if (p >= RExC_end)
b45f050a 3197 FAIL("Trailing \\");
a687059c 3198 /* FALL THROUGH */
a0d0e21e 3199 default:
e476b1b5 3200 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 3201 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 3202 goto normal_default;
a0d0e21e
LW
3203 }
3204 break;
a687059c 3205 default:
a0ed51b3 3206 normal_default:
fd400ab9 3207 if (UTF8_IS_START(*p) && UTF) {
5e12f4fb 3208 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 3209 &numlen, 0);
a0ed51b3
LW
3210 p += numlen;
3211 }
3212 else
3213 ender = *p++;
a0d0e21e 3214 break;
a687059c 3215 }
e2509266 3216 if (RExC_flags & PMf_EXTENDED)
830247a4 3217 p = regwhite(p, RExC_end);
60a8b682
JH
3218 if (UTF && FOLD) {
3219 /* Prime the casefolded buffer. */
ac7e0132 3220 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 3221 }
a0d0e21e
LW
3222 if (ISMULT2(p)) { /* Back off on ?+*. */
3223 if (len)
3224 p = oldp;
16ea2a2e 3225 else if (UTF) {
0ebc6274
JH
3226 STRLEN unilen;
3227
80aecb99 3228 if (FOLD) {
60a8b682 3229 /* Emit all the Unicode characters. */
80aecb99
JH
3230 for (foldbuf = tmpbuf;
3231 foldlen;
3232 foldlen -= numlen) {
3233 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3234 if (numlen > 0) {
0ebc6274
JH
3235 reguni(pRExC_state, ender, s, &unilen);
3236 s += unilen;
3237 len += unilen;
3238 /* In EBCDIC the numlen
3239 * and unilen can differ. */
9dc45d57 3240 foldbuf += numlen;
47654450
JH
3241 if (numlen >= foldlen)
3242 break;
9dc45d57
JH
3243 }
3244 else
3245 break; /* "Can't happen." */
80aecb99
JH
3246 }
3247 }
3248 else {
0ebc6274 3249 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3250 if (unilen > 0) {
0ebc6274
JH
3251 s += unilen;
3252 len += unilen;
9dc45d57 3253 }
80aecb99 3254 }
a0ed51b3 3255 }
a0d0e21e
LW
3256 else {
3257 len++;
eb160463 3258 REGC((char)ender, s++);
a0d0e21e
LW
3259 }
3260 break;
a687059c 3261 }
16ea2a2e 3262 if (UTF) {
0ebc6274
JH
3263 STRLEN unilen;
3264
80aecb99 3265 if (FOLD) {
60a8b682 3266 /* Emit all the Unicode characters. */
80aecb99
JH
3267 for (foldbuf = tmpbuf;
3268 foldlen;
3269 foldlen -= numlen) {
3270 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 3271 if (numlen > 0) {
0ebc6274
JH
3272 reguni(pRExC_state, ender, s, &unilen);
3273 len += unilen;
3274 s += unilen;
3275 /* In EBCDIC the numlen
3276 * and unilen can differ. */
9dc45d57 3277 foldbuf += numlen;
47654450
JH
3278 if (numlen >= foldlen)
3279 break;
9dc45d57
JH
3280 }
3281 else
3282 break;
80aecb99
JH
3283 }
3284 }
3285 else {
0ebc6274 3286 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 3287 if (unilen > 0) {
0ebc6274
JH
3288 s += unilen;
3289 len += unilen;
9dc45d57 3290 }
80aecb99
JH
3291 }
3292 len--;
a0ed51b3
LW
3293 }
3294 else
eb160463 3295 REGC((char)ender, s++);
a0d0e21e
LW
3296 }
3297 loopdone:
830247a4 3298 RExC_parse = p - 1;
fac92740 3299 Set_Node_Cur_Length(ret); /* MJD */
830247a4 3300 nextchar(pRExC_state);
793db0cb
JH
3301 {
3302 /* len is STRLEN which is unsigned, need to copy to signed */
3303 IV iv = len;
3304 if (iv < 0)
3305 vFAIL("Internal disaster");
3306 }
a0d0e21e
LW
3307 if (len > 0)
3308 *flagp |= HASWIDTH;
090f7165 3309 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 3310 *flagp |= SIMPLE;
c277df42 3311 if (!SIZE_ONLY)
cd439c50
IZ
3312 STR_LEN(ret) = len;
3313 if (SIZE_ONLY)
830247a4 3314 RExC_size += STR_SZ(len);
cd439c50 3315 else
830247a4 3316 RExC_emit += STR_SZ(len);
a687059c 3317 }
a0d0e21e
LW
3318 break;
3319 }
a687059c 3320
60a8b682
JH
3321 /* If the encoding pragma is in effect recode the text of
3322 * any EXACT-kind nodes. */
22c54be3 3323 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
3324 STRLEN oldlen = STR_LEN(ret);
3325 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3326
3327 if (RExC_utf8)
3328 SvUTF8_on(sv);
3329 if (sv_utf8_downgrade(sv, TRUE)) {
3330 char *s = sv_recode_to_utf8(sv, PL_encoding);
3331 STRLEN newlen = SvCUR(sv);
3332
3333 if (SvUTF8(sv))
3334 RExC_utf8 = 1;
3335 if (!SIZE_ONLY) {
3336 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3337 (int)oldlen, STRING(ret),
3338 (int)newlen, s));
3339 Copy(s, STRING(ret), newlen, char);
3340 STR_LEN(ret) += newlen - oldlen;
3341 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3342 } else
3343 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3344 }
a72c7584
JH
3345 }
3346
a0d0e21e 3347 return(ret);
a687059c
LW
3348}
3349
873ef191 3350STATIC char *
cea2e8a9 3351S_regwhite(pTHX_ char *p, char *e)
5b5a24f7
CS
3352{
3353 while (p < e) {
3354 if (isSPACE(*p))
3355 ++p;
3356 else if (*p == '#') {
3357 do {
3358 p++;
3359 } while (p < e && *p != '\n');
3360 }
3361 else
3362 break;
3363 }
3364 return p;
3365}
3366
b8c5462f
JH
3367/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3368 Character classes ([:foo:]) can also be negated ([:^foo:]).
3369 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3370 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 3371 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
3372
3373#define POSIXCC_DONE(c) ((c) == ':')
3374#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3375#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3376
b8c5462f 3377STATIC I32
830247a4 3378S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
3379{
3380 char *posixcc = 0;
936ed897 3381 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 3382
830247a4 3383 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 3384 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b
JH
3385 POSIXCC(UCHARAT(RExC_parse))) {
3386 char c = UCHARAT(RExC_parse);
830247a4 3387 char* s = RExC_parse++;
b81d288d 3388
9a86a77b 3389 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
3390 RExC_parse++;
3391 if (RExC_parse == RExC_end)
620e46c5 3392 /* Grandfather lone [:, [=, [. */
830247a4 3393 RExC_parse = s;
620e46c5 3394 else {
830247a4 3395 char* t = RExC_parse++; /* skip over the c */
b8c5462f 3396
9a86a77b 3397 if (UCHARAT(RExC_parse) == ']') {
830247a4 3398 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
3399 posixcc = s + 1;
3400 if (*s == ':') {
3401 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3402 I32 skip = 5; /* the most common skip */
3403
3404 switch (*posixcc) {
3405 case 'a':
3406 if (strnEQ(posixcc, "alnum", 5))
3407 namedclass =
3408 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3409 else if (strnEQ(posixcc, "alpha", 5))
3410 namedclass =
3411 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3412 else if (strnEQ(posixcc, "ascii", 5))
3413 namedclass =
3414 complement ? ANYOF_NASCII : ANYOF_ASCII;
3415 break;
aaa51d5e
JF
3416 case 'b':
3417 if (strnEQ(posixcc, "blank", 5))
3418 namedclass =
3419 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3420 break;
b8c5462f
JH
3421 case 'c':
3422 if (strnEQ(posixcc, "cntrl", 5))
3423 namedclass =
3424 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3425 break;
3426 case 'd':
3427 if (strnEQ(posixcc, "digit", 5))
3428 namedclass =
3429 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3430 break;
3431 case 'g':
3432 if (strnEQ(posixcc, "graph", 5))
3433 namedclass =
3434 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3435 break;
3436 case 'l':
3437 if (strnEQ(posixcc, "lower", 5))
3438 namedclass =
3439 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3440 break;
3441 case 'p':
3442 if (strnEQ(posixcc, "print", 5))
3443 namedclass =
3444 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3445 else if (strnEQ(posixcc, "punct", 5))
3446 namedclass =
3447 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3448 break;
3449 case 's':
3450 if (strnEQ(posixcc, "space", 5))
3451 namedclass =
aaa51d5e 3452 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
cc4319de 3453 break;
b8c5462f
JH
3454 case 'u':
3455 if (strnEQ(posixcc, "upper", 5))
3456 namedclass =
3457 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3458 break;
3459 case 'w': /* this is not POSIX, this is the Perl \w */
3460 if (strnEQ(posixcc, "word", 4)) {
3461 namedclass =
3462 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3463 skip = 4;
3464 }
3465 break;
3466 case 'x':
3467 if (strnEQ(posixcc, "xdigit", 6)) {
3468 namedclass =
3469 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3470 skip = 6;
3471 }
3472 break;
3473 }
ac561586
JH
3474 if (namedclass == OOB_NAMEDCLASS ||
3475 posixcc[skip] != ':' ||
3476 posixcc[skip+1] != ']')
b45f050a
JF
3477 {
3478 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3479 t - s - 1, s + 1);
3480 }
3481 } else if (!SIZE_ONLY) {
b8c5462f 3482 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 3483
830247a4 3484 /* adjust RExC_parse so the warning shows after
b45f050a 3485 the class closes */
9a86a77b 3486 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 3487 RExC_parse++;
b45f050a
JF
3488 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3489 }
b8c5462f
JH
3490 } else {
3491 /* Maternal grandfather:
3492 * "[:" ending in ":" but not in ":]" */
830247a4 3493 RExC_parse = s;
767d463e 3494 }
620e46c5
JH
3495 }
3496 }
3497
b8c5462f
JH
3498 return namedclass;
3499}
3500
3501STATIC void
830247a4 3502S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 3503{
b938889d 3504 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
830247a4 3505 char *s = RExC_parse;
93733859 3506 char c = *s++;
b8c5462f
JH
3507
3508 while(*s && isALNUM(*s))
3509 s++;
3510 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
3511 if (ckWARN(WARN_REGEXP))
3512 vWARN3(s+2,
3513 "POSIX syntax [%c %c] belongs inside character classes",
3514 c, c);
b45f050a
JF
3515
3516 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 3517 if (POSIXCC_NOTYET(c)) {
830247a4 3518 /* adjust RExC_parse so the error shows after
b45f050a 3519 the class closes */
9a86a77b 3520 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
3521 ;
3522 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3523 }
b8c5462f
JH
3524 }
3525 }
620e46c5
JH
3526}
3527
76e3520e 3528STATIC regnode *
830247a4 3529S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 3530{
ffc61ed2 3531 register UV value;
9a86a77b 3532 register UV nextvalue;
3568d838 3533 register IV prevvalue = OOB_UNICODE;
ffc61ed2 3534 register IV range = 0;
c277df42 3535 register regnode *ret;
ba210ebe 3536 STRLEN numlen;
ffc61ed2 3537 IV namedclass;
9c5ffd7c 3538 char *rangebegin = 0;
936ed897 3539 bool need_class = 0;
9c5ffd7c 3540 SV *listsv = Nullsv;
ffc61ed2
JH
3541 register char *e;
3542 UV n;
9e55ce06
JH
3543 bool optimize_invert = TRUE;
3544 AV* unicode_alternate = 0;
1b2d223b
JH
3545#ifdef EBCDIC
3546 UV literal_endpoint = 0;
3547#endif
ffc61ed2
JH
3548
3549 ret = reganode(pRExC_state, ANYOF, 0);
3550
3551 if (!SIZE_ONLY)
3552 ANYOF_FLAGS(ret) = 0;
3553
9a86a77b 3554 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
3555 RExC_naughty++;
3556 RExC_parse++;
3557 if (!SIZE_ONLY)
3558 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3559 }
a0d0e21e 3560
936ed897 3561 if (SIZE_ONLY)
830247a4 3562 RExC_size += ANYOF_SKIP;
936ed897 3563 else {
830247a4 3564 RExC_emit += ANYOF_SKIP;
936ed897
IZ
3565 if (FOLD)
3566 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3567 if (LOC)
3568 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
3569 ANYOF_BITMAP_ZERO(ret);
3570 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 3571 }
b8c5462f 3572
9a86a77b
JH
3573 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3574
b938889d 3575 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 3576 checkposixcc(pRExC_state);
b8c5462f 3577
f064b6ad
HS
3578 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3579 if (UCHARAT(RExC_parse) == ']')
3580 goto charclassloop;
ffc61ed2 3581
9a86a77b 3582 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
3583
3584 charclassloop:
3585
3586 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3587
73b437c8 3588 if (!range)
830247a4 3589 rangebegin = RExC_parse;
ffc61ed2 3590 if (UTF) {
5e12f4fb 3591 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
3592 RExC_end - RExC_parse,
3593 &numlen, 0);
ffc61ed2
JH
3594 RExC_parse += numlen;
3595 }
3596 else
3597 value = UCHARAT(RExC_parse++);
9a86a77b
JH
3598 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3599 if (value == '[' && POSIXCC(nextvalue))
830247a4 3600 namedclass = regpposixcc(pRExC_state, value);
620e46c5 3601 else if (value == '\\') {
ffc61ed2 3602 if (UTF) {
5e12f4fb 3603 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
3604 RExC_end - RExC_parse,
3605 &numlen, 0);
3606 RExC_parse += numlen;
3607 }
3608 else
3609 value = UCHARAT(RExC_parse++);
470c3474 3610 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 3611 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
3612 * be a problem later if we want switch on Unicode.
3613 * A similar issue a little bit later when switching on
3614 * namedclass. --jhi */
ffc61ed2 3615 switch ((I32)value) {
b8c5462f
JH
3616 case 'w': namedclass = ANYOF_ALNUM; break;
3617 case 'W': namedclass = ANYOF_NALNUM; break;
3618 case 's': namedclass = ANYOF_SPACE; break;
3619 case 'S': namedclass = ANYOF_NSPACE; break;
3620 case 'd': namedclass = ANYOF_DIGIT; break;
3621 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
3622 case 'p':
3623 case 'P':
af6f566e 3624 if (RExC_parse >= RExC_end)
2a4859cd 3625 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 3626 if (*RExC_parse == '{') {
0da60cf5 3627 U8 c = (U8)value;
ffc61ed2
JH
3628 e = strchr(RExC_parse++, '}');
3629 if (!e)
0da60cf5 3630 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
3631 while (isSPACE(UCHARAT(RExC_parse)))
3632 RExC_parse++;
3633 if (e == RExC_parse)
0da60cf5 3634 vFAIL2("Empty \\%c{}", c);
ffc61ed2 3635 n = e - RExC_parse;
ab13f0c7
JH
3636 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3637 n--;
ffc61ed2
JH
3638 }
3639 else {
3640 e = RExC_parse;
3641 n = 1;
3642 }
3643 if (!SIZE_ONLY) {
ab13f0c7
JH
3644 if (UCHARAT(RExC_parse) == '^') {
3645 RExC_parse++;
3646 n--;
3647 value = value == 'p' ? 'P' : 'p'; /* toggle */
3648 while (isSPACE(UCHARAT(RExC_parse))) {
3649 RExC_parse++;
3650 n--;
3651 }
3652 }
ffc61ed2 3653 if (value == 'p')
ab13f0c7
JH
3654 Perl_sv_catpvf(aTHX_ listsv,
3655 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 3656 else
ab13f0c7
JH
3657 Perl_sv_catpvf(aTHX_ listsv,
3658 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
3659 }
3660 RExC_parse = e + 1;
3661 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3662 continue;
b8c5462f
JH
3663 case 'n': value = '\n'; break;
3664 case 'r': value = '\r'; break;
3665 case 't': value = '\t'; break;
3666 case 'f': value = '\f'; break;
3667 case 'b': value = '\b'; break;
c7f1f016
NIS
3668 case 'e': value = ASCII_TO_NATIVE('\033');break;
3669 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 3670 case 'x':
ffc61ed2 3671 if (*RExC_parse == '{') {
a4c04bdc
NC
3672 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3673 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 3674 e = strchr(RExC_parse++, '}');
b81d288d 3675 if (!e)
ffc61ed2 3676 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
3677
3678 numlen = e - RExC_parse;
3679 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3680 RExC_parse = e + 1;
3681 }
3682 else {
a4c04bdc 3683 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
3684 numlen = 2;
3685 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
3686 RExC_parse += numlen;
3687 }
b8c5462f
JH
3688 break;
3689 case 'c':
830247a4 3690 value = UCHARAT(RExC_parse++);
b8c5462f
JH
3691 value = toCTRL(value);
3692 break;
3693 case '0': case '1': case '2': case '3': case '4':
3694 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
3695 {
3696 I32 flags = 0;
3697 numlen = 3;
3698 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 3699 RExC_parse += numlen;
b8c5462f 3700 break;
53305cf1 3701 }
1028017a 3702 default:
e476b1b5 3703 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
3704 vWARN2(RExC_parse,
3705 "Unrecognized escape \\%c in character class passed through",
3706 (int)value);
1028017a 3707 break;
b8c5462f 3708 }
ffc61ed2 3709 } /* end of \blah */
1b2d223b
JH
3710#ifdef EBCDIC
3711 else
3712 literal_endpoint++;
3713#endif
ffc61ed2
JH
3714
3715 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3716
3717 if (!SIZE_ONLY && !need_class)
936ed897 3718 ANYOF_CLASS_ZERO(ret);
ffc61ed2 3719
936ed897 3720 need_class = 1;
ffc61ed2
JH
3721
3722 /* a bad range like a-\d, a-[:digit:] ? */
3723 if (range) {
73b437c8 3724 if (!SIZE_ONLY) {
e476b1b5 3725 if (ckWARN(WARN_REGEXP))
830247a4 3726 vWARN4(RExC_parse,
b45f050a 3727 "False [] range \"%*.*s\"",
830247a4
IZ
3728 RExC_parse - rangebegin,
3729 RExC_parse - rangebegin,
b45f050a 3730 rangebegin);
3568d838
JH
3731 if (prevvalue < 256) {
3732 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
3733 ANYOF_BITMAP_SET(ret, '-');
3734 }
3735 else {
3736 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3737 Perl_sv_catpvf(aTHX_ listsv,
3568d838 3738 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 3739 }
b8c5462f 3740 }
ffc61ed2
JH
3741
3742 range = 0; /* this was not a true range */
73b437c8 3743 }
ffc61ed2 3744
73b437c8 3745 if (!SIZE_ONLY) {
3568d838
JH
3746 if (namedclass > OOB_NAMEDCLASS)
3747 optimize_invert = FALSE;
e2962f66
JH
3748 /* Possible truncation here but in some 64-bit environments
3749 * the compiler gets heartburn about switch on 64-bit values.
3750 * A similar issue a little earlier when switching on value.
98f323fa 3751 * --jhi */
e2962f66 3752 switch ((I32)namedclass) {
73b437c8
JH
3753 case ANYOF_ALNUM:
3754 if (LOC)
936ed897 3755 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
3756 else {
3757 for (value = 0; value < 256; value++)
3758 if (isALNUM(value))
936ed897 3759 ANYOF_BITMAP_SET(ret, value);
73b437c8 3760 }
ffc61ed2 3761 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
73b437c8
JH
3762 break;
3763 case ANYOF_NALNUM:
3764 if (LOC)
936ed897 3765 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
3766 else {
3767 for (value = 0; value < 256; value++)
3768 if (!isALNUM(value))
936ed897 3769 ANYOF_BITMAP_SET(ret, value);
73b437c8 3770 }
ffc61ed2 3771 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
73b437c8 3772 break;
ffc61ed2 3773 case ANYOF_ALNUMC:
73b437c8 3774 if (LOC)
ffc61ed2 3775 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
3776 else {
3777 for (value = 0; value < 256; value++)
ffc61ed2 3778 if (isALNUMC(value))
936ed897 3779 ANYOF_BITMAP_SET(ret, value);
73b437c8 3780 }
ffc61ed2 3781 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
73b437c8
JH
3782 break;
3783 case ANYOF_NALNUMC:
3784 if (LOC)
936ed897 3785 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
3786 else {
3787 for (value = 0; value < 256; value++)
3788 if (!isALNUMC(value))
936ed897 3789 ANYOF_BITMAP_SET(ret, value);
73b437c8 3790 }
ffc61ed2 3791 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
73b437c8
JH
3792 break;
3793 case ANYOF_ALPHA:
3794 if (LOC)
936ed897 3795 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
3796 else {
3797 for (value = 0; value < 256; value++)
3798 if (isALPHA(value))
936ed897 3799 ANYOF_BITMAP_SET(ret, value);
73b437c8 3800 }
ffc61ed2 3801 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
73b437c8
JH
3802 break;
3803 case ANYOF_NALPHA:
3804 if (LOC)
936ed897 3805 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
3806 else {
3807 for (value = 0; value < 256; value++)
3808 if (!isALPHA(value))
936ed897 3809 ANYOF_BITMAP_SET(ret, value);
73b437c8 3810 }
ffc61ed2 3811 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
73b437c8
JH
3812 break;
3813 case ANYOF_ASCII:
3814 if (LOC)
936ed897 3815 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 3816 else {
c7f1f016 3817#ifndef EBCDIC
1ba5c669
JH
3818 for (value = 0; value < 128; value++)
3819 ANYOF_BITMAP_SET(ret, value);
3820#else /* EBCDIC */
ffbc6a93 3821 for (value = 0; value < 256; value++) {
3a3c4447
JH
3822 if (isASCII(value))
3823 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3824 }
1ba5c669 3825#endif /* EBCDIC */
73b437c8 3826 }
ffc61ed2 3827 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
73b437c8
JH
3828 break;
3829 case ANYOF_NASCII:
3830 if (LOC)
936ed897 3831 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 3832 else {
c7f1f016 3833#ifndef EBCDIC
1ba5c669
JH
3834 for (value = 128; value < 256; value++)
3835 ANYOF_BITMAP_SET(ret, value);
3836#else /* EBCDIC */
ffbc6a93 3837 for (value = 0; value < 256; value++) {
3a3c4447
JH
3838 if (!isASCII(value))
3839 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 3840 }
1ba5c669 3841#endif /* EBCDIC */
73b437c8 3842 }
ffc61ed2 3843 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
73b437c8 3844 break;
aaa51d5e
JF
3845 case ANYOF_BLANK:
3846 if (LOC)
3847 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3848 else {
3849 for (value = 0; value < 256; value++)
3850 if (isBLANK(value))
3851 ANYOF_BITMAP_SET(ret, value);
3852 }
ffc61ed2 3853 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
aaa51d5e
JF
3854 break;
3855 case ANYOF_NBLANK:
3856 if (LOC)
3857 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3858 else {
3859 for (value = 0; value < 256; value++)
3860 if (!isBLANK(value))
3861 ANYOF_BITMAP_SET(ret, value);
3862 }
ffc61ed2 3863 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
aaa51d5e 3864 break;
73b437c8
JH
3865 case ANYOF_CNTRL:
3866 if (LOC)
936ed897 3867 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
3868 else {
3869 for (value = 0; value < 256; value++)
3870 if (isCNTRL(value))
936ed897 3871 ANYOF_BITMAP_SET(ret, value);
73b437c8 3872 }
ffc61ed2 3873 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
73b437c8
JH
3874 break;
3875 case ANYOF_NCNTRL:
3876 if (LOC)
936ed897 3877 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
3878 else {
3879 for (value = 0; value < 256; value++)
3880 if (!isCNTRL(value))
936ed897 3881 ANYOF_BITMAP_SET(ret, value);
73b437c8 3882 }
ffc61ed2
JH
3883 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3884 break;
3885 case ANYOF_DIGIT:
3886 if (LOC)
3887 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3888 else {
3889 /* consecutive digits assumed */
3890 for (value = '0'; value <= '9'; value++)
3891 ANYOF_BITMAP_SET(ret, value);
3892 }
3893 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3894 break;
3895 case ANYOF_NDIGIT:
3896 if (LOC)
3897 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3898 else {
3899 /* consecutive digits assumed */
3900 for (value = 0; value < '0'; value++)
3901 ANYOF_BITMAP_SET(ret, value);
3902 for (value = '9' + 1; value < 256; value++)
3903 ANYOF_BITMAP_SET(ret, value);
3904 }
3905 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
73b437c8
JH
3906 break;
3907 case ANYOF_GRAPH:
3908 if (LOC)
936ed897 3909 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
3910 else {
3911 for (value = 0; value < 256; value++)
3912 if (isGRAPH(value))
936ed897 3913 ANYOF_BITMAP_SET(ret, value);
73b437c8 3914 }
ffc61ed2 3915 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
73b437c8
JH
3916 break;
3917 case ANYOF_NGRAPH:
3918 if (LOC)
936ed897 3919 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
3920 else {
3921 for (value = 0; value < 256; value++)
3922 if (!isGRAPH(value))
936ed897 3923 ANYOF_BITMAP_SET(ret, value);
73b437c8 3924 }
ffc61ed2 3925 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
73b437c8
JH
3926 break;
3927 case ANYOF_LOWER:
3928 if (LOC)
936ed897 3929 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
3930 else {
3931 for (value = 0; value < 256; value++)
3932 if (isLOWER(value))
936ed897 3933 ANYOF_BITMAP_SET(ret, value);
73b437c8 3934 }
ffc61ed2 3935 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
73b437c8
JH
3936 break;
3937 case ANYOF_NLOWER:
3938 if (LOC)
936ed897 3939 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
3940 else {
3941 for (value = 0; value < 256; value++)
3942 if (!isLOWER(value))
936ed897 3943 ANYOF_BITMAP_SET(ret, value);
73b437c8 3944 }
ffc61ed2 3945 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
73b437c8
JH
3946 break;
3947 case ANYOF_PRINT:
3948 if (LOC)
936ed897 3949 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
3950 else {
3951 for (value = 0; value < 256; value++)
3952 if (isPRINT(value))
936ed897 3953 ANYOF_BITMAP_SET(ret, value);
73b437c8 3954 }
ffc61ed2 3955 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
73b437c8
JH
3956 break;
3957 case ANYOF_NPRINT:
3958 if (LOC)
936ed897 3959 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
3960 else {
3961 for (value = 0; value < 256; value++)
3962 if (!isPRINT(value))
936ed897 3963 ANYOF_BITMAP_SET(ret, value);
73b437c8 3964 }
ffc61ed2 3965 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
73b437c8 3966 break;
aaa51d5e
JF
3967 case ANYOF_PSXSPC:
3968 if (LOC)
3969 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3970 else {
3971 for (value = 0; value < 256; value++)
3972 if (isPSXSPC(value))
3973 ANYOF_BITMAP_SET(ret, value);
3974 }
ffc61ed2 3975 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
aaa51d5e
JF
3976 break;
3977 case ANYOF_NPSXSPC:
3978 if (LOC)
3979 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3980 else {
3981 for (value = 0; value < 256; value++)
3982 if (!isPSXSPC(value))
3983 ANYOF_BITMAP_SET(ret, value);
3984 }
ffc61ed2 3985 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
aaa51d5e 3986 break;
73b437c8
JH
3987 case ANYOF_PUNCT:
3988 if (LOC)
936ed897 3989 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
3990 else {
3991 for (value = 0; value < 256; value++)
3992 if (isPUNCT(value))
936ed897 3993 ANYOF_BITMAP_SET(ret, value);
73b437c8 3994 }
ffc61ed2 3995 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
73b437c8
JH
3996 break;
3997 case ANYOF_NPUNCT:
3998 if (LOC)
936ed897 3999 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
4000 else {
4001 for (value = 0; value < 256; value++)
4002 if (!isPUNCT(value))
936ed897 4003 ANYOF_BITMAP_SET(ret, value);
73b437c8 4004 }
ffc61ed2
JH
4005 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4006 break;
4007 case ANYOF_SPACE:
4008 if (LOC)
4009 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4010 else {
4011 for (value = 0; value < 256; value++)
4012 if (isSPACE(value))
4013 ANYOF_BITMAP_SET(ret, value);
4014 }
4015 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4016 break;
4017 case ANYOF_NSPACE:
4018 if (LOC)
4019 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4020 else {
4021 for (value = 0; value < 256; value++)
4022 if (!isSPACE(value))
4023 ANYOF_BITMAP_SET(ret, value);
4024 }
4025 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
73b437c8
JH
4026 break;
4027 case ANYOF_UPPER:
4028 if (LOC)
936ed897 4029 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
4030 else {
4031 for (value = 0; value < 256; value++)
4032 if (isUPPER(value))
936ed897 4033 ANYOF_BITMAP_SET(ret, value);
73b437c8 4034 }
ffc61ed2 4035 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
73b437c8
JH
4036 break;
4037 case ANYOF_NUPPER:
4038 if (LOC)
936ed897 4039 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
4040 else {
4041 for (value = 0; value < 256; value++)
4042 if (!isUPPER(value))
936ed897 4043 ANYOF_BITMAP_SET(ret, value);
73b437c8 4044 }
ffc61ed2 4045 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
73b437c8
JH
4046 break;
4047 case ANYOF_XDIGIT:
4048 if (LOC)
936ed897 4049 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
4050 else {
4051 for (value = 0; value < 256; value++)
4052 if (isXDIGIT(value))
936ed897 4053 ANYOF_BITMAP_SET(ret, value);
73b437c8 4054 }
ffc61ed2 4055 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
73b437c8
JH
4056 break;
4057 case ANYOF_NXDIGIT:
4058 if (LOC)
936ed897 4059 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
4060 else {
4061 for (value = 0; value < 256; value++)
4062 if (!isXDIGIT(value))
936ed897 4063 ANYOF_BITMAP_SET(ret, value);
73b437c8 4064 }
ffc61ed2 4065 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
73b437c8
JH
4066 break;
4067 default:
b45f050a 4068 vFAIL("Invalid [::] class");
73b437c8 4069 break;
b8c5462f 4070 }
b8c5462f 4071 if (LOC)
936ed897 4072 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 4073 continue;
a0d0e21e 4074 }
ffc61ed2
JH
4075 } /* end of namedclass \blah */
4076
a0d0e21e 4077 if (range) {
eb160463 4078 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 4079 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
4080 RExC_parse - rangebegin,
4081 RExC_parse - rangebegin,
b45f050a 4082 rangebegin);
3568d838 4083 range = 0; /* not a valid range */
73b437c8 4084 }
a0d0e21e
LW
4085 }
4086 else {
3568d838 4087 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
4088 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4089 RExC_parse[1] != ']') {
4090 RExC_parse++;
ffc61ed2
JH
4091
4092 /* a bad range like \w-, [:word:]- ? */
4093 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 4094 if (ckWARN(WARN_REGEXP))
830247a4 4095 vWARN4(RExC_parse,
b45f050a 4096 "False [] range \"%*.*s\"",
830247a4
IZ
4097 RExC_parse - rangebegin,
4098 RExC_parse - rangebegin,
b45f050a 4099 rangebegin);
73b437c8 4100 if (!SIZE_ONLY)
936ed897 4101 ANYOF_BITMAP_SET(ret, '-');
73b437c8 4102 } else
ffc61ed2
JH
4103 range = 1; /* yeah, it's a range! */
4104 continue; /* but do it the next time */
a0d0e21e 4105 }
a687059c 4106 }
ffc61ed2 4107
93733859 4108 /* now is the next time */
ae5c130c 4109 if (!SIZE_ONLY) {
3568d838
JH
4110 IV i;
4111
4112 if (prevvalue < 256) {
4113 IV ceilvalue = value < 256 ? value : 255;
4114
4115#ifdef EBCDIC
1b2d223b
JH
4116 /* In EBCDIC [\x89-\x91] should include
4117 * the \x8e but [i-j] should not. */
4118 if (literal_endpoint == 2 &&
4119 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4120 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 4121 {
3568d838
JH
4122 if (isLOWER(prevvalue)) {
4123 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4124 if (isLOWER(i))
4125 ANYOF_BITMAP_SET(ret, i);
4126 } else {
3568d838 4127 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
4128 if (isUPPER(i))
4129 ANYOF_BITMAP_SET(ret, i);
4130 }
8ada0baa 4131 }
ffc61ed2 4132 else
8ada0baa 4133#endif
a5961de5
JH
4134 for (i = prevvalue; i <= ceilvalue; i++)
4135 ANYOF_BITMAP_SET(ret, i);
3568d838 4136 }
a5961de5 4137 if (value > 255 || UTF) {
b08decb7
JH
4138 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4139 UV natvalue = NATIVE_TO_UNI(value);
4140
ffc61ed2 4141 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 4142 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 4143 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
4144 prevnatvalue, natvalue);
4145 }
4146 else if (prevnatvalue == natvalue) {
4147 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 4148 if (FOLD) {
254ba52a
JH
4149 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4150 STRLEN foldlen;
2f3bf011 4151 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 4152
c840d2a2
JH
4153 /* If folding and foldable and a single
4154 * character, insert also the folded version
4155 * to the charclass. */
9e55ce06 4156 if (f != value) {
eb160463 4157 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
4158 Perl_sv_catpvf(aTHX_ listsv,
4159 "%04"UVxf"\n", f);
4160 else {
4161 /* Any multicharacter foldings
4162 * require the following transform:
4163 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4164 * where E folds into "pq" and F folds
4165 * into "rst", all other characters
4166 * fold to single characters. We save
4167 * away these multicharacter foldings,
4168 * to be later saved as part of the
4169 * additional "s" data. */
4170 SV *sv;
4171
4172 if (!unicode_alternate)
4173 unicode_alternate = newAV();
4174 sv = newSVpvn((char*)foldbuf, foldlen);
4175 SvUTF8_on(sv);
4176 av_push(unicode_alternate, sv);
4177 }
4178 }
254ba52a 4179
60a8b682
JH
4180 /* If folding and the value is one of the Greek
4181 * sigmas insert a few more sigmas to make the
4182 * folding rules of the sigmas to work right.
4183 * Note that not all the possible combinations
4184 * are handled here: some of them are handled
9e55ce06
JH
4185 * by the standard folding rules, and some of
4186 * them (literal or EXACTF cases) are handled
4187 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
4188 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4189 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4190 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 4191 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4192 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4193 }
4194 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4195 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 4196 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
4197 }
4198 }
ffc61ed2 4199 }
1b2d223b
JH
4200#ifdef EBCDIC
4201 literal_endpoint = 0;
4202#endif
8ada0baa 4203 }
ffc61ed2
JH
4204
4205 range = 0; /* this range (if it was one) is done now */
a0d0e21e 4206 }
ffc61ed2 4207
936ed897 4208 if (need_class) {
4f66b38d 4209 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 4210 if (SIZE_ONLY)
830247a4 4211 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 4212 else
830247a4 4213 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 4214 }
ffc61ed2 4215
ae5c130c 4216 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 4217 if (!SIZE_ONLY &&
ffc61ed2 4218 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
4219 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4220 ) {
a0ed51b3 4221 for (value = 0; value < 256; ++value) {
936ed897 4222 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 4223 UV fold = PL_fold[value];
ffc61ed2
JH
4224
4225 if (fold != value)
4226 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
4227 }
4228 }
936ed897 4229 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 4230 }
ffc61ed2 4231
ae5c130c 4232 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 4233 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
4234 /* If the only flag is inversion. */
4235 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 4236 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 4237 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 4238 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 4239 }
a0d0e21e 4240
b81d288d 4241 if (!SIZE_ONLY) {
fde631ed 4242 AV *av = newAV();
ffc61ed2
JH
4243 SV *rv;
4244
9e55ce06 4245 /* The 0th element stores the character class description
6a0407ee 4246 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
4247 * to initialize the appropriate swash (which gets stored in
4248 * the 1st element), and also useful for dumping the regnode.
4249 * The 2nd element stores the multicharacter foldings,
6a0407ee 4250 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
4251 av_store(av, 0, listsv);
4252 av_store(av, 1, NULL);
9e55ce06 4253 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 4254 rv = newRV_noinc((SV*)av);
19860706 4255 n = add_data(pRExC_state, 1, "s");
830247a4 4256 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 4257 ARG_SET(ret, n);
a0ed51b3
LW
4258 }
4259
4260 return ret;
4261}
4262
76e3520e 4263STATIC char*
830247a4 4264S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 4265{
830247a4 4266 char* retval = RExC_parse++;
a0d0e21e 4267
4633a7c4 4268 for (;;) {
830247a4
IZ
4269 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4270 RExC_parse[2] == '#') {
e994fd66
AE
4271 while (*RExC_parse != ')') {
4272 if (RExC_parse == RExC_end)
4273 FAIL("Sequence (?#... not terminated");
830247a4 4274 RExC_parse++;
e994fd66 4275 }
830247a4 4276 RExC_parse++;
4633a7c4
LW
4277 continue;
4278 }
e2509266 4279 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4280 if (isSPACE(*RExC_parse)) {
4281 RExC_parse++;
748a9306
LW
4282 continue;
4283 }
830247a4 4284 else if (*RExC_parse == '#') {
e994fd66
AE
4285 while (RExC_parse < RExC_end)
4286 if (*RExC_parse++ == '\n') break;
748a9306
LW
4287 continue;
4288 }
748a9306 4289 }
4633a7c4 4290 return retval;
a0d0e21e 4291 }
a687059c
LW
4292}
4293
4294/*
c277df42 4295- reg_node - emit a node
a0d0e21e 4296*/
76e3520e 4297STATIC regnode * /* Location. */
830247a4 4298S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 4299{
c277df42
IZ
4300 register regnode *ret;
4301 register regnode *ptr;
a687059c 4302
830247a4 4303 ret = RExC_emit;
c277df42 4304 if (SIZE_ONLY) {
830247a4
IZ
4305 SIZE_ALIGN(RExC_size);
4306 RExC_size += 1;
a0d0e21e
LW
4307 return(ret);
4308 }
a687059c 4309
c277df42 4310 NODE_ALIGN_FILL(ret);
a0d0e21e 4311 ptr = ret;
c277df42 4312 FILL_ADVANCE_NODE(ptr, op);
fac92740 4313 if (RExC_offsets) { /* MJD */
ccb2c380 4314 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
4315 "reg_node", __LINE__,
4316 reg_name[op],
4317 RExC_emit - RExC_emit_start > RExC_offsets[0]
4318 ? "Overwriting end of array!\n" : "OK",
4319 RExC_emit - RExC_emit_start,
4320 RExC_parse - RExC_start,
4321 RExC_offsets[0]));
ccb2c380 4322 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
4323 }
4324
830247a4 4325 RExC_emit = ptr;
a687059c 4326
a0d0e21e 4327 return(ret);
a687059c
LW
4328}
4329
4330/*
a0d0e21e
LW
4331- reganode - emit a node with an argument
4332*/
76e3520e 4333STATIC regnode * /* Location. */
830247a4 4334S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 4335{
c277df42
IZ
4336 register regnode *ret;
4337 register regnode *ptr;
fe14fcc3 4338
830247a4 4339 ret = RExC_emit;
c277df42 4340 if (SIZE_ONLY) {
830247a4
IZ
4341 SIZE_ALIGN(RExC_size);
4342 RExC_size += 2;
a0d0e21e
LW
4343 return(ret);
4344 }
fe14fcc3 4345
c277df42 4346 NODE_ALIGN_FILL(ret);
a0d0e21e 4347 ptr = ret;
c277df42 4348 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 4349 if (RExC_offsets) { /* MJD */
ccb2c380 4350 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4351 "reganode",
ccb2c380
MP
4352 __LINE__,
4353 reg_name[op],
fac92740
MJD
4354 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4355 "Overwriting end of array!\n" : "OK",
4356 RExC_emit - RExC_emit_start,
4357 RExC_parse - RExC_start,
4358 RExC_offsets[0]));
ccb2c380 4359 Set_Cur_Node_Offset;
fac92740
MJD
4360 }
4361
830247a4 4362 RExC_emit = ptr;
fe14fcc3 4363
a0d0e21e 4364 return(ret);
fe14fcc3
LW
4365}
4366
4367/*
cd439c50 4368- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
4369*/
4370STATIC void
830247a4 4371S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 4372{
5e12f4fb 4373 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
4374}
4375
4376/*
a0d0e21e
LW
4377- reginsert - insert an operator in front of already-emitted operand
4378*
4379* Means relocating the operand.
4380*/
76e3520e 4381STATIC void
830247a4 4382S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 4383{
c277df42
IZ
4384 register regnode *src;
4385 register regnode *dst;
4386 register regnode *place;
4387 register int offset = regarglen[(U8)op];
b81d288d 4388
22c35a8c 4389/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
4390
4391 if (SIZE_ONLY) {
830247a4 4392 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
4393 return;
4394 }
a687059c 4395
830247a4
IZ
4396 src = RExC_emit;
4397 RExC_emit += NODE_STEP_REGNODE + offset;
4398 dst = RExC_emit;
fac92740 4399 while (src > opnd) {
c277df42 4400 StructCopy(--src, --dst, regnode);
fac92740 4401 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 4402 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 4403 "reg_insert",
ccb2c380
MP
4404 __LINE__,
4405 reg_name[op],
fac92740
MJD
4406 dst - RExC_emit_start > RExC_offsets[0]
4407 ? "Overwriting end of array!\n" : "OK",
4408 src - RExC_emit_start,
4409 dst - RExC_emit_start,
4410 RExC_offsets[0]));
ccb2c380
MP
4411 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4412 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
4413 }
4414 }
4415
a0d0e21e
LW
4416
4417 place = opnd; /* Op node, where operand used to be. */
fac92740 4418 if (RExC_offsets) { /* MJD */
ccb2c380 4419 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 4420 "reginsert",
ccb2c380
MP
4421 __LINE__,
4422 reg_name[op],
fac92740
MJD
4423 place - RExC_emit_start > RExC_offsets[0]
4424 ? "Overwriting end of array!\n" : "OK",
4425 place - RExC_emit_start,
4426 RExC_parse - RExC_start,
4427 RExC_offsets[0]));
ccb2c380 4428 Set_Node_Offset(place, RExC_parse);
45948336 4429 Set_Node_Length(place, 1);
fac92740 4430 }
c277df42
IZ
4431 src = NEXTOPER(place);
4432 FILL_ADVANCE_NODE(place, op);
4433 Zero(src, offset, regnode);
a687059c
LW
4434}
4435
4436/*
c277df42 4437- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 4438*/
76e3520e 4439STATIC void
830247a4 4440S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4441{
c277df42
IZ
4442 register regnode *scan;
4443 register regnode *temp;
a0d0e21e 4444
c277df42 4445 if (SIZE_ONLY)
a0d0e21e
LW
4446 return;
4447
4448 /* Find last node. */
4449 scan = p;
4450 for (;;) {
4451 temp = regnext(scan);
4452 if (temp == NULL)
4453 break;
4454 scan = temp;
4455 }
a687059c 4456
c277df42
IZ
4457 if (reg_off_by_arg[OP(scan)]) {
4458 ARG_SET(scan, val - scan);
a0ed51b3
LW
4459 }
4460 else {
c277df42
IZ
4461 NEXT_OFF(scan) = val - scan;
4462 }
a687059c
LW
4463}
4464
4465/*
a0d0e21e
LW
4466- regoptail - regtail on operand of first argument; nop if operandless
4467*/
76e3520e 4468STATIC void
830247a4 4469S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 4470{
a0d0e21e 4471 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
4472 if (p == NULL || SIZE_ONLY)
4473 return;
22c35a8c 4474 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 4475 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 4476 }
22c35a8c 4477 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 4478 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
4479 }
4480 else
a0d0e21e 4481 return;
a687059c
LW
4482}
4483
4484/*
4485 - regcurly - a little FSA that accepts {\d+,?\d*}
4486 */
79072805 4487STATIC I32
cea2e8a9 4488S_regcurly(pTHX_ register char *s)
a687059c
LW
4489{
4490 if (*s++ != '{')
4491 return FALSE;
f0fcb552 4492 if (!isDIGIT(*s))
a687059c 4493 return FALSE;
f0fcb552 4494 while (isDIGIT(*s))
a687059c
LW
4495 s++;
4496 if (*s == ',')
4497 s++;
f0fcb552 4498 while (isDIGIT(*s))
a687059c
LW
4499 s++;
4500 if (*s != '}')
4501 return FALSE;
4502 return TRUE;
4503}
4504
a687059c 4505
8fa7f367
JH
4506#ifdef DEBUGGING
4507
76e3520e 4508STATIC regnode *
cea2e8a9 4509S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 4510{
f248d071 4511 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 4512 register regnode *next;
c277df42
IZ
4513
4514 while (op != END && (!last || node < last)) {
4515 /* While that wasn't END last time... */
4516
4517 NODE_ALIGN(node);
4518 op = OP(node);
4519 if (op == CLOSE)
4520 l--;
4521 next = regnext(node);
4522 /* Where, what. */
4523 if (OP(node) == OPTIMIZED)
4524 goto after_print;
4525 regprop(sv, node);
b900a521 4526 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 4527 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
4528 if (next == NULL) /* Next ptr. */
4529 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 4530 else
b900a521 4531 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
4532 (void)PerlIO_putc(Perl_debug_log, '\n');
4533 after_print:
22c35a8c 4534 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
4535 register regnode *nnode = (OP(next) == LONGJMP
4536 ? regnext(next)
c277df42
IZ
4537 : next);
4538 if (last && nnode > last)
4539 nnode = last;
4540 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 4541 }
22c35a8c 4542 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 4543 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3
LW
4544 }
4545 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
4546 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4547 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 4548 }
22c35a8c 4549 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
4550 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4551 next, sv, l + 1);
a0ed51b3
LW
4552 }
4553 else if ( op == PLUS || op == STAR) {
c277df42 4554 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
4555 }
4556 else if (op == ANYOF) {
4f66b38d
HS
4557 /* arglen 1 + class block */
4558 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4559 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4560 node = NEXTOPER(node);
a0ed51b3 4561 }
22c35a8c 4562 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 4563 /* Literal string, where present. */
cd439c50 4564 node += NODE_SZ_STR(node) - 1;
c277df42 4565 node = NEXTOPER(node);
a0ed51b3
LW
4566 }
4567 else {
c277df42
IZ
4568 node = NEXTOPER(node);
4569 node += regarglen[(U8)op];
4570 }
4571 if (op == CURLYX || op == OPEN)
4572 l++;
4573 else if (op == WHILEM)
4574 l--;
4575 }
4576 return node;
4577}
4578
8fa7f367
JH
4579#endif /* DEBUGGING */
4580
a687059c 4581/*
fd181c75 4582 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
4583 */
4584void
864dbfa3 4585Perl_regdump(pTHX_ regexp *r)
a687059c 4586{
35ff7856 4587#ifdef DEBUGGING
46fc3d4c 4588 SV *sv = sv_newmortal();
a687059c 4589
c277df42 4590 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
4591
4592 /* Header fields of interest. */
c277df42 4593 if (r->anchored_substr)
7b0972df 4594 PerlIO_printf(Perl_debug_log,
b81d288d 4595 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 4596 PL_colors[0],
7b0972df 4597 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 4598 SvPVX(r->anchored_substr),
3280af22 4599 PL_colors[1],
c277df42 4600 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 4601 (IV)r->anchored_offset);
33b8afdf
JH
4602 else if (r->anchored_utf8)
4603 PerlIO_printf(Perl_debug_log,
4604 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4605 PL_colors[0],
4606 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4607 SvPVX(r->anchored_utf8),
4608 PL_colors[1],
4609 SvTAIL(r->anchored_utf8) ? "$" : "",
4610 (IV)r->anchored_offset);
c277df42 4611 if (r->float_substr)
7b0972df 4612 PerlIO_printf(Perl_debug_log,
b81d288d 4613 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 4614 PL_colors[0],
b81d288d 4615 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 4616 SvPVX(r->float_substr),
3280af22 4617 PL_colors[1],
c277df42 4618 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 4619 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
4620 else if (r->float_utf8)
4621 PerlIO_printf(Perl_debug_log,
4622 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4623 PL_colors[0],
4624 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4625 SvPVX(r->float_utf8),
4626 PL_colors[1],
4627 SvTAIL(r->float_utf8) ? "$" : "",
4628 (IV)r->float_min_offset, (UV)r->float_max_offset);
4629 if (r->check_substr || r->check_utf8)
b81d288d
AB
4630 PerlIO_printf(Perl_debug_log,
4631 r->check_substr == r->float_substr
33b8afdf 4632 && r->check_utf8 == r->float_utf8
c277df42
IZ
4633 ? "(checking floating" : "(checking anchored");
4634 if (r->reganch & ROPT_NOSCAN)
4635 PerlIO_printf(Perl_debug_log, " noscan");
4636 if (r->reganch & ROPT_CHECK_ALL)
4637 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 4638 if (r->check_substr || r->check_utf8)
c277df42
IZ
4639 PerlIO_printf(Perl_debug_log, ") ");
4640
46fc3d4c 4641 if (r->regstclass) {
4642 regprop(sv, r->regstclass);
4643 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4644 }
774d564b 4645 if (r->reganch & ROPT_ANCH) {
4646 PerlIO_printf(Perl_debug_log, "anchored");
4647 if (r->reganch & ROPT_ANCH_BOL)
4648 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
4649 if (r->reganch & ROPT_ANCH_MBOL)
4650 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
4651 if (r->reganch & ROPT_ANCH_SBOL)
4652 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 4653 if (r->reganch & ROPT_ANCH_GPOS)
4654 PerlIO_printf(Perl_debug_log, "(GPOS)");
4655 PerlIO_putc(Perl_debug_log, ' ');
4656 }
c277df42
IZ
4657 if (r->reganch & ROPT_GPOS_SEEN)
4658 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 4659 if (r->reganch & ROPT_SKIP)
760ac839 4660 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 4661 if (r->reganch & ROPT_IMPLICIT)
760ac839 4662 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 4663 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
4664 if (r->reganch & ROPT_EVAL_SEEN)
4665 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 4666 PerlIO_printf(Perl_debug_log, "\n");
fac92740
MJD
4667 if (r->offsets) {
4668 U32 i;
4669 U32 len = r->offsets[0];
392fbf5d 4670 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
fac92740 4671 for (i = 1; i <= len; i++)
392fbf5d
RB
4672 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4673 (UV)r->offsets[i*2-1],
4674 (UV)r->offsets[i*2]);
fac92740
MJD
4675 PerlIO_printf(Perl_debug_log, "\n");
4676 }
17c3b450 4677#endif /* DEBUGGING */
a687059c
LW
4678}
4679
8fa7f367
JH
4680#ifdef DEBUGGING
4681
653099ff
GS
4682STATIC void
4683S_put_byte(pTHX_ SV *sv, int c)
4684{
7be5a6cf 4685 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
4686 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4687 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4688 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4689 else
4690 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4691}
4692
8fa7f367
JH
4693#endif /* DEBUGGING */
4694
a687059c 4695/*
a0d0e21e
LW
4696- regprop - printable representation of opcode
4697*/
46fc3d4c 4698void
864dbfa3 4699Perl_regprop(pTHX_ SV *sv, regnode *o)
a687059c 4700{
35ff7856 4701#ifdef DEBUGGING
9b155405 4702 register int k;
a0d0e21e 4703
54dc92de 4704 sv_setpvn(sv, "", 0);
9b155405 4705 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
4706 /* It would be nice to FAIL() here, but this may be called from
4707 regexec.c, and it would be hard to supply pRExC_state. */
4708 Perl_croak(aTHX_ "Corrupted regexp opcode");
9b155405
IZ
4709 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4710
4711 k = PL_regkind[(U8)OP(o)];
4712
2a782b5b
JH
4713 if (k == EXACT) {
4714 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
4715 /* Using is_utf8_string() is a crude hack but it may
4716 * be the best for now since we have no flag "this EXACTish
4717 * node was UTF-8" --jhi */
4718 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
8a989385 4719 char *s = do_utf8 ?
c728cb41
JH
4720 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4721 UNI_DISPLAY_REGEX) :
2a782b5b 4722 STRING(o);
40eddc46 4723 int len = do_utf8 ?
2a782b5b
JH
4724 strlen(s) :
4725 STR_LEN(o);
4726 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4727 PL_colors[0],
4728 len, s,
4729 PL_colors[1]);
4730 }
9b155405 4731 else if (k == CURLY) {
cb434fcc 4732 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
4733 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4734 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 4735 }
2c2d71f5
JH
4736 else if (k == WHILEM && o->flags) /* Ordinal/of */
4737 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 4738 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 4739 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 4740 else if (k == LOGICAL)
04ebc1ab 4741 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
4742 else if (k == ANYOF) {
4743 int i, rangestart = -1;
ffc61ed2 4744 U8 flags = ANYOF_FLAGS(o);
a6d05634 4745 const char * const anyofs[] = { /* Should be synchronized with
19860706 4746 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
4747 "\\w",
4748 "\\W",
4749 "\\s",
4750 "\\S",
4751 "\\d",
4752 "\\D",
4753 "[:alnum:]",
4754 "[:^alnum:]",
4755 "[:alpha:]",
4756 "[:^alpha:]",
4757 "[:ascii:]",
4758 "[:^ascii:]",
4759 "[:ctrl:]",
4760 "[:^ctrl:]",
4761 "[:graph:]",
4762 "[:^graph:]",
4763 "[:lower:]",
4764 "[:^lower:]",
4765 "[:print:]",
4766 "[:^print:]",
4767 "[:punct:]",
4768 "[:^punct:]",
4769 "[:upper:]",
aaa51d5e 4770 "[:^upper:]",
653099ff 4771 "[:xdigit:]",
aaa51d5e
JF
4772 "[:^xdigit:]",
4773 "[:space:]",
4774 "[:^space:]",
4775 "[:blank:]",
4776 "[:^blank:]"
653099ff
GS
4777 };
4778
19860706 4779 if (flags & ANYOF_LOCALE)
653099ff 4780 sv_catpv(sv, "{loc}");
19860706 4781 if (flags & ANYOF_FOLD)
653099ff
GS
4782 sv_catpv(sv, "{i}");
4783 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 4784 if (flags & ANYOF_INVERT)
653099ff 4785 sv_catpv(sv, "^");
ffc61ed2
JH
4786 for (i = 0; i <= 256; i++) {
4787 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4788 if (rangestart == -1)
4789 rangestart = i;
4790 } else if (rangestart != -1) {
4791 if (i <= rangestart + 3)
4792 for (; rangestart < i; rangestart++)
653099ff 4793 put_byte(sv, rangestart);
ffc61ed2
JH
4794 else {
4795 put_byte(sv, rangestart);
4796 sv_catpv(sv, "-");
4797 put_byte(sv, i - 1);
653099ff 4798 }
ffc61ed2 4799 rangestart = -1;
653099ff 4800 }
847a199f 4801 }
ffc61ed2
JH
4802
4803 if (o->flags & ANYOF_CLASS)
4804 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4805 if (ANYOF_CLASS_TEST(o,i))
4806 sv_catpv(sv, anyofs[i]);
4807
4808 if (flags & ANYOF_UNICODE)
4809 sv_catpv(sv, "{unicode}");
1aa99e6b 4810 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 4811 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
4812
4813 {
4814 SV *lv;
9e55ce06 4815 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 4816
ffc61ed2
JH
4817 if (lv) {
4818 if (sw) {
ffc61ed2 4819 U8 s[UTF8_MAXLEN+1];
b81d288d 4820
ffc61ed2 4821 for (i = 0; i <= 256; i++) { /* just the first 256 */
2b9d42f0 4822 U8 *e = uvchr_to_utf8(s, i);
ffc61ed2 4823
3568d838 4824 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
4825 if (rangestart == -1)
4826 rangestart = i;
4827 } else if (rangestart != -1) {
4828 U8 *p;
b81d288d 4829
ffc61ed2
JH
4830 if (i <= rangestart + 3)
4831 for (; rangestart < i; rangestart++) {
2b9d42f0 4832 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4833 put_byte(sv, *p);
4834 }
4835 else {
2b9d42f0 4836 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
4837 put_byte(sv, *p);
4838 sv_catpv(sv, "-");
2b9d42f0 4839 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
ffc61ed2
JH
4840 put_byte(sv, *p);
4841 }
4842 rangestart = -1;
4843 }
19860706 4844 }
ffc61ed2
JH
4845
4846 sv_catpv(sv, "..."); /* et cetera */
19860706 4847 }
fde631ed 4848
ffc61ed2
JH
4849 {
4850 char *s = savepv(SvPVX(lv));
4851 char *origs = s;
b81d288d 4852
ffc61ed2 4853 while(*s && *s != '\n') s++;
b81d288d 4854
ffc61ed2
JH
4855 if (*s == '\n') {
4856 char *t = ++s;
4857
4858 while (*s) {
4859 if (*s == '\n')
4860 *s = ' ';
4861 s++;
4862 }
4863 if (s[-1] == ' ')
4864 s[-1] = 0;
4865
4866 sv_catpv(sv, t);
fde631ed 4867 }
b81d288d 4868
ffc61ed2 4869 Safefree(origs);
fde631ed
JH
4870 }
4871 }
653099ff 4872 }
ffc61ed2 4873
653099ff
GS
4874 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4875 }
9b155405 4876 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 4877 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 4878#endif /* DEBUGGING */
35ff7856 4879}
a687059c 4880
cad2e5aa
JH
4881SV *
4882Perl_re_intuit_string(pTHX_ regexp *prog)
4883{ /* Assume that RE_INTUIT is set */
4884 DEBUG_r(
4885 { STRLEN n_a;
33b8afdf
JH
4886 char *s = SvPV(prog->check_substr
4887 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
4888
4889 if (!PL_colorset) reginitcolors();
4890 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
4891 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4892 PL_colors[4],
4893 prog->check_substr ? "" : "utf8 ",
4894 PL_colors[5],PL_colors[0],
cad2e5aa
JH
4895 s,
4896 PL_colors[1],
4897 (strlen(s) > 60 ? "..." : ""));
4898 } );
4899
33b8afdf 4900 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
4901}
4902
2b69d0c2 4903void
864dbfa3 4904Perl_pregfree(pTHX_ struct regexp *r)
a687059c 4905{
9e55ce06
JH
4906#ifdef DEBUGGING
4907 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4908#endif
7821416a
IZ
4909
4910 if (!r || (--r->refcnt > 0))
4911 return;
9e55ce06 4912 DEBUG_r({
d103360b
HS
4913 int len;
4914 char *s;
4915
4916 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4917 r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 4918 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
d103360b 4919 len = SvCUR(dsv);
9e55ce06
JH
4920 if (!PL_colorset)
4921 reginitcolors();
4922 PerlIO_printf(Perl_debug_log,
4923 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4924 PL_colors[4],PL_colors[5],PL_colors[0],
4925 len, len, s,
4926 PL_colors[1],
4927 len > 60 ? "..." : "");
4928 });
cad2e5aa 4929
c277df42 4930 if (r->precomp)
a0d0e21e 4931 Safefree(r->precomp);
fac92740
MJD
4932 if (r->offsets) /* 20010421 MJD */
4933 Safefree(r->offsets);
ed252734
NC
4934 RX_MATCH_COPY_FREE(r);
4935#ifdef PERL_COPY_ON_WRITE
4936 if (r->saved_copy)
4937 SvREFCNT_dec(r->saved_copy);
4938#endif
a193d654
GS
4939 if (r->substrs) {
4940 if (r->anchored_substr)
4941 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
4942 if (r->anchored_utf8)
4943 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
4944 if (r->float_substr)
4945 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
4946 if (r->float_utf8)
4947 SvREFCNT_dec(r->float_utf8);
2779dcf1 4948 Safefree(r->substrs);
a193d654 4949 }
c277df42
IZ
4950 if (r->data) {
4951 int n = r->data->count;
f3548bdc
DM
4952 PAD* new_comppad = NULL;
4953 PAD* old_comppad;
dfad63ad 4954
c277df42 4955 while (--n >= 0) {
261faec3 4956 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
4957 switch (r->data->what[n]) {
4958 case 's':
4959 SvREFCNT_dec((SV*)r->data->data[n]);
4960 break;
653099ff
GS
4961 case 'f':
4962 Safefree(r->data->data[n]);
4963 break;
dfad63ad
HS
4964 case 'p':
4965 new_comppad = (AV*)r->data->data[n];
4966 break;
c277df42 4967 case 'o':
dfad63ad 4968 if (new_comppad == NULL)
cea2e8a9 4969 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
4970 PAD_SAVE_LOCAL(old_comppad,
4971 /* Watch out for global destruction's random ordering. */
4972 (SvTYPE(new_comppad) == SVt_PVAV) ?
4973 new_comppad : Null(PAD *)
4974 );
9b978d73
DM
4975 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4976 op_free((OP_4tree*)r->data->data[n]);
4977 }
4978
f3548bdc 4979 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
4980 SvREFCNT_dec((SV*)new_comppad);
4981 new_comppad = NULL;
c277df42
IZ
4982 break;
4983 case 'n':
9e55ce06 4984 break;
c277df42 4985 default:
830247a4 4986 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
4987 }
4988 }
4989 Safefree(r->data->what);
4990 Safefree(r->data);
a0d0e21e
LW
4991 }
4992 Safefree(r->startp);
4993 Safefree(r->endp);
4994 Safefree(r);
a687059c 4995}
c277df42
IZ
4996
4997/*
4998 - regnext - dig the "next" pointer out of a node
4999 *
5000 * [Note, when REGALIGN is defined there are two places in regmatch()
5001 * that bypass this code for speed.]
5002 */
5003regnode *
864dbfa3 5004Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
5005{
5006 register I32 offset;
5007
3280af22 5008 if (p == &PL_regdummy)
c277df42
IZ
5009 return(NULL);
5010
5011 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5012 if (offset == 0)
5013 return(NULL);
5014
c277df42 5015 return(p+offset);
c277df42
IZ
5016}
5017
01f988be 5018STATIC void
cea2e8a9 5019S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
5020{
5021 va_list args;
5022 STRLEN l1 = strlen(pat1);
5023 STRLEN l2 = strlen(pat2);
5024 char buf[512];
06bf62c7 5025 SV *msv;
c277df42
IZ
5026 char *message;
5027
5028 if (l1 > 510)
5029 l1 = 510;
5030 if (l1 + l2 > 510)
5031 l2 = 510 - l1;
5032 Copy(pat1, buf, l1 , char);
5033 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
5034 buf[l1 + l2] = '\n';
5035 buf[l1 + l2 + 1] = '\0';
8736538c
AS
5036#ifdef I_STDARG
5037 /* ANSI variant takes additional second argument */
c277df42 5038 va_start(args, pat2);
8736538c
AS
5039#else
5040 va_start(args);
5041#endif
5a844595 5042 msv = vmess(buf, &args);
c277df42 5043 va_end(args);
06bf62c7 5044 message = SvPV(msv,l1);
c277df42
IZ
5045 if (l1 > 512)
5046 l1 = 512;
5047 Copy(message, buf, l1 , char);
197cf9b9 5048 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 5049 Perl_croak(aTHX_ "%s", buf);
c277df42 5050}
a0ed51b3
LW
5051
5052/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5053
5054void
864dbfa3 5055Perl_save_re_context(pTHX)
b81d288d 5056{
830247a4 5057 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 5058 SAVEPPTR(PL_bostr);
a0ed51b3
LW
5059 SAVEPPTR(PL_reginput); /* String-input pointer. */
5060 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5061 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
5062 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5063 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5064 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 5065 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 5066 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 5067 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 5068 PL_reg_start_tmp = 0;
a0ed51b3
LW
5069 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5070 PL_reg_start_tmpl = 0;
7766f137 5071 SAVEVPTR(PL_regdata);
a0ed51b3
LW
5072 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5073 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 5074 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 5075 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
5076 SAVEVPTR(PL_regcc); /* from regexec.c */
5077 SAVEVPTR(PL_curcop);
7766f137
GS
5078 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5079 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
5080 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5081 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 5082 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 5083 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 5084 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
5085 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5086 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
5087 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5088 PL_reg_oldsaved = Nullch;
5089 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5090 PL_reg_oldsavedlen = 0;
ed252734
NC
5091#ifdef PERL_COPY_ON_WRITE
5092 SAVESPTR(PL_nrs);
5093 PL_nrs = Nullsv;
5094#endif
a5db57d6
GS
5095 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5096 PL_reg_maxiter = 0;
5097 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5098 PL_reg_leftiter = 0;
5099 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5100 PL_reg_poscache = Nullch;
5101 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5102 PL_reg_poscache_size = 0;
5103 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 5104 SAVEI32(PL_regnpar); /* () count. */
e49a9654 5105 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
5106
5107 {
5108 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
a8514157 5109 U32 i;
ada6e8a9
AMS
5110 GV *mgv;
5111 REGEXP *rx;
5112 char digits[16];
5113
5114 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5115 for (i = 1; i <= rx->nparens; i++) {
d994d9a1 5116 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
5117 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5118 save_scalar(mgv);
5119 }
5120 }
5121 }
5122
54b6e2fa 5123#ifdef DEBUGGING
b81d288d 5124 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 5125#endif
a0ed51b3 5126}
51371543 5127
51371543 5128static void
acfe0abc 5129clear_re(pTHX_ void *r)
51371543
GS
5130{
5131 ReREFCNT_dec((regexp *)r);
5132}
ffbc6a93 5133