This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Round up all string length requests to malloc()/realloc() to the next
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e
AD
32#ifdef PERL_EXT_RE_BUILD
33/* need to replace pregcomp et al, so enable that */
34# ifndef PERL_IN_XSUB_RE
35# define PERL_IN_XSUB_RE
36# endif
37/* need access to debugger hooks */
cad2e5aa 38# if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
b9d5759e
AD
39# define DEBUGGING
40# endif
41#endif
42
43#ifdef PERL_IN_XSUB_RE
d06ea78c 44/* We *really* need to overwrite these symbols: */
56953603
IZ
45# define Perl_pregcomp my_regcomp
46# define Perl_regdump my_regdump
47# define Perl_regprop my_regprop
d06ea78c 48# define Perl_pregfree my_regfree
cad2e5aa
JH
49# define Perl_re_intuit_string my_re_intuit_string
50/* *These* symbols are masked to allow static link. */
d06ea78c 51# define Perl_regnext my_regnext
f0b8d043 52# define Perl_save_re_context my_save_re_context
b81d288d 53# define Perl_reginitcolors my_reginitcolors
c5be433b
GS
54
55# define PERL_NO_GET_CONTEXT
b81d288d 56#endif
56953603 57
f0fcb552 58/*SUPPRESS 112*/
a687059c 59/*
e50aee73 60 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
61 *
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
64 *
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
68 *
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
71 * from defects in it.
72 *
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
75 *
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
78 *
79 *
80 **** Alterations to Henry's code are...
81 ****
4bb101f2 82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
241d1a3b 83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
a687059c 84 ****
9ef589d8
LW
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
87
a687059c
LW
88 *
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
92 */
93#include "EXTERN.h"
864dbfa3 94#define PERL_IN_REGCOMP_C
a687059c 95#include "perl.h"
d06ea78c 96
acfe0abc 97#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
98# include "INTERN.h"
99#endif
c277df42
IZ
100
101#define REG_COMP_C
a687059c
LW
102#include "regcomp.h"
103
d4cce5f1 104#ifdef op
11343788 105#undef op
d4cce5f1 106#endif /* op */
11343788 107
fe14fcc3 108#ifdef MSDOS
7e4e8c89 109# if defined(BUGGY_MSC6)
fe14fcc3 110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 111# pragma optimize("a",off)
fe14fcc3 112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
113# pragma optimize("w",on )
114# endif /* BUGGY_MSC6 */
fe14fcc3
LW
115#endif /* MSDOS */
116
a687059c
LW
117#ifndef STATIC
118#define STATIC static
119#endif
120
830247a4 121typedef struct RExC_state_t {
e2509266 122 U32 flags; /* are we folding, multilining? */
830247a4
IZ
123 char *precomp; /* uncompiled string. */
124 regexp *rx;
fac92740 125 char *start; /* Start of input for compile */
830247a4
IZ
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 129 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 130 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
133 U32 seen;
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
136 I32 extralen;
137 I32 seen_zerolen;
138 I32 seen_evals;
1aa99e6b 139 I32 utf8;
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
fac92740 149#define RExC_start (pRExC_state->start)
830247a4
IZ
150#define RExC_end (pRExC_state->end)
151#define RExC_parse (pRExC_state->parse)
152#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 153#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 154#define RExC_emit (pRExC_state->emit)
fac92740 155#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
156#define RExC_naughty (pRExC_state->naughty)
157#define RExC_sawback (pRExC_state->sawback)
158#define RExC_seen (pRExC_state->seen)
159#define RExC_size (pRExC_state->size)
160#define RExC_npar (pRExC_state->npar)
161#define RExC_extralen (pRExC_state->extralen)
162#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 164#define RExC_utf8 (pRExC_state->utf8)
830247a4 165
a687059c
LW
166#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
a687059c 169
35c8bce7
LW
170#ifdef SPSTART
171#undef SPSTART /* dratted cpp namespace... */
172#endif
a687059c
LW
173/*
174 * Flags to be passed up and down.
175 */
a687059c 176#define WORST 0 /* Worst case. */
821b33a5 177#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
178#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179#define SPSTART 0x4 /* Starts with * or +. */
180#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 181
2c2d71f5
JH
182/* Length of a variant. */
183
184typedef struct scan_data_t {
185 I32 len_min;
186 I32 len_delta;
187 I32 pos_min;
188 I32 pos_delta;
189 SV *last_found;
190 I32 last_end; /* min value, <0 unless valid. */
191 I32 last_start_min;
192 I32 last_start_max;
193 SV **longest; /* Either &l_fixed, or &l_float. */
194 SV *longest_fixed;
195 I32 offset_fixed;
196 SV *longest_float;
197 I32 offset_float_min;
198 I32 offset_float_max;
199 I32 flags;
200 I32 whilem_c;
cb434fcc 201 I32 *last_closep;
653099ff 202 struct regnode_charclass_class *start_class;
2c2d71f5
JH
203} scan_data_t;
204
a687059c 205/*
e50aee73 206 * Forward declarations for pregcomp()'s friends.
a687059c 207 */
a0d0e21e 208
27da23d5
JH
209static const scan_data_t zero_scan_data =
210 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
c277df42
IZ
211
212#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213#define SF_BEFORE_SEOL 0x1
214#define SF_BEFORE_MEOL 0x2
215#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
217
09b7f37c
CB
218#ifdef NO_UNARY_PLUS
219# define SF_FIX_SHIFT_EOL (0+2)
220# define SF_FL_SHIFT_EOL (0+4)
221#else
222# define SF_FIX_SHIFT_EOL (+2)
223# define SF_FL_SHIFT_EOL (+4)
224#endif
c277df42
IZ
225
226#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228
229#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231#define SF_IS_INF 0x40
232#define SF_HAS_PAR 0x80
233#define SF_IN_PAR 0x100
234#define SF_HAS_EVAL 0x200
4bfe0158 235#define SCF_DO_SUBSTR 0x400
653099ff
GS
236#define SCF_DO_STCLASS_AND 0x0800
237#define SCF_DO_STCLASS_OR 0x1000
238#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 239#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 240
eb160463 241#define UTF (RExC_utf8 != 0)
e2509266
JH
242#define LOC ((RExC_flags & PMf_LOCALE) != 0)
243#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 244
ffc61ed2 245#define OOB_UNICODE 12345678
93733859 246#define OOB_NAMEDCLASS -1
b8c5462f 247
a0ed51b3
LW
248#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
250
8615cb43 251
b45f050a
JF
252/* length of regex to show in messages that don't mark a position within */
253#define RegexLengthToShowInErrorMessages 127
254
255/*
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
259 */
7253e4e3
RK
260#define MARKER1 "<-- HERE" /* marker as it appears in the description */
261#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 262
7253e4e3 263#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
264
265/*
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
268 * "...".
269 */
ccb2c380 270#define FAIL(msg) STMT_START { \
bfed75c6 271 const char *ellipses = ""; \
ccb2c380
MP
272 IV len = RExC_end - RExC_precomp; \
273 \
274 if (!SIZE_ONLY) \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
279 ellipses = "..."; \
280 } \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
283} STMT_END
8615cb43 284
b45f050a
JF
285/*
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
288 * "...".
289 */
ccb2c380 290#define FAIL2(pat,msg) STMT_START { \
bfed75c6 291 const char *ellipses = ""; \
ccb2c380
MP
292 IV len = RExC_end - RExC_precomp; \
293 \
294 if (!SIZE_ONLY) \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
299 ellipses = "..."; \
300 } \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
303} STMT_END
b45f050a
JF
304
305
306/*
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 */
ccb2c380
MP
309#define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
313} STMT_END
b45f050a
JF
314
315/*
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 */
ccb2c380
MP
318#define vFAIL(m) STMT_START { \
319 if (!SIZE_ONLY) \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
321 Simple_vFAIL(m); \
322} STMT_END
b45f050a
JF
323
324/*
325 * Like Simple_vFAIL(), but accepts two arguments.
326 */
ccb2c380
MP
327#define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
331} STMT_END
b45f050a
JF
332
333/*
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 */
ccb2c380
MP
336#define vFAIL2(m,a1) STMT_START { \
337 if (!SIZE_ONLY) \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
340} STMT_END
b45f050a
JF
341
342
343/*
344 * Like Simple_vFAIL(), but accepts three arguments.
345 */
ccb2c380
MP
346#define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
350} STMT_END
b45f050a
JF
351
352/*
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 */
ccb2c380
MP
355#define vFAIL3(m,a1,a2) STMT_START { \
356 if (!SIZE_ONLY) \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
359} STMT_END
b45f050a
JF
360
361/*
362 * Like Simple_vFAIL(), but accepts four arguments.
363 */
ccb2c380
MP
364#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
368} STMT_END
b45f050a
JF
369
370/*
371 * Like Simple_vFAIL(), but accepts five arguments.
372 */
ccb2c380
MP
373#define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
377} STMT_END
378
379
380#define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384} STMT_END
385
386#define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
392
393
394#define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
398} STMT_END
399
400#define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
404} STMT_END
405
406#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
411
412#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
416} STMT_END
9d1d55b5 417
8615cb43 418
cd439c50 419/* Allow for side effects in s */
ccb2c380
MP
420#define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
422} STMT_END
cd439c50 423
fac92740
MJD
424/* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
428 */
429
430#define MJD_OFFSET_DEBUG(x)
a3621e74 431/* #define MJD_OFFSET_DEBUG(x) DEBUG_r(Perl_warn_nocontext x) */
ccb2c380
MP
432
433
434#define Set_Node_Offset_To_R(node,byte) STMT_START { \
435 if (! SIZE_ONLY) { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
438 if((node) < 0) { \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
440 } else { \
441 RExC_offsets[2*(node)-1] = (byte); \
442 } \
443 } \
444} STMT_END
445
446#define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
449
450#define Set_Node_Length_To_R(node,len) STMT_START { \
451 if (! SIZE_ONLY) { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
454 if((node) < 0) { \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
456 } else { \
457 RExC_offsets[2*(node)] = (len); \
458 } \
459 } \
460} STMT_END
461
462#define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465#define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
467
468/* Get offsets and lengths */
469#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
471
acfe0abc 472static void clear_re(pTHX_ void *r);
4327152a 473
653099ff
GS
474/* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
477
4327152a 478STATIC void
830247a4 479S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
c277df42 480{
e1ec3a88
AL
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
b81d288d 483
c277df42 484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 485 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
b81d288d 489 data->flags
c277df42
IZ
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
491 else
492 data->flags &= ~SF_FIX_BEFORE_EOL;
a0ed51b3
LW
493 }
494 else {
c277df42 495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
496 data->offset_float_max = (l
497 ? data->last_start_max
c277df42 498 : data->pos_min + data->pos_delta);
9051bda5
HS
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
c277df42 501 if (data->flags & SF_BEFORE_EOL)
b81d288d 502 data->flags
c277df42
IZ
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
504 else
505 data->flags &= ~SF_FL_BEFORE_EOL;
506 }
507 }
508 SvCUR_set(data->last_found, 0);
0eda9292
JH
509 {
510 SV * sv = data->last_found;
511 MAGIC *mg =
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
514 mg->mg_len = 0;
515 }
c277df42
IZ
516 data->last_end = -1;
517 data->flags &= ~SF_BEFORE_EOL;
518}
519
653099ff
GS
520/* Can match anything (initialization) */
521STATIC void
830247a4 522S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 523{
653099ff 524 ANYOF_CLASS_ZERO(cl);
f8bef550 525 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
527 if (LOC)
528 cl->flags |= ANYOF_LOCALE;
529}
530
531/* Can match anything (initialization) */
532STATIC int
533S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
534{
535 int value;
536
aaa51d5e 537 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
539 return 1;
1aa99e6b
IH
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
541 return 0;
f8bef550
NC
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
543 return 0;
653099ff
GS
544 return 1;
545}
546
547/* Can match anything (initialization) */
548STATIC void
830247a4 549S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 550{
8ecf7187 551 Zero(cl, 1, struct regnode_charclass_class);
653099ff 552 cl->type = ANYOF;
830247a4 553 cl_anything(pRExC_state, cl);
653099ff
GS
554}
555
556STATIC void
830247a4 557S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 558{
8ecf7187 559 Zero(cl, 1, struct regnode_charclass_class);
653099ff 560 cl->type = ANYOF;
830247a4 561 cl_anything(pRExC_state, cl);
653099ff
GS
562 if (LOC)
563 cl->flags |= ANYOF_LOCALE;
564}
565
566/* 'And' a given class with another one. Can create false positives */
567/* We assume that cl is not inverted */
568STATIC void
569S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
571{
653099ff
GS
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
577 int i;
578
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
582 else
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
1aa99e6b 588
14ebb1a2
JH
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
594 }
14ebb1a2
JH
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 597 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 600 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
601}
602
603/* 'OR' a given class with another one. Can create false positives */
604/* We assume that cl is not inverted */
605STATIC void
830247a4 606S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
653099ff 607{
653099ff
GS
608 if (or_with->flags & ANYOF_INVERT) {
609 /* We do not use
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
617 */
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
621 int i;
622
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
626 else {
830247a4 627 cl_anything(pRExC_state, cl);
653099ff
GS
628 }
629 } else {
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 632 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
633 || (cl->flags & ANYOF_FOLD)) ) {
634 int i;
635
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
643 }
644 }
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 646 cl_anything(pRExC_state, cl);
653099ff
GS
647 }
648 }
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
651
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
656 }
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
660 }
653099ff
GS
661}
662
5d1c421c 663/*
a3621e74
YO
664
665 make_trie(startbranch,first,last,tail,flags)
666 startbranch: the first branch in the whole branch sequence
667 first : start branch of sequence of branch-exact nodes.
668 May be the same as startbranch
669 last : Thing following the last branch.
670 May be the same as tail.
671 tail : item following the branch sequence
672 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
673
674Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
675
676A trie is an N'ary tree where the branches are determined by digital
677decomposition of the key. IE, at the root node you look up the 1st character and
678follow that branch repeat until you find the end of the branches. Nodes can be
679marked as "accepting" meaning they represent a complete word. Eg:
680
681 /he|she|his|hers/
682
683would convert into the following structure. Numbers represent states, letters
684following numbers represent valid transitions on the letter from that state, if
685the number is in square brackets it represents an accepting state, otherwise it
686will be in parenthesis.
687
688 +-h->+-e->[3]-+-r->(8)-+-s->[9]
689 | |
690 | (2)
691 | |
692 (1) +-i->(6)-+-s->[7]
693 |
694 +-s->(3)-+-h->(4)-+-e->[5]
695
696 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
697
698This shows that when matching against the string 'hers' we will begin at state 1
699read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
700then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
701is also accepting. Thus we know that we can match both 'he' and 'hers' with a
702single traverse. We store a mapping from accepting to state to which word was
703matched, and then when we have multiple possibilities we try to complete the
704rest of the regex in the order in which they occured in the alternation.
705
706The only prior NFA like behaviour that would be changed by the TRIE support is
707the silent ignoring of duplicate alternations which are of the form:
708
709 / (DUPE|DUPE) X? (?{ ... }) Y /x
710
711Thus EVAL blocks follwing a trie may be called a different number of times with
712and without the optimisation. With the optimisations dupes will be silently
713ignored. This inconsistant behaviour of EVAL type nodes is well established as
714the following demonstrates:
715
716 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
717
718which prints out 'word' three times, but
719
720 'words'=~/(word|word|word)(?{ print $1 })S/
721
722which doesnt print it out at all. This is due to other optimisations kicking in.
723
724Example of what happens on a structural level:
725
726The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
727
728 1: CURLYM[1] {1,32767}(18)
729 5: BRANCH(8)
730 6: EXACT <ac>(16)
731 8: BRANCH(11)
732 9: EXACT <ad>(16)
733 11: BRANCH(14)
734 12: EXACT <ab>(16)
735 16: SUCCEED(0)
736 17: NOTHING(18)
737 18: END(0)
738
739This would be optimizable with startbranch=5, first=5, last=16, tail=16
740and should turn into:
741
742 1: CURLYM[1] {1,32767}(18)
743 5: TRIE(16)
744 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
745 <ac>
746 <ad>
747 <ab>
748 16: SUCCEED(0)
749 17: NOTHING(18)
750 18: END(0)
751
752Cases where tail != last would be like /(?foo|bar)baz/:
753
754 1: BRANCH(4)
755 2: EXACT <foo>(8)
756 4: BRANCH(7)
757 5: EXACT <bar>(8)
758 7: TAIL(8)
759 8: EXACT <baz>(10)
760 10: END(0)
761
762which would be optimizable with startbranch=1, first=1, last=7, tail=8
763and would end up looking like:
764
765 1: TRIE(8)
766 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
767 <foo>
768 <bar>
769 7: TAIL(8)
770 8: EXACT <baz>(10)
771 10: END(0)
772
773*/
774
775#define TRIE_DEBUG_CHAR \
776 DEBUG_TRIE_COMPILE_r({ \
777 SV *tmp; \
778 if ( UTF ) { \
779 tmp = newSVpv( "", 0 ); \
780 pv_uni_display( tmp, uc, len, 60, UNI_DISPLAY_REGEX ); \
781 } else { \
e4584336 782 tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
a3621e74
YO
783 } \
784 av_push( trie->revcharmap, tmp ); \
785 })
786
787#define TRIE_READ_CHAR STMT_START { \
788 if ( UTF ) { \
789 if ( folder ) { \
790 if ( foldlen > 0 ) { \
791 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
792 foldlen -= len; \
793 scan += len; \
794 len = 0; \
795 } else { \
e1ec3a88 796 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
797 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
798 foldlen -= UNISKIP( uvc ); \
799 scan = foldbuf + UNISKIP( uvc ); \
800 } \
801 } else { \
e1ec3a88 802 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
a3621e74
YO
803 } \
804 } else { \
805 uvc = (U32)*uc; \
806 len = 1; \
807 } \
808} STMT_END
809
810
811#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
812#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
813#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
814#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
815
816#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
817 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
818 TRIE_LIST_LEN( state ) *= 2; \
819 Renew( trie->states[ state ].trans.list, \
820 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
821 } \
822 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
823 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
824 TRIE_LIST_CUR( state )++; \
825} STMT_END
826
827#define TRIE_LIST_NEW(state) STMT_START { \
828 Newz( 1023, trie->states[ state ].trans.list, \
829 4, reg_trie_trans_le ); \
830 TRIE_LIST_CUR( state ) = 1; \
831 TRIE_LIST_LEN( state ) = 4; \
832} STMT_END
833
834STATIC I32
835S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 flags)
836{
27da23d5 837 dVAR;
a3621e74
YO
838 /* first pass, loop through and scan words */
839 reg_trie_data *trie;
840 regnode *cur;
e1ec3a88 841 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
a3621e74
YO
842 STRLEN len = 0;
843 UV uvc = 0;
844 U16 curword = 0;
845 U32 next_alloc = 0;
846 /* we just use folder as a flag in utf8 */
e1ec3a88 847 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
848 ? PL_fold
849 : ( flags == EXACTFL
850 ? PL_fold_locale
851 : NULL
852 )
853 );
854
e1ec3a88 855 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74
YO
856 SV *re_trie_maxbuff;
857
858 GET_RE_DEBUG_FLAGS_DECL;
859
860 Newz( 848200, trie, 1, reg_trie_data );
861 trie->refcount = 1;
862 RExC_rx->data->data[ data_slot ] = (void*)trie;
863 Newz( 848201, trie->charmap, 256, U16 );
864 DEBUG_r({
865 trie->words = newAV();
866 trie->revcharmap = newAV();
867 });
868
869
0111c4fd 870 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 871 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 872 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74
YO
873 }
874
875 /* -- First loop and Setup --
876
877 We first traverse the branches and scan each word to determine if it
878 contains widechars, and how many unique chars there are, this is
879 important as we have to build a table with at least as many columns as we
880 have unique chars.
881
882 We use an array of integers to represent the character codes 0..255
883 (trie->charmap) and we use a an HV* to store unicode characters. We use the
884 native representation of the character value as the key and IV's for the
885 coded index.
886
887 *TODO* If we keep track of how many times each character is used we can
888 remap the columns so that the table compression later on is more
889 efficient in terms of memory by ensuring most common value is in the
890 middle and the least common are on the outside. IMO this would be better
891 than a most to least common mapping as theres a decent chance the most
892 common letter will share a node with the least common, meaning the node
893 will not be compressable. With a middle is most common approach the worst
894 case is when we have the least common nodes twice.
895
896 */
897
898
899 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
900 regnode *noper = NEXTOPER( cur );
e1ec3a88
AL
901 const U8 *uc = (U8*)STRING( noper );
902 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
903 STRLEN foldlen = 0;
904 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 905 const U8 *scan = (U8*)NULL;
a3621e74
YO
906
907 for ( ; uc < e ; uc += len ) {
908 trie->charcount++;
909 TRIE_READ_CHAR;
910 if ( uvc < 256 ) {
911 if ( !trie->charmap[ uvc ] ) {
912 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
913 if ( folder )
914 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
915 TRIE_DEBUG_CHAR;
916 }
917 } else {
918 SV** svpp;
919 if ( !trie->widecharmap )
920 trie->widecharmap = newHV();
921
922 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
923
924 if ( !svpp )
e4584336 925 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
926
927 if ( !SvTRUE( *svpp ) ) {
928 sv_setiv( *svpp, ++trie->uniquecharcount );
929 TRIE_DEBUG_CHAR;
930 }
931 }
932 }
933 trie->wordcount++;
934 } /* end first pass */
935 DEBUG_TRIE_COMPILE_r(
936 PerlIO_printf( Perl_debug_log, "TRIE(%s): W:%d C:%d Uq:%d \n",
937 ( trie->widecharmap ? "UTF8" : "NATIVE" ), trie->wordcount,
5d7488b2 938 (int)trie->charcount, trie->uniquecharcount )
a3621e74
YO
939 );
940
941
942 /*
943 We now know what we are dealing with in terms of unique chars and
944 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
945 representation using a flat table will take. If it's over a reasonable
946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
947 conservative but potentially much slower representation using an array
948 of lists.
949
950 At the end we convert both representations into the same compressed
951 form that will be used in regexec.c for matching with. The latter
952 is a form that cannot be used to construct with but has memory
953 properties similar to the list form and access properties similar
954 to the table form making it both suitable for fast searches and
955 small enough that its feasable to store for the duration of a program.
956
957 See the comment in the code where the compressed table is produced
958 inplace from the flat tabe representation for an explanation of how
959 the compression works.
960
961 */
962
963
964 if ( (IV)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
965 /*
966 Second Pass -- Array Of Lists Representation
967
968 Each state will be represented by a list of charid:state records
969 (reg_trie_trans_le) the first such element holds the CUR and LEN
970 points of the allocated array. (See defines above).
971
972 We build the initial structure using the lists, and then convert
973 it into the compressed table form which allows faster lookups
974 (but cant be modified once converted).
975
976
977 */
978
979
980 STRLEN transcount = 1;
981
982 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
983 TRIE_LIST_NEW(1);
984 next_alloc = 2;
985
986 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
987
988 regnode *noper = NEXTOPER( cur );
989 U8 *uc = (U8*)STRING( noper );
990 U8 *e = uc + STR_LEN( noper );
991 U32 state = 1; /* required init */
992 U16 charid = 0; /* sanity init */
993 U8 *scan = (U8*)NULL; /* sanity init */
994 STRLEN foldlen = 0; /* required init */
995 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
996
997
998 for ( ; uc < e ; uc += len ) {
999
1000 TRIE_READ_CHAR;
1001
1002 if ( uvc < 256 ) {
1003 charid = trie->charmap[ uvc ];
1004 } else {
1005 SV** svpp=(SV**)NULL;
1006 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1007 if ( !svpp ) {
1008 charid = 0;
1009 } else {
1010 charid=(U16)SvIV( *svpp );
1011 }
1012 }
1013 if ( charid ) {
1014
1015 U16 check;
1016 U32 newstate = 0;
1017
1018 charid--;
1019 if ( !trie->states[ state ].trans.list ) {
1020 TRIE_LIST_NEW( state );
1021 }
1022 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1023 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1024 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1025 break;
1026 }
1027 }
1028 if ( ! newstate ) {
1029 newstate = next_alloc++;
1030 TRIE_LIST_PUSH( state, charid, newstate );
1031 transcount++;
1032 }
1033 state = newstate;
1034
1035 } else {
e4584336 1036 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1037 }
1038 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1039 }
1040
1041 if ( !trie->states[ state ].wordnum ) {
1042 /* we havent inserted this word into the structure yet. */
1043 trie->states[ state ].wordnum = ++curword;
1044
1045 DEBUG_r({
1046 /* store the word for dumping */
1047 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1048 if ( UTF ) SvUTF8_on( tmp );
1049 av_push( trie->words, tmp );
1050 });
1051
1052 } else {
1053 /* Its a dupe. So ignore it. */
1054 }
1055
1056 } /* end second pass */
1057
1058 trie->laststate = next_alloc;
1059 Renew( trie->states, next_alloc, reg_trie_state );
1060
1061 DEBUG_TRIE_COMPILE_MORE_r({
1062 U32 state;
1063 U16 charid;
1064
1065 /*
1066 print out the table precompression.
1067 */
1068
1069 PerlIO_printf( Perl_debug_log, "\nState :Word | Transition Data\n" );
1070 PerlIO_printf( Perl_debug_log, "------:-----+-----------------" );
1071
1072 for( state=1 ; state < next_alloc ; state ++ ) {
1073
e4584336 1074 PerlIO_printf( Perl_debug_log, "\n %04"UVXf" :", (UV)state );
a3621e74
YO
1075 if ( ! trie->states[ state ].wordnum ) {
1076 PerlIO_printf( Perl_debug_log, "%5s| ","");
1077 } else {
e4584336 1078 PerlIO_printf( Perl_debug_log, "W%04x| ",
a3621e74
YO
1079 trie->states[ state ].wordnum
1080 );
1081 }
1082 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1083 SV **tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
e4584336 1084 PerlIO_printf( Perl_debug_log, "%s:%3X=%04"UVXf" | ",
a3621e74
YO
1085 SvPV_nolen( *tmp ),
1086 TRIE_LIST_ITEM(state,charid).forid,
e4584336 1087 (UV)TRIE_LIST_ITEM(state,charid).newstate
a3621e74
YO
1088 );
1089 }
1090
1091 }
1092 PerlIO_printf( Perl_debug_log, "\n\n" );
1093 });
1094
1095 Newz( 848203, trie->trans, transcount ,reg_trie_trans );
1096 {
1097 U32 state;
1098 U16 idx;
1099 U32 tp = 0;
1100 U32 zp = 0;
1101
1102
1103 for( state=1 ; state < next_alloc ; state ++ ) {
1104 U32 base=0;
1105
1106 /*
1107 DEBUG_TRIE_COMPILE_MORE_r(
1108 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1109 );
1110 */
1111
1112 if (trie->states[state].trans.list) {
1113 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1114 U16 maxid=minid;
1115
1116
1117 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1118 if ( TRIE_LIST_ITEM( state, idx).forid < minid ) {
1119 minid=TRIE_LIST_ITEM( state, idx).forid;
1120 } else if ( TRIE_LIST_ITEM( state, idx).forid > maxid ) {
1121 maxid=TRIE_LIST_ITEM( state, idx).forid;
1122 }
1123 }
1124 if ( transcount < tp + maxid - minid + 1) {
1125 transcount *= 2;
1126 Renew( trie->trans, transcount, reg_trie_trans );
1127 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1128 }
1129 base = trie->uniquecharcount + tp - minid;
1130 if ( maxid == minid ) {
1131 U32 set = 0;
1132 for ( ; zp < tp ; zp++ ) {
1133 if ( ! trie->trans[ zp ].next ) {
1134 base = trie->uniquecharcount + zp - minid;
1135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1136 trie->trans[ zp ].check = state;
1137 set = 1;
1138 break;
1139 }
1140 }
1141 if ( !set ) {
1142 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1143 trie->trans[ tp ].check = state;
1144 tp++;
1145 zp = tp;
1146 }
1147 } else {
1148 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1149 U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1150 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1151 trie->trans[ tid ].check = state;
1152 }
1153 tp += ( maxid - minid + 1 );
1154 }
1155 Safefree(trie->states[ state ].trans.list);
1156 }
1157 /*
1158 DEBUG_TRIE_COMPILE_MORE_r(
1159 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1160 );
1161 */
1162 trie->states[ state ].trans.base=base;
1163 }
cc601c31 1164 trie->lasttrans = tp + 1;
a3621e74
YO
1165 }
1166 } else {
1167 /*
1168 Second Pass -- Flat Table Representation.
1169
1170 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1171 We know that we will need Charcount+1 trans at most to store the data
1172 (one row per char at worst case) So we preallocate both structures
1173 assuming worst case.
1174
1175 We then construct the trie using only the .next slots of the entry
1176 structs.
1177
1178 We use the .check field of the first entry of the node temporarily to
1179 make compression both faster and easier by keeping track of how many non
1180 zero fields are in the node.
1181
1182 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1183 transition.
1184
1185 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1186 number representing the first entry of the node, and state as a
1187 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1188 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1189 are 2 entrys per node. eg:
1190
1191 A B A B
1192 1. 2 4 1. 3 7
1193 2. 0 3 3. 0 5
1194 3. 0 0 5. 0 0
1195 4. 0 0 7. 0 0
1196
1197 The table is internally in the right hand, idx form. However as we also
1198 have to deal with the states array which is indexed by nodenum we have to
1199 use TRIE_NODENUM() to convert.
1200
1201 */
1202
1203 Newz( 848203, trie->trans, ( trie->charcount + 1 ) * trie->uniquecharcount + 1,
1204 reg_trie_trans );
1205 Newz( 848204, trie->states, trie->charcount + 2, reg_trie_state );
1206 next_alloc = trie->uniquecharcount + 1;
1207
1208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1209
1210 regnode *noper = NEXTOPER( cur );
1211 U8 *uc = (U8*)STRING( noper );
1212 U8 *e = uc + STR_LEN( noper );
1213
1214 U32 state = 1; /* required init */
1215
1216 U16 charid = 0; /* sanity init */
1217 U32 accept_state = 0; /* sanity init */
1218 U8 *scan = (U8*)NULL; /* sanity init */
1219
1220 STRLEN foldlen = 0; /* required init */
1221 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1222
1223
1224 for ( ; uc < e ; uc += len ) {
1225
1226 TRIE_READ_CHAR;
1227
1228 if ( uvc < 256 ) {
1229 charid = trie->charmap[ uvc ];
1230 } else {
1231 SV** svpp=(SV**)NULL;
1232 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1233 if ( !svpp ) {
1234 charid = 0;
1235 } else {
1236 charid=(U16)SvIV( *svpp );
1237 }
1238 }
1239 if ( charid ) {
1240 charid--;
1241 if ( !trie->trans[ state + charid ].next ) {
1242 trie->trans[ state + charid ].next = next_alloc;
1243 trie->trans[ state ].check++;
1244 next_alloc += trie->uniquecharcount;
1245 }
1246 state = trie->trans[ state + charid ].next;
1247 } else {
e4584336 1248 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
a3621e74
YO
1249 }
1250 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1251 }
1252
1253 accept_state = TRIE_NODENUM( state );
1254 if ( !trie->states[ accept_state ].wordnum ) {
1255 /* we havent inserted this word into the structure yet. */
1256 trie->states[ accept_state ].wordnum = ++curword;
1257
1258 DEBUG_r({
1259 /* store the word for dumping */
1260 SV* tmp = newSVpvn( STRING( noper ), STR_LEN( noper ) );
1261 if ( UTF ) SvUTF8_on( tmp );
1262 av_push( trie->words, tmp );
1263 });
1264
1265 } else {
1266 /* Its a dupe. So ignore it. */
1267 }
1268
1269 } /* end second pass */
1270
1271 DEBUG_TRIE_COMPILE_MORE_r({
1272 /*
1273 print out the table precompression so that we can do a visual check
1274 that they are identical.
1275 */
1276 U32 state;
1277 U16 charid;
1278 PerlIO_printf( Perl_debug_log, "\nChar : " );
1279
1280 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1281 SV **tmp = av_fetch( trie->revcharmap, charid, 0);
1282 if ( tmp ) {
1283 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1284 }
1285 }
1286
1287 PerlIO_printf( Perl_debug_log, "\nState+-" );
1288
1289 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1290 PerlIO_printf( Perl_debug_log, "%4s-", "----" );
1291 }
1292
1293 PerlIO_printf( Perl_debug_log, "\n" );
1294
1295 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1296
e4584336 1297 PerlIO_printf( Perl_debug_log, "%04"UVXf" : ", (UV)TRIE_NODENUM( state ) );
a3621e74
YO
1298
1299 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
e4584336
RB
1300 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1301 (UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ) );
a3621e74
YO
1302 }
1303 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
e4584336 1304 PerlIO_printf( Perl_debug_log, " (%04"UVXf")\n", (UV)trie->trans[ state ].check );
a3621e74 1305 } else {
e4584336 1306 PerlIO_printf( Perl_debug_log, " (%04"UVXf") W%04X\n", (UV)trie->trans[ state ].check,
a3621e74
YO
1307 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1308 }
1309 }
1310 PerlIO_printf( Perl_debug_log, "\n\n" );
1311 });
1312 {
1313 /*
1314 * Inplace compress the table.*
1315
1316 For sparse data sets the table constructed by the trie algorithm will
1317 be mostly 0/FAIL transitions or to put it another way mostly empty.
1318 (Note that leaf nodes will not contain any transitions.)
1319
1320 This algorithm compresses the tables by eliminating most such
1321 transitions, at the cost of a modest bit of extra work during lookup:
1322
1323 - Each states[] entry contains a .base field which indicates the
1324 index in the state[] array wheres its transition data is stored.
1325
1326 - If .base is 0 there are no valid transitions from that node.
1327
1328 - If .base is nonzero then charid is added to it to find an entry in
1329 the trans array.
1330
1331 -If trans[states[state].base+charid].check!=state then the
1332 transition is taken to be a 0/Fail transition. Thus if there are fail
1333 transitions at the front of the node then the .base offset will point
1334 somewhere inside the previous nodes data (or maybe even into a node
1335 even earlier), but the .check field determines if the transition is
1336 valid.
1337
1338 The following process inplace converts the table to the compressed
1339 table: We first do not compress the root node 1,and mark its all its
1340 .check pointers as 1 and set its .base pointer as 1 as well. This
1341 allows to do a DFA construction from the compressed table later, and
1342 ensures that any .base pointers we calculate later are greater than
1343 0.
1344
1345 - We set 'pos' to indicate the first entry of the second node.
1346
1347 - We then iterate over the columns of the node, finding the first and
1348 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1349 and set the .check pointers accordingly, and advance pos
1350 appropriately and repreat for the next node. Note that when we copy
1351 the next pointers we have to convert them from the original
1352 NODEIDX form to NODENUM form as the former is not valid post
1353 compression.
1354
1355 - If a node has no transitions used we mark its base as 0 and do not
1356 advance the pos pointer.
1357
1358 - If a node only has one transition we use a second pointer into the
1359 structure to fill in allocated fail transitions from other states.
1360 This pointer is independent of the main pointer and scans forward
1361 looking for null transitions that are allocated to a state. When it
1362 finds one it writes the single transition into the "hole". If the
1363 pointer doesnt find one the single transition is appeneded as normal.
1364
1365 - Once compressed we can Renew/realloc the structures to release the
1366 excess space.
1367
1368 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1369 specifically Fig 3.47 and the associated pseudocode.
1370
1371 demq
1372 */
a3b680e6 1373 const U32 laststate = TRIE_NODENUM( next_alloc );
a3621e74
YO
1374 U32 used , state, charid;
1375 U32 pos = 0, zp=0;
1376 trie->laststate = laststate;
1377
1378 for ( state = 1 ; state < laststate ; state++ ) {
1379 U8 flag = 0;
1380 U32 stateidx = TRIE_NODEIDX( state );
1381 U32 o_used=trie->trans[ stateidx ].check;
1382 used = trie->trans[ stateidx ].check;
1383 trie->trans[ stateidx ].check = 0;
1384
1385 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1386 if ( flag || trie->trans[ stateidx + charid ].next ) {
1387 if ( trie->trans[ stateidx + charid ].next ) {
1388 if (o_used == 1) {
1389 for ( ; zp < pos ; zp++ ) {
1390 if ( ! trie->trans[ zp ].next ) {
1391 break;
1392 }
1393 }
1394 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1395 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1396 trie->trans[ zp ].check = state;
1397 if ( ++zp > pos ) pos = zp;
1398 break;
1399 }
1400 used--;
1401 }
1402 if ( !flag ) {
1403 flag = 1;
1404 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1405 }
1406 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1407 trie->trans[ pos ].check = state;
1408 pos++;
1409 }
1410 }
1411 }
cc601c31 1412 trie->lasttrans = pos + 1;
a3621e74
YO
1413 Renew( trie->states, laststate + 1, reg_trie_state);
1414 DEBUG_TRIE_COMPILE_MORE_r(
e4584336
RB
1415 PerlIO_printf( Perl_debug_log,
1416 " Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
5d7488b2
AL
1417 (int)( ( trie->charcount + 1 ) * trie->uniquecharcount + 1 ),
1418 (IV)next_alloc,
1419 (IV)pos,
a3621e74
YO
1420 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1421 );
1422
1423 } /* end table compress */
1424 }
cc601c31
YO
1425 /* resize the trans array to remove unused space */
1426 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74
YO
1427
1428 DEBUG_TRIE_COMPILE_r({
1429 U32 state;
1430 /*
1431 Now we print it out again, in a slightly different form as there is additional
1432 info we want to be able to see when its compressed. They are close enough for
1433 visual comparison though.
1434 */
1435 PerlIO_printf( Perl_debug_log, "\nChar : %-6s%-6s%-4s ","Match","Base","Ofs" );
1436
1437 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1438 SV **tmp = av_fetch( trie->revcharmap, state, 0);
1439 if ( tmp ) {
1440 PerlIO_printf( Perl_debug_log, "%4.4s ", SvPV_nolen( *tmp ) );
1441 }
1442 }
1443 PerlIO_printf( Perl_debug_log, "\n-----:-----------------------");
cc601c31 1444
a3621e74
YO
1445 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1446 PerlIO_printf( Perl_debug_log, "-----");
1447 PerlIO_printf( Perl_debug_log, "\n");
cc601c31 1448
a3621e74
YO
1449 for( state = 1 ; state < trie->laststate ; state++ ) {
1450 U32 base = trie->states[ state ].trans.base;
1451
e4584336 1452 PerlIO_printf( Perl_debug_log, "#%04"UVXf" ", (UV)state);
a3621e74
YO
1453
1454 if ( trie->states[ state ].wordnum ) {
1455 PerlIO_printf( Perl_debug_log, " W%04X", trie->states[ state ].wordnum );
1456 } else {
1457 PerlIO_printf( Perl_debug_log, "%6s", "" );
1458 }
1459
e4584336 1460 PerlIO_printf( Perl_debug_log, " @%04"UVXf" ", (UV)base );
a3621e74
YO
1461
1462 if ( base ) {
1463 U32 ofs = 0;
1464
cc601c31
YO
1465 while( ( base + ofs < trie->uniquecharcount ) ||
1466 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1467 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
a3621e74
YO
1468 ofs++;
1469
e4584336 1470 PerlIO_printf( Perl_debug_log, "+%02"UVXf"[ ", (UV)ofs);
a3621e74
YO
1471
1472 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1473 if ( ( base + ofs >= trie->uniquecharcount ) &&
1474 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1475 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1476 {
e4584336
RB
1477 PerlIO_printf( Perl_debug_log, "%04"UVXf" ",
1478 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
a3621e74
YO
1479 } else {
1480 PerlIO_printf( Perl_debug_log, "%4s "," 0" );
1481 }
1482 }
1483
e4584336 1484 PerlIO_printf( Perl_debug_log, "]");
a3621e74
YO
1485
1486 }
1487 PerlIO_printf( Perl_debug_log, "\n" );
1488 }
1489 });
1490
1491 {
1492 /* now finally we "stitch in" the new TRIE node
1493 This means we convert either the first branch or the first Exact,
1494 depending on whether the thing following (in 'last') is a branch
1495 or not and whther first is the startbranch (ie is it a sub part of
1496 the alternation or is it the whole thing.)
1497 Assuming its a sub part we conver the EXACT otherwise we convert
1498 the whole branch sequence, including the first.
1499 */
1500 regnode *convert;
1501
1502
1503
1504
1505 if ( first == startbranch && OP( last ) != BRANCH ) {
1506 convert = first;
1507 } else {
1508 convert = NEXTOPER( first );
1509 NEXT_OFF( first ) = (U16)(last - first);
1510 }
1511
1512 OP( convert ) = TRIE + (U8)( flags - EXACT );
1513 NEXT_OFF( convert ) = (U16)(tail - convert);
1514 ARG_SET( convert, data_slot );
1515
1516 /* tells us if we need to handle accept buffers specially */
1517 convert->flags = ( RExC_seen_evals ? 1 : 0 );
1518
1519
1520 /* needed for dumping*/
1521 DEBUG_r({
1522 regnode *optimize = convert + NODE_STEP_REGNODE + regarglen[ TRIE ];
1523 /* We now need to mark all of the space originally used by the
1524 branches as optimized away. This keeps the dumpuntil from
1525 throwing a wobbly as it doesnt use regnext() to traverse the
1526 opcodes.
1527 */
1528 while( optimize < last ) {
1529 OP( optimize ) = OPTIMIZED;
1530 optimize++;
1531 }
1532 });
1533 } /* end node insert */
1534 return 1;
1535}
1536
1537
1538
1539/*
5d1c421c
JH
1540 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
1541 * These need to be revisited when a newer toolchain becomes available.
1542 */
1543#if defined(__sparc64__) && defined(__GNUC__)
1544# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
1545# undef SPARC64_GCC_WORKAROUND
1546# define SPARC64_GCC_WORKAROUND 1
1547# endif
1548#endif
1549
653099ff
GS
1550/* REx optimizer. Converts nodes into quickier variants "in place".
1551 Finds fixed substrings. */
1552
c277df42
IZ
1553/* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
1554 to the position after last scanned or to NULL. */
1555
a3621e74 1556
76e3520e 1557STATIC I32
a3621e74 1558S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
1559 /* scanp: Start here (read-write). */
1560 /* deltap: Write maxlen-minlen here. */
1561 /* last: Stop before this one. */
1562{
1563 I32 min = 0, pars = 0, code;
1564 regnode *scan = *scanp, *next;
1565 I32 delta = 0;
1566 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 1567 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
1568 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
1569 scan_data_t data_fake;
653099ff 1570 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74
YO
1571 SV *re_trie_maxbuff = NULL;
1572
1573 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1574
c277df42
IZ
1575 while (scan && OP(scan) != END && scan < last) {
1576 /* Peephole optimizer: */
a3621e74
YO
1577 DEBUG_OPTIMISE_r({
1578 SV *mysv=sv_newmortal();
1579 regprop( mysv, scan);
e4584336
RB
1580 PerlIO_printf(Perl_debug_log, "%*speep: %s (0x%08"UVXf")\n",
1581 (int)depth*2, "", SvPV_nolen(mysv), PTR2UV(scan));
a3621e74 1582 });
c277df42 1583
22c35a8c 1584 if (PL_regkind[(U8)OP(scan)] == EXACT) {
653099ff 1585 /* Merge several consecutive EXACTish nodes into one. */
c277df42
IZ
1586 regnode *n = regnext(scan);
1587 U32 stringok = 1;
1588#ifdef DEBUGGING
1589 regnode *stop = scan;
b81d288d 1590#endif
c277df42 1591
cd439c50 1592 next = scan + NODE_SZ_STR(scan);
c277df42
IZ
1593 /* Skip NOTHING, merge EXACT*. */
1594 while (n &&
b81d288d 1595 ( PL_regkind[(U8)OP(n)] == NOTHING ||
c277df42
IZ
1596 (stringok && (OP(n) == OP(scan))))
1597 && NEXT_OFF(n)
1598 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
1599 if (OP(n) == TAIL || n > next)
1600 stringok = 0;
22c35a8c 1601 if (PL_regkind[(U8)OP(n)] == NOTHING) {
c277df42
IZ
1602 NEXT_OFF(scan) += NEXT_OFF(n);
1603 next = n + NODE_STEP_REGNODE;
1604#ifdef DEBUGGING
1605 if (stringok)
1606 stop = n;
b81d288d 1607#endif
c277df42 1608 n = regnext(n);
a0ed51b3 1609 }
f49d4d0f 1610 else if (stringok) {
a3b680e6 1611 const int oldl = STR_LEN(scan);
c277df42 1612 regnode *nnext = regnext(n);
f49d4d0f 1613
b81d288d 1614 if (oldl + STR_LEN(n) > U8_MAX)
c277df42
IZ
1615 break;
1616 NEXT_OFF(scan) += NEXT_OFF(n);
cd439c50
IZ
1617 STR_LEN(scan) += STR_LEN(n);
1618 next = n + NODE_SZ_STR(n);
c277df42 1619 /* Now we can overwrite *n : */
f49d4d0f 1620 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
c277df42 1621#ifdef DEBUGGING
f49d4d0f 1622 stop = next - 1;
b81d288d 1623#endif
c277df42
IZ
1624 n = nnext;
1625 }
1626 }
61a36c01 1627
a3621e74 1628 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
61a36c01
JH
1629/*
1630 Two problematic code points in Unicode casefolding of EXACT nodes:
1631
1632 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1633 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1634
1635 which casefold to
1636
1637 Unicode UTF-8
1638
1639 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
1640 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
1641
1642 This means that in case-insensitive matching (or "loose matching",
1643 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
1644 length of the above casefolded versions) can match a target string
1645 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
1646 This would rather mess up the minimum length computation.
1647
1648 What we'll do is to look for the tail four bytes, and then peek
1649 at the preceding two bytes to see whether we need to decrease
1650 the minimum length by four (six minus two).
1651
1652 Thanks to the design of UTF-8, there cannot be false matches:
1653 A sequence of valid UTF-8 bytes cannot be a subsequence of
1654 another valid sequence of UTF-8 bytes.
1655
1656*/
1657 char *s0 = STRING(scan), *s, *t;
1658 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
bfed75c6
AL
1659 const char *t0 = "\xcc\x88\xcc\x81";
1660 const char *t1 = t0 + 3;
2af232bd 1661
61a36c01
JH
1662 for (s = s0 + 2;
1663 s < s2 && (t = ninstr(s, s1, t0, t1));
1664 s = t + 4) {
1665 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
1666 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
1667 min -= 4;
1668 }
1669 }
1670
c277df42
IZ
1671#ifdef DEBUGGING
1672 /* Allow dumping */
cd439c50 1673 n = scan + NODE_SZ_STR(scan);
c277df42 1674 while (n <= stop) {
22c35a8c 1675 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
c277df42
IZ
1676 OP(n) = OPTIMIZED;
1677 NEXT_OFF(n) = 0;
1678 }
1679 n++;
1680 }
653099ff 1681#endif
c277df42 1682 }
a3621e74
YO
1683
1684
1685
653099ff
GS
1686 /* Follow the next-chain of the current node and optimize
1687 away all the NOTHINGs from it. */
c277df42 1688 if (OP(scan) != CURLYX) {
a3b680e6 1689 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
1690 ? I32_MAX
1691 /* I32 may be smaller than U16 on CRAYs! */
1692 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
1693 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
1694 int noff;
1695 regnode *n = scan;
b81d288d 1696
c277df42
IZ
1697 /* Skip NOTHING and LONGJMP. */
1698 while ((n = regnext(n))
22c35a8c 1699 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
1700 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
1701 && off + noff < max)
1702 off += noff;
1703 if (reg_off_by_arg[OP(scan)])
1704 ARG(scan) = off;
b81d288d 1705 else
c277df42
IZ
1706 NEXT_OFF(scan) = off;
1707 }
a3621e74 1708
653099ff
GS
1709 /* The principal pseudo-switch. Cannot be a switch, since we
1710 look into several different things. */
b81d288d 1711 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
1712 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
1713 next = regnext(scan);
1714 code = OP(scan);
a3621e74 1715 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
1716
1717 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
c277df42 1718 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 1719 struct regnode_charclass_class accum;
a3621e74 1720 regnode *startbranch=scan;
c277df42 1721
653099ff 1722 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
830247a4 1723 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
653099ff 1724 if (flags & SCF_DO_STCLASS)
830247a4 1725 cl_init_zero(pRExC_state, &accum);
a3621e74 1726
c277df42 1727 while (OP(scan) == code) {
830247a4 1728 I32 deltanext, minnext, f = 0, fake;
653099ff 1729 struct regnode_charclass_class this_class;
c277df42
IZ
1730
1731 num++;
1732 data_fake.flags = 0;
b81d288d 1733 if (data) {
2c2d71f5 1734 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
1735 data_fake.last_closep = data->last_closep;
1736 }
1737 else
1738 data_fake.last_closep = &fake;
c277df42
IZ
1739 next = regnext(scan);
1740 scan = NEXTOPER(scan);
1741 if (code != BRANCH)
1742 scan = NEXTOPER(scan);
653099ff 1743 if (flags & SCF_DO_STCLASS) {
830247a4 1744 cl_init(pRExC_state, &this_class);
653099ff
GS
1745 data_fake.start_class = &this_class;
1746 f = SCF_DO_STCLASS_AND;
b81d288d 1747 }
e1901655
IZ
1748 if (flags & SCF_WHILEM_VISITED_POS)
1749 f |= SCF_WHILEM_VISITED_POS;
a3621e74 1750
653099ff 1751 /* we suppose the run is continuous, last=next...*/
830247a4 1752 minnext = study_chunk(pRExC_state, &scan, &deltanext,
a3621e74 1753 next, &data_fake, f,depth+1);
b81d288d 1754 if (min1 > minnext)
c277df42
IZ
1755 min1 = minnext;
1756 if (max1 < minnext + deltanext)
1757 max1 = minnext + deltanext;
1758 if (deltanext == I32_MAX)
aca2d497 1759 is_inf = is_inf_internal = 1;
c277df42
IZ
1760 scan = next;
1761 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1762 pars++;
405ff068 1763 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 1764 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
1765 if (data)
1766 data->whilem_c = data_fake.whilem_c;
653099ff 1767 if (flags & SCF_DO_STCLASS)
830247a4 1768 cl_or(pRExC_state, &accum, &this_class);
b81d288d 1769 if (code == SUSPEND)
c277df42
IZ
1770 break;
1771 }
1772 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1773 min1 = 0;
1774 if (flags & SCF_DO_SUBSTR) {
1775 data->pos_min += min1;
1776 data->pos_delta += max1 - min1;
1777 if (max1 != min1 || is_inf)
1778 data->longest = &(data->longest_float);
1779 }
1780 min += min1;
1781 delta += max1 - min1;
653099ff 1782 if (flags & SCF_DO_STCLASS_OR) {
830247a4 1783 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
1784 if (min1) {
1785 cl_and(data->start_class, &and_with);
1786 flags &= ~SCF_DO_STCLASS;
1787 }
1788 }
1789 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
1790 if (min1) {
1791 cl_and(data->start_class, &accum);
653099ff 1792 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
1793 }
1794 else {
b81d288d 1795 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
1796 * data->start_class */
1797 StructCopy(data->start_class, &and_with,
1798 struct regnode_charclass_class);
1799 flags &= ~SCF_DO_STCLASS_AND;
1800 StructCopy(&accum, data->start_class,
1801 struct regnode_charclass_class);
1802 flags |= SCF_DO_STCLASS_OR;
1803 data->start_class->flags |= ANYOF_EOS;
1804 }
653099ff 1805 }
a3621e74
YO
1806
1807 /* demq.
1808
1809 Assuming this was/is a branch we are dealing with: 'scan' now
1810 points at the item that follows the branch sequence, whatever
1811 it is. We now start at the beginning of the sequence and look
1812 for subsequences of
1813
1814 BRANCH->EXACT=>X
1815 BRANCH->EXACT=>X
1816
1817 which would be constructed from a pattern like /A|LIST|OF|WORDS/
1818
1819 If we can find such a subseqence we need to turn the first
1820 element into a trie and then add the subsequent branch exact
1821 strings to the trie.
1822
1823 We have two cases
1824
1825 1. patterns where the whole set of branch can be converted to a trie,
1826
1827 2. patterns where only a subset of the alternations can be
1828 converted to a trie.
1829
1830 In case 1 we can replace the whole set with a single regop
1831 for the trie. In case 2 we need to keep the start and end
1832 branchs so
1833
1834 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1835 becomes BRANCH TRIE; BRANCH X;
1836
1837 Hypthetically when we know the regex isnt anchored we can
1838 turn a case 1 into a DFA and let it rip... Every time it finds a match
1839 it would just call its tail, no WHILEM/CURLY needed.
1840
1841 */
0111c4fd
RGS
1842 if (DO_TRIE) {
1843 if (!re_trie_maxbuff) {
1844 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1845 if (!SvIOK(re_trie_maxbuff))
1846 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1847 }
a3621e74
YO
1848 if ( SvIV(re_trie_maxbuff)>=0 && OP( startbranch )==BRANCH ) {
1849 regnode *cur;
1850 regnode *first = (regnode *)NULL;
1851 regnode *last = (regnode *)NULL;
1852 regnode *tail = scan;
1853 U8 optype = 0;
1854 U32 count=0;
1855
1856#ifdef DEBUGGING
1857 SV *mysv = sv_newmortal(); /* for dumping */
1858#endif
1859 /* var tail is used because there may be a TAIL
1860 regop in the way. Ie, the exacts will point to the
1861 thing following the TAIL, but the last branch will
1862 point at the TAIL. So we advance tail. If we
1863 have nested (?:) we may have to move through several
1864 tails.
1865 */
1866
1867 while ( OP( tail ) == TAIL ) {
1868 /* this is the TAIL generated by (?:) */
1869 tail = regnext( tail );
1870 }
1871
1872 DEBUG_OPTIMISE_r({
1873 regprop( mysv, tail );
1874 PerlIO_printf( Perl_debug_log, "%*s%s%s%s\n",
e4584336 1875 (int)depth * 2 + 2, "", "Tail node is:", SvPV_nolen( mysv ),
a3621e74
YO
1876 (RExC_seen_evals) ? "[EVAL]" : ""
1877 );
1878 });
1879 /*
1880
1881 step through the branches, cur represents each
1882 branch, noper is the first thing to be matched
1883 as part of that branch and noper_next is the
1884 regnext() of that node. if noper is an EXACT
1885 and noper_next is the same as scan (our current
1886 position in the regex) then the EXACT branch is
1887 a possible optimization target. Once we have
1888 two or more consequetive such branches we can
1889 create a trie of the EXACT's contents and stich
1890 it in place. If the sequence represents all of
1891 the branches we eliminate the whole thing and
1892 replace it with a single TRIE. If it is a
1893 subsequence then we need to stitch it in. This
1894 means the first branch has to remain, and needs
1895 to be repointed at the item on the branch chain
1896 following the last branch optimized. This could
1897 be either a BRANCH, in which case the
1898 subsequence is internal, or it could be the
1899 item following the branch sequence in which
1900 case the subsequence is at the end.
1901
1902 */
1903
1904 /* dont use tail as the end marker for this traverse */
1905 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
1906 regnode *noper = NEXTOPER( cur );
1907 regnode *noper_next = regnext( noper );
1908
a3621e74
YO
1909 DEBUG_OPTIMISE_r({
1910 regprop( mysv, cur);
1911 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1912 (int)depth * 2 + 2," ", SvPV_nolen( mysv ) );
a3621e74
YO
1913
1914 regprop( mysv, noper);
1915 PerlIO_printf( Perl_debug_log, " -> %s",
1916 SvPV_nolen(mysv));
1917
1918 if ( noper_next ) {
1919 regprop( mysv, noper_next );
1920 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
1921 SvPV_nolen(mysv));
1922 }
1923 PerlIO_printf( Perl_debug_log, "0x%p,0x%p,0x%p)\n",
1924 first, last, cur );
1925 });
1926 if ( ( first ? OP( noper ) == optype
1927 : PL_regkind[ (U8)OP( noper ) ] == EXACT )
1928 && noper_next == tail && count<U16_MAX)
1929 {
1930 count++;
1931 if ( !first ) {
1932 first = cur;
1933 optype = OP( noper );
1934 } else {
1935 DEBUG_OPTIMISE_r(
1936 if (!last ) {
1937 regprop( mysv, first);
1938 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1939 (int)depth * 2 + 2, "F:", SvPV_nolen( mysv ) );
a3621e74
YO
1940 regprop( mysv, NEXTOPER(first) );
1941 PerlIO_printf( Perl_debug_log, " -> %s\n",
1942 SvPV_nolen( mysv ) );
1943 }
1944 );
1945 last = cur;
1946 DEBUG_OPTIMISE_r({
1947 regprop( mysv, cur);
1948 PerlIO_printf( Perl_debug_log, "%*s%s",
e4584336 1949 (int)depth * 2 + 2, "N:", SvPV_nolen( mysv ) );
a3621e74
YO
1950 regprop( mysv, noper );
1951 PerlIO_printf( Perl_debug_log, " -> %s\n",
1952 SvPV_nolen( mysv ) );
1953 });
1954 }
1955 } else {
1956 if ( last ) {
1957 DEBUG_OPTIMISE_r(
1958 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1959 (int)depth * 2 + 2, "E:", "**END**" );
a3621e74
YO
1960 );
1961 make_trie( pRExC_state, startbranch, first, cur, tail, optype );
1962 }
1963 if ( PL_regkind[ (U8)OP( noper ) ] == EXACT
1964 && noper_next == tail )
1965 {
1966 count = 1;
1967 first = cur;
1968 optype = OP( noper );
1969 } else {
1970 count = 0;
1971 first = NULL;
1972 optype = 0;
1973 }
1974 last = NULL;
1975 }
1976 }
1977 DEBUG_OPTIMISE_r({
1978 regprop( mysv, cur);
1979 PerlIO_printf( Perl_debug_log,
e4584336 1980 "%*s%s\t(0x%p,0x%p,0x%p)\n", (int)depth * 2 + 2,
a3621e74
YO
1981 " ", SvPV_nolen( mysv ), first, last, cur);
1982
1983 });
1984 if ( last ) {
1985 DEBUG_OPTIMISE_r(
1986 PerlIO_printf( Perl_debug_log, "%*s%s\n",
e4584336 1987 (int)depth * 2 + 2, "E:", "==END==" );
a3621e74
YO
1988 );
1989 make_trie( pRExC_state, startbranch, first, scan, tail, optype );
1990 }
1991 }
1992 }
a0ed51b3 1993 }
a3621e74 1994 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 1995 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 1996 } else /* single branch is optimized. */
c277df42
IZ
1997 scan = NEXTOPER(scan);
1998 continue;
a0ed51b3
LW
1999 }
2000 else if (OP(scan) == EXACT) {
cd439c50 2001 I32 l = STR_LEN(scan);
1aa99e6b 2002 UV uc = *((U8*)STRING(scan));
a0ed51b3 2003 if (UTF) {
a3b680e6 2004 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2005 l = utf8_length(s, s + l);
9041c2e3 2006 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2007 }
2008 min += l;
c277df42 2009 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2010 /* The code below prefers earlier match for fixed
2011 offset, later match for variable offset. */
2012 if (data->last_end == -1) { /* Update the start info. */
2013 data->last_start_min = data->pos_min;
2014 data->last_start_max = is_inf
b81d288d 2015 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2016 }
cd439c50 2017 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
0eda9292
JH
2018 {
2019 SV * sv = data->last_found;
2020 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2021 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2022 if (mg && mg->mg_len >= 0)
5e43f467
JH
2023 mg->mg_len += utf8_length((U8*)STRING(scan),
2024 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2025 }
33b8afdf
JH
2026 if (UTF)
2027 SvUTF8_on(data->last_found);
c277df42
IZ
2028 data->last_end = data->pos_min + l;
2029 data->pos_min += l; /* As in the first entry. */
2030 data->flags &= ~SF_BEFORE_EOL;
2031 }
653099ff
GS
2032 if (flags & SCF_DO_STCLASS_AND) {
2033 /* Check whether it is compatible with what we know already! */
2034 int compat = 1;
2035
1aa99e6b 2036 if (uc >= 0x100 ||
516a5887 2037 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2038 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2039 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2040 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2041 )
653099ff
GS
2042 compat = 0;
2043 ANYOF_CLASS_ZERO(data->start_class);
2044 ANYOF_BITMAP_ZERO(data->start_class);
2045 if (compat)
1aa99e6b 2046 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2047 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2048 if (uc < 0x100)
2049 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2050 }
2051 else if (flags & SCF_DO_STCLASS_OR) {
2052 /* false positive possible if the class is case-folded */
1aa99e6b 2053 if (uc < 0x100)
9b877dbb
IH
2054 ANYOF_BITMAP_SET(data->start_class, uc);
2055 else
2056 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2057 data->start_class->flags &= ~ANYOF_EOS;
2058 cl_and(data->start_class, &and_with);
2059 }
2060 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2061 }
653099ff 2062 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2063 I32 l = STR_LEN(scan);
1aa99e6b 2064 UV uc = *((U8*)STRING(scan));
653099ff
GS
2065
2066 /* Search for fixed substrings supports EXACT only. */
b81d288d 2067 if (flags & SCF_DO_SUBSTR)
830247a4 2068 scan_commit(pRExC_state, data);
a0ed51b3 2069 if (UTF) {
1aa99e6b
IH
2070 U8 *s = (U8 *)STRING(scan);
2071 l = utf8_length(s, s + l);
9041c2e3 2072 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2073 }
2074 min += l;
c277df42 2075 if (data && (flags & SCF_DO_SUBSTR))
a0ed51b3 2076 data->pos_min += l;
653099ff
GS
2077 if (flags & SCF_DO_STCLASS_AND) {
2078 /* Check whether it is compatible with what we know already! */
2079 int compat = 1;
2080
1aa99e6b 2081 if (uc >= 0x100 ||
516a5887 2082 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2083 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2084 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2085 compat = 0;
2086 ANYOF_CLASS_ZERO(data->start_class);
2087 ANYOF_BITMAP_ZERO(data->start_class);
2088 if (compat) {
1aa99e6b 2089 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2090 data->start_class->flags &= ~ANYOF_EOS;
2091 data->start_class->flags |= ANYOF_FOLD;
2092 if (OP(scan) == EXACTFL)
2093 data->start_class->flags |= ANYOF_LOCALE;
2094 }
2095 }
2096 else if (flags & SCF_DO_STCLASS_OR) {
2097 if (data->start_class->flags & ANYOF_FOLD) {
2098 /* false positive possible if the class is case-folded.
2099 Assume that the locale settings are the same... */
1aa99e6b
IH
2100 if (uc < 0x100)
2101 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2102 data->start_class->flags &= ~ANYOF_EOS;
2103 }
2104 cl_and(data->start_class, &and_with);
2105 }
2106 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2107 }
bfed75c6 2108 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2109 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2110 I32 f = flags, pos_before = 0;
c277df42 2111 regnode *oscan = scan;
653099ff
GS
2112 struct regnode_charclass_class this_class;
2113 struct regnode_charclass_class *oclass = NULL;
727f22e3 2114 I32 next_is_eval = 0;
653099ff 2115
22c35a8c 2116 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2117 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2118 scan = NEXTOPER(scan);
2119 goto finish;
2120 case PLUS:
653099ff 2121 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2122 next = NEXTOPER(scan);
653099ff 2123 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2124 mincount = 1;
2125 maxcount = REG_INFTY;
c277df42
IZ
2126 next = regnext(scan);
2127 scan = NEXTOPER(scan);
2128 goto do_curly;
2129 }
2130 }
2131 if (flags & SCF_DO_SUBSTR)
2132 data->pos_min++;
2133 min++;
2134 /* Fall through. */
2135 case STAR:
653099ff
GS
2136 if (flags & SCF_DO_STCLASS) {
2137 mincount = 0;
b81d288d 2138 maxcount = REG_INFTY;
653099ff
GS
2139 next = regnext(scan);
2140 scan = NEXTOPER(scan);
2141 goto do_curly;
2142 }
b81d288d 2143 is_inf = is_inf_internal = 1;
c277df42
IZ
2144 scan = regnext(scan);
2145 if (flags & SCF_DO_SUBSTR) {
830247a4 2146 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
c277df42
IZ
2147 data->longest = &(data->longest_float);
2148 }
2149 goto optimize_curly_tail;
2150 case CURLY:
b81d288d 2151 mincount = ARG1(scan);
c277df42
IZ
2152 maxcount = ARG2(scan);
2153 next = regnext(scan);
cb434fcc
IZ
2154 if (OP(scan) == CURLYX) {
2155 I32 lp = (data ? *(data->last_closep) : 0);
a3621e74 2156 scan->flags = ((lp <= U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2157 }
c277df42 2158 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2159 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2160 do_curly:
2161 if (flags & SCF_DO_SUBSTR) {
830247a4 2162 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
c277df42
IZ
2163 pos_before = data->pos_min;
2164 }
2165 if (data) {
2166 fl = data->flags;
2167 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2168 if (is_inf)
2169 data->flags |= SF_IS_INF;
2170 }
653099ff 2171 if (flags & SCF_DO_STCLASS) {
830247a4 2172 cl_init(pRExC_state, &this_class);
653099ff
GS
2173 oclass = data->start_class;
2174 data->start_class = &this_class;
2175 f |= SCF_DO_STCLASS_AND;
2176 f &= ~SCF_DO_STCLASS_OR;
2177 }
e1901655
IZ
2178 /* These are the cases when once a subexpression
2179 fails at a particular position, it cannot succeed
2180 even after backtracking at the enclosing scope.
b81d288d 2181
e1901655
IZ
2182 XXXX what if minimal match and we are at the
2183 initial run of {n,m}? */
2184 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2185 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2186
c277df42 2187 /* This will finish on WHILEM, setting scan, or on NULL: */
b81d288d 2188 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
a3621e74
YO
2189 (mincount == 0
2190 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2191
2192 if (flags & SCF_DO_STCLASS)
2193 data->start_class = oclass;
2194 if (mincount == 0 || minnext == 0) {
2195 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2196 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2197 }
2198 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2199 /* Switch to OR mode: cache the old value of
653099ff
GS
2200 * data->start_class */
2201 StructCopy(data->start_class, &and_with,
2202 struct regnode_charclass_class);
2203 flags &= ~SCF_DO_STCLASS_AND;
2204 StructCopy(&this_class, data->start_class,
2205 struct regnode_charclass_class);
2206 flags |= SCF_DO_STCLASS_OR;
2207 data->start_class->flags |= ANYOF_EOS;
2208 }
2209 } else { /* Non-zero len */
2210 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2211 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2212 cl_and(data->start_class, &and_with);
2213 }
2214 else if (flags & SCF_DO_STCLASS_AND)
2215 cl_and(data->start_class, &this_class);
2216 flags &= ~SCF_DO_STCLASS;
2217 }
c277df42
IZ
2218 if (!scan) /* It was not CURLYX, but CURLY. */
2219 scan = next;
84037bb0 2220 if (ckWARN(WARN_REGEXP)
727f22e3
JP
2221 /* ? quantifier ok, except for (?{ ... }) */
2222 && (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2223 && (minnext == 0) && (deltanext == 0)
99799961 2224 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
17feb5d5 2225 && maxcount <= REG_INFTY/3) /* Complement check for big count */
b45f050a 2226 {
830247a4 2227 vWARN(RExC_parse,
b45f050a
JF
2228 "Quantifier unexpected on zero-length expression");
2229 }
2230
c277df42 2231 min += minnext * mincount;
b81d288d 2232 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2233 && (minnext + deltanext) > 0)
2234 || deltanext == I32_MAX);
aca2d497 2235 is_inf |= is_inf_internal;
c277df42
IZ
2236 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2237
2238 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2239 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2240 && data->flags & SF_IN_PAR
2241 && !(data->flags & SF_HAS_EVAL)
2242 && !deltanext && minnext == 1 ) {
2243 /* Try to optimize to CURLYN. */
2244 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
497b47a8
JH
2245 regnode *nxt1 = nxt;
2246#ifdef DEBUGGING
2247 regnode *nxt2;
2248#endif
c277df42
IZ
2249
2250 /* Skip open. */
2251 nxt = regnext(nxt);
bfed75c6 2252 if (!strchr((const char*)PL_simple,OP(nxt))
22c35a8c 2253 && !(PL_regkind[(U8)OP(nxt)] == EXACT
b81d288d 2254 && STR_LEN(nxt) == 1))
c277df42 2255 goto nogo;
497b47a8 2256#ifdef DEBUGGING
c277df42 2257 nxt2 = nxt;
497b47a8 2258#endif
c277df42 2259 nxt = regnext(nxt);
b81d288d 2260 if (OP(nxt) != CLOSE)
c277df42
IZ
2261 goto nogo;
2262 /* Now we know that nxt2 is the only contents: */
eb160463 2263 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2264 OP(oscan) = CURLYN;
2265 OP(nxt1) = NOTHING; /* was OPEN. */
2266#ifdef DEBUGGING
2267 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2268 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2269 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2270 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2271 OP(nxt + 1) = OPTIMIZED; /* was count. */
2272 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2273#endif
c277df42 2274 }
c277df42
IZ
2275 nogo:
2276
2277 /* Try optimization CURLYX => CURLYM. */
b81d288d 2278 if ( OP(oscan) == CURLYX && data
c277df42 2279 && !(data->flags & SF_HAS_PAR)
c277df42 2280 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2281 && !deltanext /* atom is fixed width */
2282 && minnext != 0 /* CURLYM can't handle zero width */
2283 ) {
c277df42
IZ
2284 /* XXXX How to optimize if data == 0? */
2285 /* Optimize to a simpler form. */
2286 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2287 regnode *nxt2;
2288
2289 OP(oscan) = CURLYM;
2290 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2291 && (OP(nxt2) != WHILEM))
c277df42
IZ
2292 nxt = nxt2;
2293 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2294 /* Need to optimize away parenths. */
2295 if (data->flags & SF_IN_PAR) {
2296 /* Set the parenth number. */
2297 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2298
b81d288d 2299 if (OP(nxt) != CLOSE)
b45f050a 2300 FAIL("Panic opt close");
eb160463 2301 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2302 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2303 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2304#ifdef DEBUGGING
2305 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2306 OP(nxt + 1) = OPTIMIZED; /* was count. */
2307 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2308 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2309#endif
c277df42
IZ
2310#if 0
2311 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2312 regnode *nnxt = regnext(nxt1);
b81d288d 2313
c277df42
IZ
2314 if (nnxt == nxt) {
2315 if (reg_off_by_arg[OP(nxt1)])
2316 ARG_SET(nxt1, nxt2 - nxt1);
2317 else if (nxt2 - nxt1 < U16_MAX)
2318 NEXT_OFF(nxt1) = nxt2 - nxt1;
2319 else
2320 OP(nxt) = NOTHING; /* Cannot beautify */
2321 }
2322 nxt1 = nnxt;
2323 }
2324#endif
2325 /* Optimize again: */
b81d288d 2326 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
a3621e74 2327 NULL, 0,depth+1);
a0ed51b3
LW
2328 }
2329 else
c277df42 2330 oscan->flags = 0;
c277df42 2331 }
e1901655
IZ
2332 else if ((OP(oscan) == CURLYX)
2333 && (flags & SCF_WHILEM_VISITED_POS)
2334 /* See the comment on a similar expression above.
2335 However, this time it not a subexpression
2336 we care about, but the expression itself. */
2337 && (maxcount == REG_INFTY)
2338 && data && ++data->whilem_c < 16) {
2339 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2340 /* Find WHILEM (as in regexec.c) */
2341 regnode *nxt = oscan + NEXT_OFF(oscan);
2342
2343 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2344 nxt += ARG(nxt);
eb160463
GS
2345 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2346 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2347 }
b81d288d 2348 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2349 pars++;
2350 if (flags & SCF_DO_SUBSTR) {
2351 SV *last_str = Nullsv;
2352 int counted = mincount != 0;
2353
2354 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2355#if defined(SPARC64_GCC_WORKAROUND)
2356 I32 b = 0;
2357 STRLEN l = 0;
2358 char *s = NULL;
2359 I32 old = 0;
2360
2361 if (pos_before >= data->last_start_min)
2362 b = pos_before;
2363 else
2364 b = data->last_start_min;
2365
2366 l = 0;
2367 s = SvPV(data->last_found, l);
2368 old = b - data->last_start_min;
2369
2370#else
b81d288d 2371 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2372 ? pos_before : data->last_start_min;
2373 STRLEN l;
2374 char *s = SvPV(data->last_found, l);
a0ed51b3 2375 I32 old = b - data->last_start_min;
5d1c421c 2376#endif
a0ed51b3
LW
2377
2378 if (UTF)
2379 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2380
a0ed51b3 2381 l -= old;
c277df42 2382 /* Get the added string: */
79cb57f6 2383 last_str = newSVpvn(s + old, l);
0e933229
IH
2384 if (UTF)
2385 SvUTF8_on(last_str);
c277df42
IZ
2386 if (deltanext == 0 && pos_before == b) {
2387 /* What was added is a constant string */
2388 if (mincount > 1) {
2389 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2390 repeatcpy(SvPVX(last_str) + l,
c277df42 2391 SvPVX(last_str), l, mincount - 1);
b162af07 2392 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 2393 /* Add additional parts. */
b81d288d 2394 SvCUR_set(data->last_found,
c277df42
IZ
2395 SvCUR(data->last_found) - l);
2396 sv_catsv(data->last_found, last_str);
0eda9292
JH
2397 {
2398 SV * sv = data->last_found;
2399 MAGIC *mg =
2400 SvUTF8(sv) && SvMAGICAL(sv) ?
2401 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2402 if (mg && mg->mg_len >= 0)
2403 mg->mg_len += CHR_SVLEN(last_str);
2404 }
c277df42
IZ
2405 data->last_end += l * (mincount - 1);
2406 }
2a8d9689
HS
2407 } else {
2408 /* start offset must point into the last copy */
2409 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
2410 data->last_start_max += is_inf ? I32_MAX
2411 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
2412 }
2413 }
2414 /* It is counted once already... */
2415 data->pos_min += minnext * (mincount - counted);
2416 data->pos_delta += - counted * deltanext +
2417 (minnext + deltanext) * maxcount - minnext * mincount;
2418 if (mincount != maxcount) {
653099ff
GS
2419 /* Cannot extend fixed substrings found inside
2420 the group. */
830247a4 2421 scan_commit(pRExC_state,data);
c277df42
IZ
2422 if (mincount && last_str) {
2423 sv_setsv(data->last_found, last_str);
2424 data->last_end = data->pos_min;
b81d288d 2425 data->last_start_min =
a0ed51b3 2426 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
2427 data->last_start_max = is_inf
2428 ? I32_MAX
c277df42 2429 : data->pos_min + data->pos_delta
a0ed51b3 2430 - CHR_SVLEN(last_str);
c277df42
IZ
2431 }
2432 data->longest = &(data->longest_float);
2433 }
aca2d497 2434 SvREFCNT_dec(last_str);
c277df42 2435 }
405ff068 2436 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
2437 data->flags |= SF_HAS_EVAL;
2438 optimize_curly_tail:
c277df42 2439 if (OP(oscan) != CURLYX) {
22c35a8c 2440 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
2441 && NEXT_OFF(next))
2442 NEXT_OFF(oscan) += NEXT_OFF(next);
2443 }
c277df42 2444 continue;
653099ff 2445 default: /* REF and CLUMP only? */
c277df42 2446 if (flags & SCF_DO_SUBSTR) {
830247a4 2447 scan_commit(pRExC_state,data); /* Cannot expect anything... */
c277df42
IZ
2448 data->longest = &(data->longest_float);
2449 }
aca2d497 2450 is_inf = is_inf_internal = 1;
653099ff 2451 if (flags & SCF_DO_STCLASS_OR)
830247a4 2452 cl_anything(pRExC_state, data->start_class);
653099ff 2453 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
2454 break;
2455 }
a0ed51b3 2456 }
bfed75c6 2457 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 2458 int value = 0;
653099ff 2459
c277df42 2460 if (flags & SCF_DO_SUBSTR) {
830247a4 2461 scan_commit(pRExC_state,data);
c277df42
IZ
2462 data->pos_min++;
2463 }
2464 min++;
653099ff
GS
2465 if (flags & SCF_DO_STCLASS) {
2466 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
2467
2468 /* Some of the logic below assumes that switching
2469 locale on will only add false positives. */
2470 switch (PL_regkind[(U8)OP(scan)]) {
653099ff 2471 case SANY:
653099ff
GS
2472 default:
2473 do_default:
2474 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
2475 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2476 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2477 break;
2478 case REG_ANY:
2479 if (OP(scan) == SANY)
2480 goto do_default;
2481 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
2482 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
2483 || (data->start_class->flags & ANYOF_CLASS));
830247a4 2484 cl_anything(pRExC_state, data->start_class);
653099ff
GS
2485 }
2486 if (flags & SCF_DO_STCLASS_AND || !value)
2487 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
2488 break;
2489 case ANYOF:
2490 if (flags & SCF_DO_STCLASS_AND)
2491 cl_and(data->start_class,
2492 (struct regnode_charclass_class*)scan);
2493 else
830247a4 2494 cl_or(pRExC_state, data->start_class,
653099ff
GS
2495 (struct regnode_charclass_class*)scan);
2496 break;
2497 case ALNUM:
2498 if (flags & SCF_DO_STCLASS_AND) {
2499 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2500 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2501 for (value = 0; value < 256; value++)
2502 if (!isALNUM(value))
2503 ANYOF_BITMAP_CLEAR(data->start_class, value);
2504 }
2505 }
2506 else {
2507 if (data->start_class->flags & ANYOF_LOCALE)
2508 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2509 else {
2510 for (value = 0; value < 256; value++)
2511 if (isALNUM(value))
b81d288d 2512 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2513 }
2514 }
2515 break;
2516 case ALNUML:
2517 if (flags & SCF_DO_STCLASS_AND) {
2518 if (data->start_class->flags & ANYOF_LOCALE)
2519 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
2520 }
2521 else {
2522 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
2523 data->start_class->flags |= ANYOF_LOCALE;
2524 }
2525 break;
2526 case NALNUM:
2527 if (flags & SCF_DO_STCLASS_AND) {
2528 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2529 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2530 for (value = 0; value < 256; value++)
2531 if (isALNUM(value))
2532 ANYOF_BITMAP_CLEAR(data->start_class, value);
2533 }
2534 }
2535 else {
2536 if (data->start_class->flags & ANYOF_LOCALE)
2537 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2538 else {
2539 for (value = 0; value < 256; value++)
2540 if (!isALNUM(value))
b81d288d 2541 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2542 }
2543 }
2544 break;
2545 case NALNUML:
2546 if (flags & SCF_DO_STCLASS_AND) {
2547 if (data->start_class->flags & ANYOF_LOCALE)
2548 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
2549 }
2550 else {
2551 data->start_class->flags |= ANYOF_LOCALE;
2552 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
2553 }
2554 break;
2555 case SPACE:
2556 if (flags & SCF_DO_STCLASS_AND) {
2557 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2558 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2559 for (value = 0; value < 256; value++)
2560 if (!isSPACE(value))
2561 ANYOF_BITMAP_CLEAR(data->start_class, value);
2562 }
2563 }
2564 else {
2565 if (data->start_class->flags & ANYOF_LOCALE)
2566 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2567 else {
2568 for (value = 0; value < 256; value++)
2569 if (isSPACE(value))
b81d288d 2570 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2571 }
2572 }
2573 break;
2574 case SPACEL:
2575 if (flags & SCF_DO_STCLASS_AND) {
2576 if (data->start_class->flags & ANYOF_LOCALE)
2577 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
2578 }
2579 else {
2580 data->start_class->flags |= ANYOF_LOCALE;
2581 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
2582 }
2583 break;
2584 case NSPACE:
2585 if (flags & SCF_DO_STCLASS_AND) {
2586 if (!(data->start_class->flags & ANYOF_LOCALE)) {
2587 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2588 for (value = 0; value < 256; value++)
2589 if (isSPACE(value))
2590 ANYOF_BITMAP_CLEAR(data->start_class, value);
2591 }
2592 }
2593 else {
2594 if (data->start_class->flags & ANYOF_LOCALE)
2595 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2596 else {
2597 for (value = 0; value < 256; value++)
2598 if (!isSPACE(value))
b81d288d 2599 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2600 }
2601 }
2602 break;
2603 case NSPACEL:
2604 if (flags & SCF_DO_STCLASS_AND) {
2605 if (data->start_class->flags & ANYOF_LOCALE) {
2606 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
2607 for (value = 0; value < 256; value++)
2608 if (!isSPACE(value))
2609 ANYOF_BITMAP_CLEAR(data->start_class, value);
2610 }
2611 }
2612 else {
2613 data->start_class->flags |= ANYOF_LOCALE;
2614 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
2615 }
2616 break;
2617 case DIGIT:
2618 if (flags & SCF_DO_STCLASS_AND) {
2619 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
2620 for (value = 0; value < 256; value++)
2621 if (!isDIGIT(value))
2622 ANYOF_BITMAP_CLEAR(data->start_class, value);
2623 }
2624 else {
2625 if (data->start_class->flags & ANYOF_LOCALE)
2626 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
2627 else {
2628 for (value = 0; value < 256; value++)
2629 if (isDIGIT(value))
b81d288d 2630 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2631 }
2632 }
2633 break;
2634 case NDIGIT:
2635 if (flags & SCF_DO_STCLASS_AND) {
2636 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
2637 for (value = 0; value < 256; value++)
2638 if (isDIGIT(value))
2639 ANYOF_BITMAP_CLEAR(data->start_class, value);
2640 }
2641 else {
2642 if (data->start_class->flags & ANYOF_LOCALE)
2643 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
2644 else {
2645 for (value = 0; value < 256; value++)
2646 if (!isDIGIT(value))
b81d288d 2647 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
2648 }
2649 }
2650 break;
2651 }
2652 if (flags & SCF_DO_STCLASS_OR)
2653 cl_and(data->start_class, &and_with);
2654 flags &= ~SCF_DO_STCLASS;
2655 }
a0ed51b3 2656 }
22c35a8c 2657 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
2658 data->flags |= (OP(scan) == MEOL
2659 ? SF_BEFORE_MEOL
2660 : SF_BEFORE_SEOL);
a0ed51b3 2661 }
653099ff
GS
2662 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
2663 /* Lookbehind, or need to calculate parens/evals/stclass: */
2664 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 2665 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
653099ff 2666 /* Lookahead/lookbehind */
cb434fcc 2667 I32 deltanext, minnext, fake = 0;
c277df42 2668 regnode *nscan;
653099ff
GS
2669 struct regnode_charclass_class intrnl;
2670 int f = 0;
c277df42
IZ
2671
2672 data_fake.flags = 0;
b81d288d 2673 if (data) {
2c2d71f5 2674 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2675 data_fake.last_closep = data->last_closep;
2676 }
2677 else
2678 data_fake.last_closep = &fake;
653099ff
GS
2679 if ( flags & SCF_DO_STCLASS && !scan->flags
2680 && OP(scan) == IFMATCH ) { /* Lookahead */
830247a4 2681 cl_init(pRExC_state, &intrnl);
653099ff 2682 data_fake.start_class = &intrnl;
e1901655 2683 f |= SCF_DO_STCLASS_AND;
653099ff 2684 }
e1901655
IZ
2685 if (flags & SCF_WHILEM_VISITED_POS)
2686 f |= SCF_WHILEM_VISITED_POS;
c277df42
IZ
2687 next = regnext(scan);
2688 nscan = NEXTOPER(NEXTOPER(scan));
a3621e74 2689 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f,depth+1);
c277df42
IZ
2690 if (scan->flags) {
2691 if (deltanext) {
9baa0206 2692 vFAIL("Variable length lookbehind not implemented");
a0ed51b3
LW
2693 }
2694 else if (minnext > U8_MAX) {
9baa0206 2695 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
c277df42 2696 }
eb160463 2697 scan->flags = (U8)minnext;
c277df42
IZ
2698 }
2699 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2700 pars++;
405ff068 2701 if (data && (data_fake.flags & SF_HAS_EVAL))
c277df42 2702 data->flags |= SF_HAS_EVAL;
2c2d71f5
JH
2703 if (data)
2704 data->whilem_c = data_fake.whilem_c;
e1901655 2705 if (f & SCF_DO_STCLASS_AND) {
653099ff
GS
2706 int was = (data->start_class->flags & ANYOF_EOS);
2707
2708 cl_and(data->start_class, &intrnl);
2709 if (was)
2710 data->start_class->flags |= ANYOF_EOS;
2711 }
a0ed51b3
LW
2712 }
2713 else if (OP(scan) == OPEN) {
c277df42 2714 pars++;
a0ed51b3 2715 }
cb434fcc 2716 else if (OP(scan) == CLOSE) {
eb160463 2717 if ((I32)ARG(scan) == is_par) {
cb434fcc 2718 next = regnext(scan);
c277df42 2719
cb434fcc
IZ
2720 if ( next && (OP(next) != WHILEM) && next < last)
2721 is_par = 0; /* Disable optimization */
2722 }
2723 if (data)
2724 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
2725 }
2726 else if (OP(scan) == EVAL) {
c277df42
IZ
2727 if (data)
2728 data->flags |= SF_HAS_EVAL;
2729 }
96776eda 2730 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 2731 if (flags & SCF_DO_SUBSTR) {
830247a4 2732 scan_commit(pRExC_state,data);
0f5d15d6
IZ
2733 data->longest = &(data->longest_float);
2734 }
2735 is_inf = is_inf_internal = 1;
653099ff 2736 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 2737 cl_anything(pRExC_state, data->start_class);
96776eda 2738 flags &= ~SCF_DO_STCLASS;
0f5d15d6 2739 }
c277df42
IZ
2740 /* Else: zero-length, ignore. */
2741 scan = regnext(scan);
2742 }
2743
2744 finish:
2745 *scanp = scan;
aca2d497 2746 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 2747 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42
IZ
2748 data->pos_delta = I32_MAX - data->pos_min;
2749 if (is_par > U8_MAX)
2750 is_par = 0;
2751 if (is_par && pars==1 && data) {
2752 data->flags |= SF_IN_PAR;
2753 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
2754 }
2755 else if (pars && data) {
c277df42
IZ
2756 data->flags |= SF_HAS_PAR;
2757 data->flags &= ~SF_IN_PAR;
2758 }
653099ff
GS
2759 if (flags & SCF_DO_STCLASS_OR)
2760 cl_and(data->start_class, &and_with);
c277df42
IZ
2761 return min;
2762}
2763
76e3520e 2764STATIC I32
bfed75c6 2765S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 2766{
830247a4 2767 if (RExC_rx->data) {
b81d288d
AB
2768 Renewc(RExC_rx->data,
2769 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 2770 char, struct reg_data);
830247a4
IZ
2771 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
2772 RExC_rx->data->count += n;
a0ed51b3
LW
2773 }
2774 else {
830247a4 2775 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 2776 char, struct reg_data);
830247a4
IZ
2777 New(1208, RExC_rx->data->what, n, U8);
2778 RExC_rx->data->count = n;
c277df42 2779 }
830247a4
IZ
2780 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
2781 return RExC_rx->data->count - n;
c277df42
IZ
2782}
2783
d88dccdf 2784void
864dbfa3 2785Perl_reginitcolors(pTHX)
d88dccdf 2786{
1df70142 2787 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 2788 if (s) {
1df70142
AL
2789 char *t = savepv(s);
2790 int i = 0;
2791 PL_colors[0] = t;
d88dccdf 2792 while (++i < 6) {
1df70142
AL
2793 t = strchr(t, '\t');
2794 if (t) {
2795 *t = '\0';
2796 PL_colors[i] = ++t;
d88dccdf
IZ
2797 }
2798 else
1df70142 2799 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
2800 }
2801 } else {
1df70142 2802 int i = 0;
b81d288d 2803 while (i < 6)
06b5626a 2804 PL_colors[i++] = (char *)"";
d88dccdf
IZ
2805 }
2806 PL_colorset = 1;
2807}
2808
8615cb43 2809
a687059c 2810/*
e50aee73 2811 - pregcomp - compile a regular expression into internal code
a687059c
LW
2812 *
2813 * We can't allocate space until we know how big the compiled form will be,
2814 * but we can't compile it (and thus know how big it is) until we've got a
2815 * place to put the code. So we cheat: we compile it twice, once with code
2816 * generation turned off and size counting turned on, and once "for real".
2817 * This also means that we don't allocate space until we are sure that the
2818 * thing really will compile successfully, and we never have to move the
2819 * code and thus invalidate pointers into it. (Note that it has to be in
2820 * one piece because free() must be able to free it all.) [NB: not true in perl]
2821 *
2822 * Beware that the optimization-preparation code in here knows about some
2823 * of the structure of the compiled regexp. [I'll say.]
2824 */
2825regexp *
864dbfa3 2826Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 2827{
a0d0e21e 2828 register regexp *r;
c277df42 2829 regnode *scan;
c277df42 2830 regnode *first;
a0d0e21e 2831 I32 flags;
a0d0e21e
LW
2832 I32 minlen = 0;
2833 I32 sawplus = 0;
2834 I32 sawopen = 0;
2c2d71f5 2835 scan_data_t data;
830247a4
IZ
2836 RExC_state_t RExC_state;
2837 RExC_state_t *pRExC_state = &RExC_state;
a0d0e21e 2838
a3621e74
YO
2839 GET_RE_DEBUG_FLAGS_DECL;
2840
a0d0e21e 2841 if (exp == NULL)
c277df42 2842 FAIL("NULL regexp argument");
a0d0e21e 2843
a5961de5 2844 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 2845
5cfc7842 2846 RExC_precomp = exp;
a3621e74
YO
2847 DEBUG_r(if (!PL_colorset) reginitcolors());
2848 DEBUG_COMPILE_r({
2849 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s \"%s%*s%s\"\n",
a5961de5
JH
2850 PL_colors[4],PL_colors[5],PL_colors[0],
2851 (int)(xend - exp), RExC_precomp, PL_colors[1]);
2852 });
e2509266 2853 RExC_flags = pm->op_pmflags;
830247a4 2854 RExC_sawback = 0;
bbce6d69 2855
830247a4
IZ
2856 RExC_seen = 0;
2857 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
2858 RExC_seen_evals = 0;
2859 RExC_extralen = 0;
c277df42 2860
bbce6d69 2861 /* First pass: determine size, legality. */
830247a4 2862 RExC_parse = exp;
fac92740 2863 RExC_start = exp;
830247a4
IZ
2864 RExC_end = xend;
2865 RExC_naughty = 0;
2866 RExC_npar = 1;
2867 RExC_size = 0L;
2868 RExC_emit = &PL_regdummy;
2869 RExC_whilem_seen = 0;
85ddcde9
JH
2870#if 0 /* REGC() is (currently) a NOP at the first pass.
2871 * Clever compilers notice this and complain. --jhi */
830247a4 2872 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 2873#endif
830247a4 2874 if (reg(pRExC_state, 0, &flags) == NULL) {
830247a4 2875 RExC_precomp = Nullch;
a0d0e21e
LW
2876 return(NULL);
2877 }
a3621e74 2878 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
c277df42 2879
c277df42
IZ
2880 /* Small enough for pointer-storage convention?
2881 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
2882 if (RExC_size >= 0x10000L && RExC_extralen)
2883 RExC_size += RExC_extralen;
c277df42 2884 else
830247a4
IZ
2885 RExC_extralen = 0;
2886 if (RExC_whilem_seen > 15)
2887 RExC_whilem_seen = 15;
a0d0e21e 2888
bbce6d69 2889 /* Allocate space and initialize. */
830247a4 2890 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 2891 char, regexp);
a0d0e21e 2892 if (r == NULL)
b45f050a
JF
2893 FAIL("Regexp out of space");
2894
0f79a09d
GS
2895#ifdef DEBUGGING
2896 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 2897 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 2898#endif
c277df42 2899 r->refcnt = 1;
bbce6d69 2900 r->prelen = xend - exp;
5cfc7842 2901 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 2902 r->subbeg = NULL;
ed252734
NC
2903#ifdef PERL_COPY_ON_WRITE
2904 r->saved_copy = Nullsv;
2905#endif
cf93c79d 2906 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 2907 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4327152a
IZ
2908
2909 r->substrs = 0; /* Useful during FAIL. */
2910 r->startp = 0; /* Useful during FAIL. */
2911 r->endp = 0; /* Useful during FAIL. */
2912
fac92740
MJD
2913 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
2914 if (r->offsets) {
2af232bd 2915 r->offsets[0] = RExC_size;
fac92740 2916 }
a3621e74 2917 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
2918 "%s %"UVuf" bytes for offset annotations.\n",
2919 r->offsets ? "Got" : "Couldn't get",
392fbf5d 2920 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 2921
830247a4 2922 RExC_rx = r;
bbce6d69 2923
2924 /* Second pass: emit code. */
e2509266 2925 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
2926 RExC_parse = exp;
2927 RExC_end = xend;
2928 RExC_naughty = 0;
2929 RExC_npar = 1;
fac92740 2930 RExC_emit_start = r->program;
830247a4 2931 RExC_emit = r->program;
2cd61cdb 2932 /* Store the count of eval-groups for security checks: */
eb160463 2933 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
830247a4 2934 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 2935 r->data = 0;
830247a4 2936 if (reg(pRExC_state, 0, &flags) == NULL)
a0d0e21e
LW
2937 return(NULL);
2938
a3621e74 2939
a0d0e21e 2940 /* Dig out information for optimizations. */
cf93c79d 2941 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 2942 pm->op_pmflags = RExC_flags;
a0ed51b3 2943 if (UTF)
5ff6fc6d 2944 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 2945 r->regstclass = NULL;
830247a4 2946 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 2947 r->reganch |= ROPT_NAUGHTY;
c277df42 2948 scan = r->program + 1; /* First BRANCH. */
2779dcf1
IZ
2949
2950 /* XXXX To minimize changes to RE engine we always allocate
2951 3-units-long substrs field. */
2952 Newz(1004, r->substrs, 1, struct reg_substr_data);
2953
2c2d71f5 2954 StructCopy(&zero_scan_data, &data, scan_data_t);
653099ff 2955 /* XXXX Should not we check for something else? Usually it is OPEN1... */
c277df42 2956 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
c277df42 2957 I32 fake;
c5254dd6 2958 STRLEN longest_float_length, longest_fixed_length;
653099ff
GS
2959 struct regnode_charclass_class ch_class;
2960 int stclass_flag;
cb434fcc 2961 I32 last_close = 0;
a0d0e21e
LW
2962
2963 first = scan;
c277df42 2964 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 2965 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 2966 /* An OR of *one* alternative - should not happen now. */
a0d0e21e
LW
2967 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
2968 (OP(first) == PLUS) ||
2969 (OP(first) == MINMOD) ||
653099ff 2970 /* An {n,m} with n>0 */
22c35a8c 2971 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
a0d0e21e
LW
2972 if (OP(first) == PLUS)
2973 sawplus = 1;
2974 else
2975 first += regarglen[(U8)OP(first)];
2976 first = NEXTOPER(first);
a687059c
LW
2977 }
2978
a0d0e21e
LW
2979 /* Starting-point info. */
2980 again:
653099ff 2981 if (PL_regkind[(U8)OP(first)] == EXACT) {
1aa99e6b
IH
2982 if (OP(first) == EXACT)
2983 ; /* Empty, get anchored substr later. */
2984 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
2985 r->regstclass = first;
2986 }
bfed75c6 2987 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 2988 r->regstclass = first;
22c35a8c
GS
2989 else if (PL_regkind[(U8)OP(first)] == BOUND ||
2990 PL_regkind[(U8)OP(first)] == NBOUND)
a0d0e21e 2991 r->regstclass = first;
22c35a8c 2992 else if (PL_regkind[(U8)OP(first)] == BOL) {
cad2e5aa
JH
2993 r->reganch |= (OP(first) == MBOL
2994 ? ROPT_ANCH_MBOL
2995 : (OP(first) == SBOL
2996 ? ROPT_ANCH_SBOL
2997 : ROPT_ANCH_BOL));
a0d0e21e 2998 first = NEXTOPER(first);
774d564b 2999 goto again;
3000 }
3001 else if (OP(first) == GPOS) {
3002 r->reganch |= ROPT_ANCH_GPOS;
3003 first = NEXTOPER(first);
3004 goto again;
a0d0e21e 3005 }
e09294f4 3006 else if (!sawopen && (OP(first) == STAR &&
22c35a8c 3007 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3008 !(r->reganch & ROPT_ANCH) )
3009 {
3010 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3011 const int type =
3012 (OP(NEXTOPER(first)) == REG_ANY)
3013 ? ROPT_ANCH_MBOL
3014 : ROPT_ANCH_SBOL;
cad2e5aa 3015 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3016 first = NEXTOPER(first);
774d564b 3017 goto again;
a0d0e21e 3018 }
b81d288d 3019 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3020 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3021 /* x+ must match at the 1st pos of run of x's */
3022 r->reganch |= ROPT_SKIP;
a0d0e21e 3023
c277df42 3024 /* Scan is after the zeroth branch, first is atomic matcher. */
a3621e74 3025 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
b900a521 3026 (IV)(first - scan + 1)));
a0d0e21e
LW
3027 /*
3028 * If there's something expensive in the r.e., find the
3029 * longest literal string that must appear and make it the
3030 * regmust. Resolve ties in favor of later strings, since
3031 * the regstart check works with the beginning of the r.e.
3032 * and avoiding duplication strengthens checking. Not a
3033 * strong reason, but sufficient in the absence of others.
3034 * [Now we resolve ties in favor of the earlier string if
c277df42 3035 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
3036 * earlier string may buy us something the later one won't.]
3037 */
a0d0e21e 3038 minlen = 0;
a687059c 3039
79cb57f6
GS
3040 data.longest_fixed = newSVpvn("",0);
3041 data.longest_float = newSVpvn("",0);
3042 data.last_found = newSVpvn("",0);
c277df42
IZ
3043 data.longest = &(data.longest_fixed);
3044 first = scan;
653099ff 3045 if (!r->regstclass) {
830247a4 3046 cl_init(pRExC_state, &ch_class);
653099ff
GS
3047 data.start_class = &ch_class;
3048 stclass_flag = SCF_DO_STCLASS_AND;
3049 } else /* XXXX Check for BOUND? */
3050 stclass_flag = 0;
cb434fcc 3051 data.last_closep = &last_close;
653099ff 3052
830247a4 3053 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
a3621e74 3054 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
830247a4 3055 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 3056 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
3057 && !RExC_seen_zerolen
3058 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 3059 r->reganch |= ROPT_CHECK_ALL;
830247a4 3060 scan_commit(pRExC_state, &data);
c277df42
IZ
3061 SvREFCNT_dec(data.last_found);
3062
a0ed51b3 3063 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 3064 if (longest_float_length
c277df42
IZ
3065 || (data.flags & SF_FL_BEFORE_EOL
3066 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3067 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3068 int t;
3069
a0ed51b3 3070 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
3071 && data.offset_fixed == data.offset_float_min
3072 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
3073 goto remove_float; /* As in (a)+. */
3074
33b8afdf
JH
3075 if (SvUTF8(data.longest_float)) {
3076 r->float_utf8 = data.longest_float;
3077 r->float_substr = Nullsv;
3078 } else {
3079 r->float_substr = data.longest_float;
3080 r->float_utf8 = Nullsv;
3081 }
c277df42
IZ
3082 r->float_min_offset = data.offset_float_min;
3083 r->float_max_offset = data.offset_float_max;
cf93c79d
IZ
3084 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
3085 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 3086 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3087 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3088 }
3089 else {
aca2d497 3090 remove_float:
33b8afdf 3091 r->float_substr = r->float_utf8 = Nullsv;
c277df42 3092 SvREFCNT_dec(data.longest_float);
c5254dd6 3093 longest_float_length = 0;
a0d0e21e 3094 }
c277df42 3095
a0ed51b3 3096 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 3097 if (longest_fixed_length
c277df42
IZ
3098 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
3099 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3100 || (RExC_flags & PMf_MULTILINE)))) {
cf93c79d
IZ
3101 int t;
3102
33b8afdf
JH
3103 if (SvUTF8(data.longest_fixed)) {
3104 r->anchored_utf8 = data.longest_fixed;
3105 r->anchored_substr = Nullsv;
3106 } else {
3107 r->anchored_substr = data.longest_fixed;
3108 r->anchored_utf8 = Nullsv;
3109 }
c277df42 3110 r->anchored_offset = data.offset_fixed;
cf93c79d
IZ
3111 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
3112 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 3113 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 3114 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
3115 }
3116 else {
33b8afdf 3117 r->anchored_substr = r->anchored_utf8 = Nullsv;
c277df42 3118 SvREFCNT_dec(data.longest_fixed);
c5254dd6 3119 longest_fixed_length = 0;
a0d0e21e 3120 }
b81d288d 3121 if (r->regstclass
ffc61ed2 3122 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 3123 r->regstclass = NULL;
33b8afdf
JH
3124 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
3125 && stclass_flag
653099ff 3126 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3127 && !cl_is_anything(data.start_class))
3128 {
1df70142 3129 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3130
b81d288d 3131 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3132 struct regnode_charclass_class);
3133 StructCopy(data.start_class,
830247a4 3134 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3135 struct regnode_charclass_class);
830247a4 3136 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3137 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
ffc61ed2 3138 PL_regdata = r->data; /* for regprop() */
a3621e74 3139 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
9c5ffd7c
JH
3140 regprop(sv, (regnode*)data.start_class);
3141 PerlIO_printf(Perl_debug_log,
3142 "synthetic stclass `%s'.\n",
3143 SvPVX(sv));});
653099ff 3144 }
c277df42
IZ
3145
3146 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 3147 if (longest_fixed_length > longest_float_length) {
c277df42 3148 r->check_substr = r->anchored_substr;
33b8afdf 3149 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
3150 r->check_offset_min = r->check_offset_max = r->anchored_offset;
3151 if (r->reganch & ROPT_ANCH_SINGLE)
3152 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
3153 }
3154 else {
c277df42 3155 r->check_substr = r->float_substr;
33b8afdf 3156 r->check_utf8 = r->float_utf8;
c277df42
IZ
3157 r->check_offset_min = data.offset_float_min;
3158 r->check_offset_max = data.offset_float_max;
a0d0e21e 3159 }
30382c73
IZ
3160 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
3161 This should be changed ASAP! */
33b8afdf 3162 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 3163 r->reganch |= RE_USE_INTUIT;
33b8afdf 3164 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
3165 r->reganch |= RE_INTUIT_TAIL;
3166 }
a0ed51b3
LW
3167 }
3168 else {
c277df42
IZ
3169 /* Several toplevels. Best we can is to set minlen. */
3170 I32 fake;
653099ff 3171 struct regnode_charclass_class ch_class;
cb434fcc 3172 I32 last_close = 0;
c277df42 3173
a3621e74 3174 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
c277df42 3175 scan = r->program + 1;
830247a4 3176 cl_init(pRExC_state, &ch_class);
653099ff 3177 data.start_class = &ch_class;
cb434fcc 3178 data.last_closep = &last_close;
a3621e74 3179 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
33b8afdf
JH
3180 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
3181 = r->float_substr = r->float_utf8 = Nullsv;
653099ff 3182 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
3183 && !cl_is_anything(data.start_class))
3184 {
1df70142 3185 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 3186
b81d288d 3187 New(1006, RExC_rx->data->data[n], 1,
653099ff
GS
3188 struct regnode_charclass_class);
3189 StructCopy(data.start_class,
830247a4 3190 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 3191 struct regnode_charclass_class);
830247a4 3192 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 3193 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 3194 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
9c5ffd7c
JH
3195 regprop(sv, (regnode*)data.start_class);
3196 PerlIO_printf(Perl_debug_log,
3197 "synthetic stclass `%s'.\n",
3198 SvPVX(sv));});
653099ff 3199 }
a0d0e21e
LW
3200 }
3201
a0d0e21e 3202 r->minlen = minlen;
b81d288d 3203 if (RExC_seen & REG_SEEN_GPOS)
c277df42 3204 r->reganch |= ROPT_GPOS_SEEN;
830247a4 3205 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 3206 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 3207 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 3208 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
3209 if (RExC_seen & REG_SEEN_CANY)
3210 r->reganch |= ROPT_CANY_SEEN;
830247a4
IZ
3211 Newz(1002, r->startp, RExC_npar, I32);
3212 Newz(1002, r->endp, RExC_npar, I32);
ffc61ed2 3213 PL_regdata = r->data; /* for regprop() */
a3621e74 3214 DEBUG_COMPILE_r(regdump(r));
a0d0e21e 3215 return(r);
a687059c
LW
3216}
3217
3218/*
3219 - reg - regular expression, i.e. main body or parenthesized thing
3220 *
3221 * Caller must absorb opening parenthesis.
3222 *
3223 * Combining parenthesis handling with the base level of regular expression
3224 * is a trifle forced, but the need to tie the tails of the branches to what
3225 * follows makes it hard to avoid.
3226 */
76e3520e 3227STATIC regnode *
830247a4 3228S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
c277df42 3229 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 3230{
27da23d5 3231 dVAR;
c277df42
IZ
3232 register regnode *ret; /* Will be the head of the group. */
3233 register regnode *br;
3234 register regnode *lastbr;
3235 register regnode *ender = 0;
a0d0e21e 3236 register I32 parno = 0;
e2509266 3237 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
9d1d55b5
JP
3238
3239 /* for (?g), (?gc), and (?o) warnings; warning
3240 about (?c) will warn about (?g) -- japhy */
3241
3242 I32 wastedflags = 0x00,
3243 wasted_o = 0x01,
3244 wasted_g = 0x02,
3245 wasted_gc = 0x02 | 0x04,
3246 wasted_c = 0x04;
3247
fac92740 3248 char * parse_start = RExC_parse; /* MJD */
830247a4 3249 char *oregcomp_parse = RExC_parse;
c277df42 3250 char c;
a0d0e21e 3251
821b33a5 3252 *flagp = 0; /* Tentatively. */
a0d0e21e 3253
9d1d55b5 3254
a0d0e21e
LW
3255 /* Make an OPEN node, if parenthesized. */
3256 if (paren) {
fac92740 3257 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
3258 U32 posflags = 0, negflags = 0;
3259 U32 *flagsp = &posflags;
0f5d15d6 3260 int logical = 0;
830247a4 3261 char *seqstart = RExC_parse;
ca9dfc88 3262
830247a4
IZ
3263 RExC_parse++;
3264 paren = *RExC_parse++;
c277df42 3265 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 3266 switch (paren) {
fac92740 3267 case '<': /* (?<...) */
830247a4 3268 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 3269 if (*RExC_parse == '!')
c277df42 3270 paren = ',';
b81d288d 3271 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 3272 goto unknown;
830247a4 3273 RExC_parse++;
fac92740
MJD
3274 case '=': /* (?=...) */
3275 case '!': /* (?!...) */
830247a4 3276 RExC_seen_zerolen++;
fac92740
MJD
3277 case ':': /* (?:...) */
3278 case '>': /* (?>...) */
a0d0e21e 3279 break;
fac92740
MJD
3280 case '$': /* (?$...) */
3281 case '@': /* (?@...) */
8615cb43 3282 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 3283 break;
fac92740 3284 case '#': /* (?#...) */
830247a4
IZ
3285 while (*RExC_parse && *RExC_parse != ')')
3286 RExC_parse++;
3287 if (*RExC_parse != ')')
c277df42 3288 FAIL("Sequence (?#... not terminated");
830247a4 3289 nextchar(pRExC_state);
a0d0e21e
LW
3290 *flagp = TRYAGAIN;
3291 return NULL;
fac92740 3292 case 'p': /* (?p...) */
9014280d 3293 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 3294 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 3295 /* FALL THROUGH*/
fac92740 3296 case '?': /* (??...) */
0f5d15d6 3297 logical = 1;
438a3801
YST
3298 if (*RExC_parse != '{')
3299 goto unknown;
830247a4 3300 paren = *RExC_parse++;
0f5d15d6 3301 /* FALL THROUGH */
fac92740 3302 case '{': /* (?{...}) */
c277df42 3303 {
c277df42
IZ
3304 I32 count = 1, n = 0;
3305 char c;
830247a4 3306 char *s = RExC_parse;
c277df42
IZ
3307 SV *sv;
3308 OP_4tree *sop, *rop;
3309
830247a4
IZ
3310 RExC_seen_zerolen++;
3311 RExC_seen |= REG_SEEN_EVAL;
3312 while (count && (c = *RExC_parse)) {
3313 if (c == '\\' && RExC_parse[1])
3314 RExC_parse++;
b81d288d 3315 else if (c == '{')
c277df42 3316 count++;
b81d288d 3317 else if (c == '}')
c277df42 3318 count--;
830247a4 3319 RExC_parse++;
c277df42 3320 }
830247a4 3321 if (*RExC_parse != ')')
b45f050a 3322 {
b81d288d 3323 RExC_parse = s;
b45f050a
JF
3324 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
3325 }
c277df42 3326 if (!SIZE_ONLY) {
f3548bdc 3327 PAD *pad;
b81d288d
AB
3328
3329 if (RExC_parse - 1 - s)
830247a4 3330 sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 3331 else
79cb57f6 3332 sv = newSVpvn("", 0);
c277df42 3333
569233ed
SB
3334 ENTER;
3335 Perl_save_re_context(aTHX);
f3548bdc 3336 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
3337 sop->op_private |= OPpREFCOUNTED;
3338 /* re_dup will OpREFCNT_inc */
3339 OpREFCNT_set(sop, 1);
569233ed 3340 LEAVE;
c277df42 3341
830247a4
IZ
3342 n = add_data(pRExC_state, 3, "nop");
3343 RExC_rx->data->data[n] = (void*)rop;
3344 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 3345 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 3346 SvREFCNT_dec(sv);
a0ed51b3 3347 }
e24b16f9 3348 else { /* First pass */
830247a4 3349 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 3350 && IN_PERL_RUNTIME)
2cd61cdb
IZ
3351 /* No compiled RE interpolated, has runtime
3352 components ===> unsafe. */
3353 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 3354 if (PL_tainting && PL_tainted)
cc6b7395 3355 FAIL("Eval-group in insecure regular expression");
923e4eb5 3356 if (IN_PERL_COMPILETIME)
b5c19bd7 3357 PL_cv_has_eval = 1;
c277df42 3358 }
b5c19bd7 3359
830247a4 3360 nextchar(pRExC_state);
0f5d15d6 3361 if (logical) {
830247a4 3362 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3363 if (!SIZE_ONLY)
3364 ret->flags = 2;
830247a4 3365 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 3366 /* deal with the length of this later - MJD */
0f5d15d6
IZ
3367 return ret;
3368 }
ccb2c380
MP
3369 ret = reganode(pRExC_state, EVAL, n);
3370 Set_Node_Length(ret, RExC_parse - parse_start + 1);
3371 Set_Node_Offset(ret, parse_start);
3372 return ret;
c277df42 3373 }
fac92740 3374 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 3375 {
fac92740 3376 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
3377 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
3378 || RExC_parse[1] == '<'
830247a4 3379 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
3380 I32 flag;
3381
830247a4 3382 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
3383 if (!SIZE_ONLY)
3384 ret->flags = 1;
830247a4 3385 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
c277df42 3386 goto insert_if;
b81d288d 3387 }
a0ed51b3 3388 }
830247a4 3389 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 3390 /* (?(1)...) */
830247a4 3391 parno = atoi(RExC_parse++);
c277df42 3392
830247a4
IZ
3393 while (isDIGIT(*RExC_parse))
3394 RExC_parse++;
fac92740 3395 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 3396
830247a4 3397 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 3398 vFAIL("Switch condition not recognized");
c277df42 3399 insert_if:
830247a4
IZ
3400 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
3401 br = regbranch(pRExC_state, &flags, 1);
c277df42 3402 if (br == NULL)
830247a4 3403 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 3404 else
830247a4
IZ
3405 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
3406 c = *nextchar(pRExC_state);
d1b80229
IZ
3407 if (flags&HASWIDTH)
3408 *flagp |= HASWIDTH;
c277df42 3409 if (c == '|') {
830247a4
IZ
3410 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3411 regbranch(pRExC_state, &flags, 1);
3412 regtail(pRExC_state, ret, lastbr);
d1b80229
IZ
3413 if (flags&HASWIDTH)
3414 *flagp |= HASWIDTH;
830247a4 3415 c = *nextchar(pRExC_state);
a0ed51b3
LW
3416 }
3417 else
c277df42
IZ
3418 lastbr = NULL;
3419 if (c != ')')
8615cb43 3420 vFAIL("Switch (?(condition)... contains too many branches");
830247a4
IZ
3421 ender = reg_node(pRExC_state, TAIL);
3422 regtail(pRExC_state, br, ender);
c277df42 3423 if (lastbr) {
830247a4
IZ
3424 regtail(pRExC_state, lastbr, ender);
3425 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
3426 }
3427 else
830247a4 3428 regtail(pRExC_state, ret, ender);
c277df42 3429 return ret;
a0ed51b3
LW
3430 }
3431 else {
830247a4 3432 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
3433 }
3434 }
1b1626e4 3435 case 0:
830247a4 3436 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 3437 vFAIL("Sequence (? incomplete");
1b1626e4 3438 break;
a0d0e21e 3439 default:
830247a4 3440 --RExC_parse;
fac92740 3441 parse_flags: /* (?i) */
830247a4 3442 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
3443 /* (?g), (?gc) and (?o) are useless here
3444 and must be globally applied -- japhy */
3445
3446 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
3447 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3448 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
3449 if (! (wastedflags & wflagbit) ) {
3450 wastedflags |= wflagbit;
3451 vWARN5(
3452 RExC_parse + 1,
3453 "Useless (%s%c) - %suse /%c modifier",
3454 flagsp == &negflags ? "?-" : "?",
3455 *RExC_parse,
3456 flagsp == &negflags ? "don't " : "",
3457 *RExC_parse
3458 );
3459 }
3460 }
3461 }
3462 else if (*RExC_parse == 'c') {
3463 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
3464 if (! (wastedflags & wasted_c) ) {
3465 wastedflags |= wasted_gc;
3466 vWARN3(
3467 RExC_parse + 1,
3468 "Useless (%sc) - %suse /gc modifier",
3469 flagsp == &negflags ? "?-" : "?",
3470 flagsp == &negflags ? "don't " : ""
3471 );
3472 }
3473 }
3474 }
3475 else { pmflag(flagsp, *RExC_parse); }
3476
830247a4 3477 ++RExC_parse;
ca9dfc88 3478 }
830247a4 3479 if (*RExC_parse == '-') {
ca9dfc88 3480 flagsp = &negflags;
9d1d55b5 3481 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 3482 ++RExC_parse;
ca9dfc88 3483 goto parse_flags;
48c036b1 3484 }
e2509266
JH
3485 RExC_flags |= posflags;
3486 RExC_flags &= ~negflags;
830247a4
IZ
3487 if (*RExC_parse == ':') {
3488 RExC_parse++;
ca9dfc88
IZ
3489 paren = ':';
3490 break;
3491 }
c277df42 3492 unknown:
830247a4
IZ
3493 if (*RExC_parse != ')') {
3494 RExC_parse++;
3495 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 3496 }
830247a4 3497 nextchar(pRExC_state);
a0d0e21e
LW
3498 *flagp = TRYAGAIN;
3499 return NULL;
3500 }
3501 }
fac92740 3502 else { /* (...) */
830247a4
IZ
3503 parno = RExC_npar;
3504 RExC_npar++;
3505 ret = reganode(pRExC_state, OPEN, parno);
fac92740
MJD
3506 Set_Node_Length(ret, 1); /* MJD */
3507 Set_Node_Offset(ret, RExC_parse); /* MJD */
c277df42 3508 open = 1;
a0d0e21e 3509 }
a0ed51b3 3510 }
fac92740 3511 else /* ! paren */
a0d0e21e
LW
3512 ret = NULL;
3513
3514 /* Pick up the branches, linking them together. */
fac92740 3515 parse_start = RExC_parse; /* MJD */
830247a4 3516 br = regbranch(pRExC_state, &flags, 1);
fac92740 3517 /* branch_len = (paren != 0); */
2af232bd 3518
a0d0e21e
LW
3519 if (br == NULL)
3520 return(NULL);
830247a4
IZ
3521 if (*RExC_parse == '|') {
3522 if (!SIZE_ONLY && RExC_extralen) {
3523 reginsert(pRExC_state, BRANCHJ, br);
a0ed51b3 3524 }
fac92740 3525 else { /* MJD */
830247a4 3526 reginsert(pRExC_state, BRANCH, br);
fac92740
MJD
3527 Set_Node_Length(br, paren != 0);
3528 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
3529 }
c277df42
IZ
3530 have_branch = 1;
3531 if (SIZE_ONLY)
830247a4 3532 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
3533 }
3534 else if (paren == ':') {
c277df42
IZ
3535 *flagp |= flags&SIMPLE;
3536 }
3537 if (open) { /* Starts with OPEN. */
830247a4 3538 regtail(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
3539 }
3540 else if (paren != '?') /* Not Conditional */
a0d0e21e 3541 ret = br;
32a0ca98 3542 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 3543 lastbr = br;
830247a4
IZ
3544 while (*RExC_parse == '|') {
3545 if (!SIZE_ONLY && RExC_extralen) {
3546 ender = reganode(pRExC_state, LONGJMP,0);
3547 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
3548 }
3549 if (SIZE_ONLY)
830247a4
IZ
3550 RExC_extralen += 2; /* Account for LONGJMP. */
3551 nextchar(pRExC_state);
3552 br = regbranch(pRExC_state, &flags, 0);
2af232bd 3553
a687059c 3554 if (br == NULL)
a0d0e21e 3555 return(NULL);
830247a4 3556 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 3557 lastbr = br;
821b33a5
IZ
3558 if (flags&HASWIDTH)
3559 *flagp |= HASWIDTH;
a687059c 3560 *flagp |= flags&SPSTART;
a0d0e21e
LW
3561 }
3562
c277df42
IZ
3563 if (have_branch || paren != ':') {
3564 /* Make a closing node, and hook it on the end. */
3565 switch (paren) {
3566 case ':':
830247a4 3567 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
3568 break;
3569 case 1:
830247a4 3570 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
3571 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
3572 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
3573 break;
3574 case '<':
c277df42
IZ
3575 case ',':
3576 case '=':
3577 case '!':
c277df42 3578 *flagp &= ~HASWIDTH;
821b33a5
IZ
3579 /* FALL THROUGH */
3580 case '>':
830247a4 3581 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
3582 break;
3583 case 0:
830247a4 3584 ender = reg_node(pRExC_state, END);
c277df42
IZ
3585 break;
3586 }
830247a4 3587 regtail(pRExC_state, lastbr, ender);
a0d0e21e 3588
c277df42
IZ
3589 if (have_branch) {
3590 /* Hook the tails of the branches to the closing node. */
3591 for (br = ret; br != NULL; br = regnext(br)) {
830247a4 3592 regoptail(pRExC_state, br, ender);
c277df42
IZ
3593 }
3594 }
a0d0e21e 3595 }
c277df42
IZ
3596
3597 {
e1ec3a88
AL
3598 const char *p;
3599 static const char parens[] = "=!<,>";
c277df42
IZ
3600
3601 if (paren && (p = strchr(parens, paren))) {
eb160463 3602 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
3603 int flag = (p - parens) > 1;
3604
3605 if (paren == '>')
3606 node = SUSPEND, flag = 0;
830247a4 3607 reginsert(pRExC_state, node,ret);
45948336
EP
3608 Set_Node_Cur_Length(ret);
3609 Set_Node_Offset(ret, parse_start + 1);
c277df42 3610 ret->flags = flag;
830247a4 3611 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 3612 }
a0d0e21e
LW
3613 }
3614
3615 /* Check for proper termination. */
ce3e6498 3616 if (paren) {
e2509266 3617 RExC_flags = oregflags;
830247a4
IZ
3618 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
3619 RExC_parse = oregcomp_parse;
380a0633 3620 vFAIL("Unmatched (");
ce3e6498 3621 }
a0ed51b3 3622 }
830247a4
IZ
3623 else if (!paren && RExC_parse < RExC_end) {
3624 if (*RExC_parse == ')') {
3625 RExC_parse++;
380a0633 3626 vFAIL("Unmatched )");
a0ed51b3
LW
3627 }
3628 else
b45f050a 3629 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
3630 /* NOTREACHED */
3631 }
a687059c 3632
a0d0e21e 3633 return(ret);
a687059c
LW
3634}
3635
3636/*
3637 - regbranch - one alternative of an | operator
3638 *
3639 * Implements the concatenation operator.
3640 */
76e3520e 3641STATIC regnode *
830247a4 3642S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
a687059c 3643{
c277df42
IZ
3644 register regnode *ret;
3645 register regnode *chain = NULL;
3646 register regnode *latest;
3647 I32 flags = 0, c = 0;
a0d0e21e 3648
b81d288d 3649 if (first)
c277df42
IZ
3650 ret = NULL;
3651 else {
b81d288d 3652 if (!SIZE_ONLY && RExC_extralen)
830247a4 3653 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 3654 else {
830247a4 3655 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
3656 Set_Node_Length(ret, 1);
3657 }
c277df42
IZ
3658 }
3659
b81d288d 3660 if (!first && SIZE_ONLY)
830247a4 3661 RExC_extralen += 1; /* BRANCHJ */
b81d288d 3662
c277df42 3663 *flagp = WORST; /* Tentatively. */
a0d0e21e 3664
830247a4
IZ
3665 RExC_parse--;
3666 nextchar(pRExC_state);
3667 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 3668 flags &= ~TRYAGAIN;
830247a4 3669 latest = regpiece(pRExC_state, &flags);
a0d0e21e
LW
3670 if (latest == NULL) {
3671 if (flags & TRYAGAIN)
3672 continue;
3673 return(NULL);
a0ed51b3
LW
3674 }
3675 else if (ret == NULL)
c277df42 3676 ret = latest;
a0d0e21e 3677 *flagp |= flags&HASWIDTH;
c277df42 3678 if (chain == NULL) /* First piece. */
a0d0e21e
LW
3679 *flagp |= flags&SPSTART;
3680 else {
830247a4
IZ
3681 RExC_naughty++;
3682 regtail(pRExC_state, chain, latest);
a687059c 3683 }
a0d0e21e 3684 chain = latest;
c277df42
IZ
3685 c++;
3686 }
3687 if (chain == NULL) { /* Loop ran zero times. */
830247a4 3688 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
3689 if (ret == NULL)
3690 ret = chain;
3691 }
3692 if (c == 1) {
3693 *flagp |= flags&SIMPLE;
a0d0e21e 3694 }
a687059c 3695
a0d0e21e 3696 return(ret);
a687059c
LW
3697}
3698
3699/*
3700 - regpiece - something followed by possible [*+?]
3701 *
3702 * Note that the branching code sequences used for ? and the general cases
3703 * of * and + are somewhat optimized: they use the same NOTHING node as
3704 * both the endmarker for their branch list and the body of the last branch.
3705 * It might seem that this node could be dispensed with entirely, but the
3706 * endmarker role is not redundant.
3707 */
76e3520e 3708STATIC regnode *
830247a4 3709S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3710{
c277df42 3711 register regnode *ret;
a0d0e21e
LW
3712 register char op;
3713 register char *next;
3714 I32 flags;
1df70142 3715 const char * const origparse = RExC_parse;
a0d0e21e
LW
3716 char *maxpos;
3717 I32 min;
c277df42 3718 I32 max = REG_INFTY;
fac92740 3719 char *parse_start;
a0d0e21e 3720
830247a4 3721 ret = regatom(pRExC_state, &flags);
a0d0e21e
LW
3722 if (ret == NULL) {
3723 if (flags & TRYAGAIN)
3724 *flagp |= TRYAGAIN;
3725 return(NULL);
3726 }
3727
830247a4 3728 op = *RExC_parse;
a0d0e21e 3729
830247a4 3730 if (op == '{' && regcurly(RExC_parse)) {
fac92740 3731 parse_start = RExC_parse; /* MJD */
830247a4 3732 next = RExC_parse + 1;
a0d0e21e
LW
3733 maxpos = Nullch;
3734 while (isDIGIT(*next) || *next == ',') {
3735 if (*next == ',') {
3736 if (maxpos)
3737 break;
3738 else
3739 maxpos = next;
a687059c 3740 }
a0d0e21e
LW
3741 next++;
3742 }
3743 if (*next == '}') { /* got one */
3744 if (!maxpos)
3745 maxpos = next;
830247a4
IZ
3746 RExC_parse++;
3747 min = atoi(RExC_parse);
a0d0e21e
LW
3748 if (*maxpos == ',')
3749 maxpos++;
3750 else
830247a4 3751 maxpos = RExC_parse;
a0d0e21e
LW
3752 max = atoi(maxpos);
3753 if (!max && *maxpos != '0')
c277df42
IZ
3754 max = REG_INFTY; /* meaning "infinity" */
3755 else if (max >= REG_INFTY)
8615cb43 3756 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
3757 RExC_parse = next;
3758 nextchar(pRExC_state);
a0d0e21e
LW
3759
3760 do_curly:
3761 if ((flags&SIMPLE)) {
830247a4
IZ
3762 RExC_naughty += 2 + RExC_naughty / 2;
3763 reginsert(pRExC_state, CURLY, ret);
fac92740
MJD
3764 Set_Node_Offset(ret, parse_start+1); /* MJD */
3765 Set_Node_Cur_Length(ret);
a0d0e21e
LW
3766 }
3767 else {
830247a4 3768 regnode *w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
3769
3770 w->flags = 0;
830247a4
IZ
3771 regtail(pRExC_state, ret, w);
3772 if (!SIZE_ONLY && RExC_extralen) {
3773 reginsert(pRExC_state, LONGJMP,ret);
3774 reginsert(pRExC_state, NOTHING,ret);
c277df42
IZ
3775 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
3776 }
830247a4 3777 reginsert(pRExC_state, CURLYX,ret);
fac92740
MJD
3778 /* MJD hk */
3779 Set_Node_Offset(ret, parse_start+1);
2af232bd 3780 Set_Node_Length(ret,
fac92740 3781 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 3782
830247a4 3783 if (!SIZE_ONLY && RExC_extralen)
c277df42 3784 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
830247a4 3785 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 3786 if (SIZE_ONLY)
830247a4
IZ
3787 RExC_whilem_seen++, RExC_extralen += 3;
3788 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 3789 }
c277df42 3790 ret->flags = 0;
a0d0e21e
LW
3791
3792 if (min > 0)
821b33a5
IZ
3793 *flagp = WORST;
3794 if (max > 0)
3795 *flagp |= HASWIDTH;
a0d0e21e 3796 if (max && max < min)
8615cb43 3797 vFAIL("Can't do {n,m} with n > m");
c277df42 3798 if (!SIZE_ONLY) {
eb160463
GS
3799 ARG1_SET(ret, (U16)min);
3800 ARG2_SET(ret, (U16)max);
a687059c 3801 }
a687059c 3802
a0d0e21e 3803 goto nest_check;
a687059c 3804 }
a0d0e21e 3805 }
a687059c 3806
a0d0e21e
LW
3807 if (!ISMULT1(op)) {
3808 *flagp = flags;
a687059c 3809 return(ret);
a0d0e21e 3810 }
bb20fd44 3811
c277df42 3812#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
3813
3814 /* if this is reinstated, don't forget to put this back into perldiag:
3815
3816 =item Regexp *+ operand could be empty at {#} in regex m/%s/
3817
3818 (F) The part of the regexp subject to either the * or + quantifier
3819 could match an empty string. The {#} shows in the regular
3820 expression about where the problem was discovered.
3821
3822 */
3823
bb20fd44 3824 if (!(flags&HASWIDTH) && op != '?')
b45f050a 3825 vFAIL("Regexp *+ operand could be empty");
b81d288d 3826#endif
bb20fd44 3827
fac92740 3828 parse_start = RExC_parse;
830247a4 3829 nextchar(pRExC_state);
a0d0e21e 3830
821b33a5 3831 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
3832
3833 if (op == '*' && (flags&SIMPLE)) {
830247a4 3834 reginsert(pRExC_state, STAR, ret);
c277df42 3835 ret->flags = 0;
830247a4 3836 RExC_naughty += 4;
a0d0e21e
LW
3837 }
3838 else if (op == '*') {
3839 min = 0;
3840 goto do_curly;
a0ed51b3
LW
3841 }
3842 else if (op == '+' && (flags&SIMPLE)) {
830247a4 3843 reginsert(pRExC_state, PLUS, ret);
c277df42 3844 ret->flags = 0;
830247a4 3845 RExC_naughty += 3;
a0d0e21e
LW
3846 }
3847 else if (op == '+') {
3848 min = 1;
3849 goto do_curly;
a0ed51b3
LW
3850 }
3851 else if (op == '?') {
a0d0e21e
LW
3852 min = 0; max = 1;
3853 goto do_curly;
3854 }
3855 nest_check:
e476b1b5 3856 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
830247a4 3857 vWARN3(RExC_parse,
b45f050a 3858 "%.*s matches null string many times",
830247a4 3859 RExC_parse - origparse,
b45f050a 3860 origparse);
a0d0e21e
LW
3861 }
3862
830247a4
IZ
3863 if (*RExC_parse == '?') {
3864 nextchar(pRExC_state);
3865 reginsert(pRExC_state, MINMOD, ret);
3866 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 3867 }
830247a4
IZ
3868 if (ISMULT2(RExC_parse)) {
3869 RExC_parse++;
b45f050a
JF
3870 vFAIL("Nested quantifiers");
3871 }
a0d0e21e
LW
3872
3873 return(ret);
a687059c
LW
3874}
3875
3876/*
3877 - regatom - the lowest level
3878 *
3879 * Optimization: gobbles an entire sequence of ordinary characters so that
3880 * it can turn them into a single node, which is smaller to store and
3881 * faster to run. Backslashed characters are exceptions, each becoming a
3882 * separate node; the code is simpler that way and it's not worth fixing.
3883 *
b45f050a 3884 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
76e3520e 3885STATIC regnode *
830247a4 3886S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
a687059c 3887{
c277df42 3888 register regnode *ret = 0;
a0d0e21e 3889 I32 flags;
45948336 3890 char *parse_start = RExC_parse;
a0d0e21e
LW
3891
3892 *flagp = WORST; /* Tentatively. */
3893
3894tryagain:
830247a4 3895 switch (*RExC_parse) {
a0d0e21e 3896 case '^':
830247a4
IZ
3897 RExC_seen_zerolen++;
3898 nextchar(pRExC_state);
e2509266 3899 if (RExC_flags & PMf_MULTILINE)
830247a4 3900 ret = reg_node(pRExC_state, MBOL);
e2509266 3901 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3902 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3903 else
830247a4 3904 ret = reg_node(pRExC_state, BOL);
fac92740 3905 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3906 break;
3907 case '$':
830247a4 3908 nextchar(pRExC_state);
b81d288d 3909 if (*RExC_parse)
830247a4 3910 RExC_seen_zerolen++;
e2509266 3911 if (RExC_flags & PMf_MULTILINE)
830247a4 3912 ret = reg_node(pRExC_state, MEOL);
e2509266 3913 else if (RExC_flags & PMf_SINGLELINE)
830247a4 3914 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3915 else
830247a4 3916 ret = reg_node(pRExC_state, EOL);
fac92740 3917 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3918 break;
3919 case '.':
830247a4 3920 nextchar(pRExC_state);
e2509266 3921 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
3922 ret = reg_node(pRExC_state, SANY);
3923 else
3924 ret = reg_node(pRExC_state, REG_ANY);
3925 *flagp |= HASWIDTH|SIMPLE;
830247a4 3926 RExC_naughty++;
fac92740 3927 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
3928 break;
3929 case '[':
b45f050a 3930 {
830247a4 3931 char *oregcomp_parse = ++RExC_parse;
ffc61ed2 3932 ret = regclass(pRExC_state);
830247a4
IZ
3933 if (*RExC_parse != ']') {
3934 RExC_parse = oregcomp_parse;
b45f050a
JF
3935 vFAIL("Unmatched [");
3936 }
830247a4 3937 nextchar(pRExC_state);
a0d0e21e 3938 *flagp |= HASWIDTH|SIMPLE;
fac92740 3939 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 3940 break;
b45f050a 3941 }
a0d0e21e 3942 case '(':
830247a4
IZ
3943 nextchar(pRExC_state);
3944 ret = reg(pRExC_state, 1, &flags);
a0d0e21e 3945 if (ret == NULL) {
bf93d4cc 3946 if (flags & TRYAGAIN) {
830247a4 3947 if (RExC_parse == RExC_end) {
bf93d4cc
GS
3948 /* Make parent create an empty node if needed. */
3949 *flagp |= TRYAGAIN;
3950 return(NULL);
3951 }
a0d0e21e 3952 goto tryagain;
bf93d4cc 3953 }
a0d0e21e
LW
3954 return(NULL);
3955 }
c277df42 3956 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
3957 break;
3958 case '|':
3959 case ')':
3960 if (flags & TRYAGAIN) {
3961 *flagp |= TRYAGAIN;
3962 return NULL;
3963 }
b45f050a 3964 vFAIL("Internal urp");
a0d0e21e
LW
3965 /* Supposed to be caught earlier. */
3966 break;
85afd4ae 3967 case '{':
830247a4
IZ
3968 if (!regcurly(RExC_parse)) {
3969 RExC_parse++;
85afd4ae
CS
3970 goto defchar;
3971 }
3972 /* FALL THROUGH */
a0d0e21e
LW
3973 case '?':
3974 case '+':
3975 case '*':
830247a4 3976 RExC_parse++;
b45f050a 3977 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
3978 break;
3979 case '\\':
830247a4 3980 switch (*++RExC_parse) {
a0d0e21e 3981 case 'A':
830247a4
IZ
3982 RExC_seen_zerolen++;
3983 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 3984 *flagp |= SIMPLE;
830247a4 3985 nextchar(pRExC_state);
fac92740 3986 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3987 break;
3988 case 'G':
830247a4
IZ
3989 ret = reg_node(pRExC_state, GPOS);
3990 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 3991 *flagp |= SIMPLE;
830247a4 3992 nextchar(pRExC_state);
fac92740 3993 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
3994 break;
3995 case 'Z':
830247a4 3996 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 3997 *flagp |= SIMPLE;
a1917ab9 3998 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 3999 nextchar(pRExC_state);
a0d0e21e 4000 break;
b85d18e9 4001 case 'z':
830247a4 4002 ret = reg_node(pRExC_state, EOS);
b85d18e9 4003 *flagp |= SIMPLE;
830247a4
IZ
4004 RExC_seen_zerolen++; /* Do not optimize RE away */
4005 nextchar(pRExC_state);
fac92740 4006 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 4007 break;
4a2d328f 4008 case 'C':
f33976b4
DB
4009 ret = reg_node(pRExC_state, CANY);
4010 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 4011 *flagp |= HASWIDTH|SIMPLE;
830247a4 4012 nextchar(pRExC_state);
fac92740 4013 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
4014 break;
4015 case 'X':
830247a4 4016 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 4017 *flagp |= HASWIDTH;
830247a4 4018 nextchar(pRExC_state);
fac92740 4019 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 4020 break;
a0d0e21e 4021 case 'w':
eb160463 4022 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 4023 *flagp |= HASWIDTH|SIMPLE;
830247a4 4024 nextchar(pRExC_state);
fac92740 4025 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4026 break;
4027 case 'W':
eb160463 4028 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 4029 *flagp |= HASWIDTH|SIMPLE;
830247a4 4030 nextchar(pRExC_state);
fac92740 4031 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4032 break;
4033 case 'b':
830247a4
IZ
4034 RExC_seen_zerolen++;
4035 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4036 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 4037 *flagp |= SIMPLE;
830247a4 4038 nextchar(pRExC_state);
fac92740 4039 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4040 break;
4041 case 'B':
830247a4
IZ
4042 RExC_seen_zerolen++;
4043 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 4044 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 4045 *flagp |= SIMPLE;
830247a4 4046 nextchar(pRExC_state);
fac92740 4047 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4048 break;
4049 case 's':
eb160463 4050 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 4051 *flagp |= HASWIDTH|SIMPLE;
830247a4 4052 nextchar(pRExC_state);
fac92740 4053 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4054 break;
4055 case 'S':
eb160463 4056 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 4057 *flagp |= HASWIDTH|SIMPLE;
830247a4 4058 nextchar(pRExC_state);
fac92740 4059 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4060 break;
4061 case 'd':
ffc61ed2 4062 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 4063 *flagp |= HASWIDTH|SIMPLE;
830247a4 4064 nextchar(pRExC_state);
fac92740 4065 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
4066 break;
4067 case 'D':
ffc61ed2 4068 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 4069 *flagp |= HASWIDTH|SIMPLE;
830247a4 4070 nextchar(pRExC_state);
fac92740 4071 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 4072 break;
a14b48bc
LW
4073 case 'p':
4074 case 'P':
3568d838 4075 {
830247a4 4076 char* oldregxend = RExC_end;
ccb2c380 4077 char* parse_start = RExC_parse - 2;
a14b48bc 4078
830247a4 4079 if (RExC_parse[1] == '{') {
3568d838 4080 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
4081 RExC_end = strchr(RExC_parse, '}');
4082 if (!RExC_end) {
0da60cf5 4083 U8 c = (U8)*RExC_parse;
830247a4
IZ
4084 RExC_parse += 2;
4085 RExC_end = oldregxend;
0da60cf5 4086 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 4087 }
830247a4 4088 RExC_end++;
a14b48bc 4089 }
af6f566e 4090 else {
830247a4 4091 RExC_end = RExC_parse + 2;
af6f566e
HS
4092 if (RExC_end > oldregxend)
4093 RExC_end = oldregxend;
4094 }
830247a4 4095 RExC_parse--;
a14b48bc 4096
ffc61ed2 4097 ret = regclass(pRExC_state);
a14b48bc 4098
830247a4
IZ
4099 RExC_end = oldregxend;
4100 RExC_parse--;
ccb2c380
MP
4101
4102 Set_Node_Offset(ret, parse_start + 2);
4103 Set_Node_Cur_Length(ret);
830247a4 4104 nextchar(pRExC_state);
a14b48bc
LW
4105 *flagp |= HASWIDTH|SIMPLE;
4106 }
4107 break;
a0d0e21e
LW
4108 case 'n':
4109 case 'r':
4110 case 't':
4111 case 'f':
4112 case 'e':
4113 case 'a':
4114 case 'x':
4115 case 'c':
4116 case '0':
4117 goto defchar;
4118 case '1': case '2': case '3': case '4':
4119 case '5': case '6': case '7': case '8': case '9':
4120 {
1df70142 4121 const I32 num = atoi(RExC_parse);
a0d0e21e 4122
830247a4 4123 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
4124 goto defchar;
4125 else {
fac92740 4126 char * parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
4127 while (isDIGIT(*RExC_parse))
4128 RExC_parse++;
b45f050a 4129
eb160463 4130 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 4131 vFAIL("Reference to nonexistent group");
830247a4 4132 RExC_sawback = 1;
eb160463
GS
4133 ret = reganode(pRExC_state,
4134 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
4135 num);
a0d0e21e 4136 *flagp |= HASWIDTH;
2af232bd 4137
fac92740 4138 /* override incorrect value set in reganode MJD */
2af232bd 4139 Set_Node_Offset(ret, parse_start+1);
fac92740 4140 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
4141 RExC_parse--;
4142 nextchar(pRExC_state);
a0d0e21e
LW
4143 }
4144 }
4145 break;
4146 case '\0':
830247a4 4147 if (RExC_parse >= RExC_end)
b45f050a 4148 FAIL("Trailing \\");
a0d0e21e
LW
4149 /* FALL THROUGH */
4150 default:
c9f97d15
IZ
4151 /* Do not generate `unrecognized' warnings here, we fall
4152 back into the quick-grab loop below */
45948336 4153 parse_start--;
a0d0e21e
LW
4154 goto defchar;
4155 }
4156 break;
4633a7c4
LW
4157
4158 case '#':
e2509266 4159 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
4160 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
4161 if (RExC_parse < RExC_end)
4633a7c4
LW
4162 goto tryagain;
4163 }
4164 /* FALL THROUGH */
4165
a0d0e21e 4166 default: {
ba210ebe 4167 register STRLEN len;
58ae7d3f 4168 register UV ender;
a0d0e21e 4169 register char *p;
c277df42 4170 char *oldp, *s;
80aecb99 4171 STRLEN foldlen;
89ebb4a3 4172 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
4173
4174 parse_start = RExC_parse - 1;
a0d0e21e 4175
830247a4 4176 RExC_parse++;
a0d0e21e
LW
4177
4178 defchar:
58ae7d3f 4179 ender = 0;
eb160463
GS
4180 ret = reg_node(pRExC_state,
4181 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 4182 s = STRING(ret);
830247a4
IZ
4183 for (len = 0, p = RExC_parse - 1;
4184 len < 127 && p < RExC_end;
a0d0e21e
LW
4185 len++)
4186 {
4187 oldp = p;
5b5a24f7 4188
e2509266 4189 if (RExC_flags & PMf_EXTENDED)
830247a4 4190 p = regwhite(p, RExC_end);
a0d0e21e
LW
4191 switch (*p) {
4192 case '^':
4193 case '$':
4194 case '.':
4195 case '[':
4196 case '(':
4197 case ')':
4198 case '|':
4199 goto loopdone;
4200 case '\\':
4201 switch (*++p) {
4202 case 'A':
1ed8eac0
JF
4203 case 'C':
4204 case 'X':
a0d0e21e
LW
4205 case 'G':
4206 case 'Z':
b85d18e9 4207 case 'z':
a0d0e21e
LW
4208 case 'w':
4209 case 'W':
4210 case 'b':
4211 case 'B':
4212 case 's':
4213 case 'S':
4214 case 'd':
4215 case 'D':
a14b48bc
LW
4216 case 'p':
4217 case 'P':
a0d0e21e
LW
4218 --p;
4219 goto loopdone;
4220 case 'n':
4221 ender = '\n';
4222 p++;
a687059c 4223 break;
a0d0e21e
LW
4224 case 'r':
4225 ender = '\r';
4226 p++;
a687059c 4227 break;
a0d0e21e
LW
4228 case 't':
4229 ender = '\t';
4230 p++;
a687059c 4231 break;
a0d0e21e
LW
4232 case 'f':
4233 ender = '\f';
4234 p++;
a687059c 4235 break;
a0d0e21e 4236 case 'e':
c7f1f016 4237 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 4238 p++;
a687059c 4239 break;
a0d0e21e 4240 case 'a':
c7f1f016 4241 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 4242 p++;
a687059c 4243 break;
a0d0e21e 4244 case 'x':
a0ed51b3 4245 if (*++p == '{') {
1df70142 4246 char* const e = strchr(p, '}');
b81d288d 4247
b45f050a 4248 if (!e) {
830247a4 4249 RExC_parse = p + 1;
b45f050a
JF
4250 vFAIL("Missing right brace on \\x{}");
4251 }
de5f0749 4252 else {
a4c04bdc
NC
4253 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4254 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 4255 STRLEN numlen = e - p - 1;
53305cf1 4256 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
4257 if (ender > 0xff)
4258 RExC_utf8 = 1;
a0ed51b3
LW
4259 p = e + 1;
4260 }
a0ed51b3
LW
4261 }
4262 else {
a4c04bdc 4263 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 4264 STRLEN numlen = 2;
53305cf1 4265 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
4266 p += numlen;
4267 }
a687059c 4268 break;
a0d0e21e
LW
4269 case 'c':
4270 p++;
bbce6d69 4271 ender = UCHARAT(p++);
4272 ender = toCTRL(ender);
a687059c 4273 break;
a0d0e21e
LW
4274 case '0': case '1': case '2': case '3':case '4':
4275 case '5': case '6': case '7': case '8':case '9':
4276 if (*p == '0' ||
830247a4 4277 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 4278 I32 flags = 0;
1df70142 4279 STRLEN numlen = 3;
53305cf1 4280 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
4281 p += numlen;
4282 }
4283 else {
4284 --p;
4285 goto loopdone;
a687059c
LW
4286 }
4287 break;
a0d0e21e 4288 case '\0':
830247a4 4289 if (p >= RExC_end)
b45f050a 4290 FAIL("Trailing \\");
a687059c 4291 /* FALL THROUGH */
a0d0e21e 4292 default:
e476b1b5 4293 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
4193bef7 4294 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 4295 goto normal_default;
a0d0e21e
LW
4296 }
4297 break;
a687059c 4298 default:
a0ed51b3 4299 normal_default:
fd400ab9 4300 if (UTF8_IS_START(*p) && UTF) {
1df70142 4301 STRLEN numlen;
5e12f4fb 4302 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
ba210ebe 4303 &numlen, 0);
a0ed51b3
LW
4304 p += numlen;
4305 }
4306 else
4307 ender = *p++;
a0d0e21e 4308 break;
a687059c 4309 }
e2509266 4310 if (RExC_flags & PMf_EXTENDED)
830247a4 4311 p = regwhite(p, RExC_end);
60a8b682
JH
4312 if (UTF && FOLD) {
4313 /* Prime the casefolded buffer. */
ac7e0132 4314 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 4315 }
a0d0e21e
LW
4316 if (ISMULT2(p)) { /* Back off on ?+*. */
4317 if (len)
4318 p = oldp;
16ea2a2e 4319 else if (UTF) {
0ebc6274
JH
4320 STRLEN unilen;
4321
80aecb99 4322 if (FOLD) {
60a8b682 4323 /* Emit all the Unicode characters. */
1df70142 4324 STRLEN numlen;
80aecb99
JH
4325 for (foldbuf = tmpbuf;
4326 foldlen;
4327 foldlen -= numlen) {
4328 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4329 if (numlen > 0) {
0ebc6274
JH
4330 reguni(pRExC_state, ender, s, &unilen);
4331 s += unilen;
4332 len += unilen;
4333 /* In EBCDIC the numlen
4334 * and unilen can differ. */
9dc45d57 4335 foldbuf += numlen;
47654450
JH
4336 if (numlen >= foldlen)
4337 break;
9dc45d57
JH
4338 }
4339 else
4340 break; /* "Can't happen." */
80aecb99
JH
4341 }
4342 }
4343 else {
0ebc6274 4344 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4345 if (unilen > 0) {
0ebc6274
JH
4346 s += unilen;
4347 len += unilen;
9dc45d57 4348 }
80aecb99 4349 }
a0ed51b3 4350 }
a0d0e21e
LW
4351 else {
4352 len++;
eb160463 4353 REGC((char)ender, s++);
a0d0e21e
LW
4354 }
4355 break;
a687059c 4356 }
16ea2a2e 4357 if (UTF) {
0ebc6274
JH
4358 STRLEN unilen;
4359
80aecb99 4360 if (FOLD) {
60a8b682 4361 /* Emit all the Unicode characters. */
1df70142 4362 STRLEN numlen;
80aecb99
JH
4363 for (foldbuf = tmpbuf;
4364 foldlen;
4365 foldlen -= numlen) {
4366 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 4367 if (numlen > 0) {
0ebc6274
JH
4368 reguni(pRExC_state, ender, s, &unilen);
4369 len += unilen;
4370 s += unilen;
4371 /* In EBCDIC the numlen
4372 * and unilen can differ. */
9dc45d57 4373 foldbuf += numlen;
47654450
JH
4374 if (numlen >= foldlen)
4375 break;
9dc45d57
JH
4376 }
4377 else
4378 break;
80aecb99
JH
4379 }
4380 }
4381 else {
0ebc6274 4382 reguni(pRExC_state, ender, s, &unilen);
9ede7db1 4383 if (unilen > 0) {
0ebc6274
JH
4384 s += unilen;
4385 len += unilen;
9dc45d57 4386 }
80aecb99
JH
4387 }
4388 len--;
a0ed51b3
LW
4389 }
4390 else
eb160463 4391 REGC((char)ender, s++);
a0d0e21e
LW
4392 }
4393 loopdone:
830247a4 4394 RExC_parse = p - 1;
fac92740 4395 Set_Node_Cur_Length(ret); /* MJD */
830247a4 4396 nextchar(pRExC_state);
793db0cb
JH
4397 {
4398 /* len is STRLEN which is unsigned, need to copy to signed */
4399 IV iv = len;
4400 if (iv < 0)
4401 vFAIL("Internal disaster");
4402 }
a0d0e21e
LW
4403 if (len > 0)
4404 *flagp |= HASWIDTH;
090f7165 4405 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 4406 *flagp |= SIMPLE;
c277df42 4407 if (!SIZE_ONLY)
cd439c50
IZ
4408 STR_LEN(ret) = len;
4409 if (SIZE_ONLY)
830247a4 4410 RExC_size += STR_SZ(len);
cd439c50 4411 else
830247a4 4412 RExC_emit += STR_SZ(len);
a687059c 4413 }
a0d0e21e
LW
4414 break;
4415 }
a687059c 4416
60a8b682
JH
4417 /* If the encoding pragma is in effect recode the text of
4418 * any EXACT-kind nodes. */
22c54be3 4419 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
d0063567
DK
4420 STRLEN oldlen = STR_LEN(ret);
4421 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
4422
4423 if (RExC_utf8)
4424 SvUTF8_on(sv);
4425 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
4426 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
4427 const STRLEN newlen = SvCUR(sv);
d0063567
DK
4428
4429 if (SvUTF8(sv))
4430 RExC_utf8 = 1;
4431 if (!SIZE_ONLY) {
a3621e74
YO
4432 GET_RE_DEBUG_FLAGS_DECL;
4433 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
4434 (int)oldlen, STRING(ret),
4435 (int)newlen, s));
4436 Copy(s, STRING(ret), newlen, char);
4437 STR_LEN(ret) += newlen - oldlen;
4438 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
4439 } else
4440 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
4441 }
a72c7584
JH
4442 }
4443
a0d0e21e 4444 return(ret);
a687059c
LW
4445}
4446
873ef191 4447STATIC char *
504618e9 4448S_regwhite(pTHX_ char *p, const char *e)
5b5a24f7
CS
4449{
4450 while (p < e) {
4451 if (isSPACE(*p))
4452 ++p;
4453 else if (*p == '#') {
4454 do {
4455 p++;
4456 } while (p < e && *p != '\n');
4457 }
4458 else
4459 break;
4460 }
4461 return p;
4462}
4463
b8c5462f
JH
4464/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
4465 Character classes ([:foo:]) can also be negated ([:^foo:]).
4466 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
4467 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 4468 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
4469
4470#define POSIXCC_DONE(c) ((c) == ':')
4471#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
4472#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
4473
b8c5462f 4474STATIC I32
830247a4 4475S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5
JH
4476{
4477 char *posixcc = 0;
936ed897 4478 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 4479
830247a4 4480 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 4481 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 4482 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 4483 const char c = UCHARAT(RExC_parse);
830247a4 4484 char* s = RExC_parse++;
b81d288d 4485
9a86a77b 4486 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
4487 RExC_parse++;
4488 if (RExC_parse == RExC_end)
620e46c5 4489 /* Grandfather lone [:, [=, [. */
830247a4 4490 RExC_parse = s;
620e46c5 4491 else {
1df70142 4492 const char* t = RExC_parse++; /* skip over the c */
b8c5462f 4493
80916619
NC
4494 assert(*t == c);
4495
9a86a77b 4496 if (UCHARAT(RExC_parse) == ']') {
830247a4 4497 RExC_parse++; /* skip over the ending ] */
b8c5462f
JH
4498 posixcc = s + 1;
4499 if (*s == ':') {
1df70142
AL
4500 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
4501 const I32 skip = t - posixcc;
80916619
NC
4502
4503 /* Initially switch on the length of the name. */
4504 switch (skip) {
4505 case 4:
4506 if (memEQ(posixcc, "word", 4)) {
4507 /* this is not POSIX, this is the Perl \w */;
4508 namedclass
4509 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
4510 }
cc4319de 4511 break;
80916619
NC
4512 case 5:
4513 /* Names all of length 5. */
4514 /* alnum alpha ascii blank cntrl digit graph lower
4515 print punct space upper */
4516 /* Offset 4 gives the best switch position. */
4517 switch (posixcc[4]) {
4518 case 'a':
4519 if (memEQ(posixcc, "alph", 4)) {
4520 /* a */
4521 namedclass
4522 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
4523 }
4524 break;
4525 case 'e':
4526 if (memEQ(posixcc, "spac", 4)) {
4527 /* e */
4528 namedclass
4529 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
4530 }
4531 break;
4532 case 'h':
4533 if (memEQ(posixcc, "grap", 4)) {
4534 /* h */
4535 namedclass
4536 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
4537 }
4538 break;
4539 case 'i':
4540 if (memEQ(posixcc, "asci", 4)) {
4541 /* i */
4542 namedclass
4543 = complement ? ANYOF_NASCII : ANYOF_ASCII;
4544 }
4545 break;
4546 case 'k':
4547 if (memEQ(posixcc, "blan", 4)) {
4548 /* k */
4549 namedclass
4550 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
4551 }
4552 break;
4553 case 'l':
4554 if (memEQ(posixcc, "cntr", 4)) {
4555 /* l */
4556 namedclass
4557 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
4558 }
4559 break;
4560 case 'm':
4561 if (memEQ(posixcc, "alnu", 4)) {
4562 /* m */
4563 namedclass
4564 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
4565 }
4566 break;
4567 case 'r':
4568 if (memEQ(posixcc, "lowe", 4)) {
4569 /* r */
4570 namedclass
4571 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
4572 }
4573 if (memEQ(posixcc, "uppe", 4)) {
8fdec511 4574 /* r */
80916619
NC
4575 namedclass
4576 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
4577 }
4578 break;
4579 case 't':
4580 if (memEQ(posixcc, "digi", 4)) {
4581 /* t */
4582 namedclass
4583 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
4584 }
4585 if (memEQ(posixcc, "prin", 4)) {
8fdec511 4586 /* t */
80916619
NC
4587 namedclass
4588 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
4589 }
4590 if (memEQ(posixcc, "punc", 4)) {
4591 /* t */
4592 namedclass
4593 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
4594 }
4595 break;
b8c5462f
JH
4596 }
4597 break;
80916619
NC
4598 case 6:
4599 if (memEQ(posixcc, "xdigit", 6)) {
4600 namedclass
4601 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
4602 }
4603 break;
4604 }
80916619
NC
4605
4606 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
4607 {
4608 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
4609 t - s - 1, s + 1);
4610 }
80916619
NC
4611 assert (posixcc[skip] == ':');
4612 assert (posixcc[skip+1] == ']');
b45f050a 4613 } else if (!SIZE_ONLY) {
b8c5462f 4614 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 4615
830247a4 4616 /* adjust RExC_parse so the warning shows after
b45f050a 4617 the class closes */
9a86a77b 4618 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 4619 RExC_parse++;
b45f050a
JF
4620 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4621 }
b8c5462f
JH
4622 } else {
4623 /* Maternal grandfather:
4624 * "[:" ending in ":" but not in ":]" */
830247a4 4625 RExC_parse = s;
767d463e 4626 }
620e46c5
JH
4627 }
4628 }
4629
b8c5462f
JH
4630 return namedclass;
4631}
4632
4633STATIC void
830247a4 4634S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 4635{
b938889d 4636 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
4637 const char *s = RExC_parse;
4638 const char c = *s++;
b8c5462f
JH
4639
4640 while(*s && isALNUM(*s))
4641 s++;
4642 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
4643 if (ckWARN(WARN_REGEXP))
4644 vWARN3(s+2,
4645 "POSIX syntax [%c %c] belongs inside character classes",
4646 c, c);
b45f050a
JF
4647
4648 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 4649 if (POSIXCC_NOTYET(c)) {
830247a4 4650 /* adjust RExC_parse so the error shows after
b45f050a 4651 the class closes */
9a86a77b 4652 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
b45f050a
JF
4653 ;
4654 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
4655 }
b8c5462f
JH
4656 }
4657 }
620e46c5
JH
4658}
4659
76e3520e 4660STATIC regnode *
830247a4 4661S_regclass(pTHX_ RExC_state_t *pRExC_state)
a687059c 4662{
ffc61ed2 4663 register UV value;
9a86a77b 4664 register UV nextvalue;
3568d838 4665 register IV prevvalue = OOB_UNICODE;
ffc61ed2 4666 register IV range = 0;
c277df42 4667 register regnode *ret;
ba210ebe 4668 STRLEN numlen;
ffc61ed2 4669 IV namedclass;
9c5ffd7c 4670 char *rangebegin = 0;
936ed897 4671 bool need_class = 0;
9c5ffd7c 4672 SV *listsv = Nullsv;
ffc61ed2
JH
4673 register char *e;
4674 UV n;
9e55ce06
JH
4675 bool optimize_invert = TRUE;
4676 AV* unicode_alternate = 0;
1b2d223b
JH
4677#ifdef EBCDIC
4678 UV literal_endpoint = 0;
4679#endif
ffc61ed2
JH
4680
4681 ret = reganode(pRExC_state, ANYOF, 0);
4682
4683 if (!SIZE_ONLY)
4684 ANYOF_FLAGS(ret) = 0;
4685
9a86a77b 4686 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
4687 RExC_naughty++;
4688 RExC_parse++;
4689 if (!SIZE_ONLY)
4690 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
4691 }
a0d0e21e 4692
936ed897 4693 if (SIZE_ONLY)
830247a4 4694 RExC_size += ANYOF_SKIP;
936ed897 4695 else {
830247a4 4696 RExC_emit += ANYOF_SKIP;
936ed897
IZ
4697 if (FOLD)
4698 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
4699 if (LOC)
4700 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2
JH
4701 ANYOF_BITMAP_ZERO(ret);
4702 listsv = newSVpvn("# comment\n", 10);
a0d0e21e 4703 }
b8c5462f 4704
9a86a77b
JH
4705 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4706
b938889d 4707 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 4708 checkposixcc(pRExC_state);
b8c5462f 4709
f064b6ad
HS
4710 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
4711 if (UCHARAT(RExC_parse) == ']')
4712 goto charclassloop;
ffc61ed2 4713
9a86a77b 4714 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
4715
4716 charclassloop:
4717
4718 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
4719
73b437c8 4720 if (!range)
830247a4 4721 rangebegin = RExC_parse;
ffc61ed2 4722 if (UTF) {
5e12f4fb 4723 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838
JH
4724 RExC_end - RExC_parse,
4725 &numlen, 0);
ffc61ed2
JH
4726 RExC_parse += numlen;
4727 }
4728 else
4729 value = UCHARAT(RExC_parse++);
9a86a77b
JH
4730 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
4731 if (value == '[' && POSIXCC(nextvalue))
830247a4 4732 namedclass = regpposixcc(pRExC_state, value);
620e46c5 4733 else if (value == '\\') {
ffc61ed2 4734 if (UTF) {
5e12f4fb 4735 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2
JH
4736 RExC_end - RExC_parse,
4737 &numlen, 0);
4738 RExC_parse += numlen;
4739 }
4740 else
4741 value = UCHARAT(RExC_parse++);
470c3474 4742 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 4743 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
4744 * be a problem later if we want switch on Unicode.
4745 * A similar issue a little bit later when switching on
4746 * namedclass. --jhi */
ffc61ed2 4747 switch ((I32)value) {
b8c5462f
JH
4748 case 'w': namedclass = ANYOF_ALNUM; break;
4749 case 'W': namedclass = ANYOF_NALNUM; break;
4750 case 's': namedclass = ANYOF_SPACE; break;
4751 case 'S': namedclass = ANYOF_NSPACE; break;
4752 case 'd': namedclass = ANYOF_DIGIT; break;
4753 case 'D': namedclass = ANYOF_NDIGIT; break;
ffc61ed2
JH
4754 case 'p':
4755 case 'P':
af6f566e 4756 if (RExC_parse >= RExC_end)
2a4859cd 4757 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 4758 if (*RExC_parse == '{') {
1df70142 4759 const U8 c = (U8)value;
ffc61ed2
JH
4760 e = strchr(RExC_parse++, '}');
4761 if (!e)
0da60cf5 4762 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
4763 while (isSPACE(UCHARAT(RExC_parse)))
4764 RExC_parse++;
4765 if (e == RExC_parse)
0da60cf5 4766 vFAIL2("Empty \\%c{}", c);
ffc61ed2 4767 n = e - RExC_parse;
ab13f0c7
JH
4768 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
4769 n--;
ffc61ed2
JH
4770 }
4771 else {
4772 e = RExC_parse;
4773 n = 1;
4774 }
4775 if (!SIZE_ONLY) {
ab13f0c7
JH
4776 if (UCHARAT(RExC_parse) == '^') {
4777 RExC_parse++;
4778 n--;
4779 value = value == 'p' ? 'P' : 'p'; /* toggle */
4780 while (isSPACE(UCHARAT(RExC_parse))) {
4781 RExC_parse++;
4782 n--;
4783 }
4784 }
ffc61ed2 4785 if (value == 'p')
ab13f0c7
JH
4786 Perl_sv_catpvf(aTHX_ listsv,
4787 "+utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2 4788 else
ab13f0c7
JH
4789 Perl_sv_catpvf(aTHX_ listsv,
4790 "!utf8::%.*s\n", (int)n, RExC_parse);
ffc61ed2
JH
4791 }
4792 RExC_parse = e + 1;
4793 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2
JP
4794 namedclass = ANYOF_MAX; /* no official name, but it's named */
4795 break;
b8c5462f
JH
4796 case 'n': value = '\n'; break;
4797 case 'r': value = '\r'; break;
4798 case 't': value = '\t'; break;
4799 case 'f': value = '\f'; break;
4800 case 'b': value = '\b'; break;
c7f1f016
NIS
4801 case 'e': value = ASCII_TO_NATIVE('\033');break;
4802 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 4803 case 'x':
ffc61ed2 4804 if (*RExC_parse == '{') {
a4c04bdc
NC
4805 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4806 | PERL_SCAN_DISALLOW_PREFIX;
ffc61ed2 4807 e = strchr(RExC_parse++, '}');
b81d288d 4808 if (!e)
ffc61ed2 4809 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
4810
4811 numlen = e - RExC_parse;
4812 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4813 RExC_parse = e + 1;
4814 }
4815 else {
a4c04bdc 4816 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
4817 numlen = 2;
4818 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
4819 RExC_parse += numlen;
4820 }
b8c5462f
JH
4821 break;
4822 case 'c':
830247a4 4823 value = UCHARAT(RExC_parse++);
b8c5462f
JH
4824 value = toCTRL(value);
4825 break;
4826 case '0': case '1': case '2': case '3': case '4':
4827 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
4828 {
4829 I32 flags = 0;
4830 numlen = 3;
4831 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 4832 RExC_parse += numlen;
b8c5462f 4833 break;
53305cf1 4834 }
1028017a 4835 default:
e476b1b5 4836 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
ffc61ed2
JH
4837 vWARN2(RExC_parse,
4838 "Unrecognized escape \\%c in character class passed through",
4839 (int)value);
1028017a 4840 break;
b8c5462f 4841 }
ffc61ed2 4842 } /* end of \blah */
1b2d223b
JH
4843#ifdef EBCDIC
4844 else
4845 literal_endpoint++;
4846#endif
ffc61ed2
JH
4847
4848 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
4849
4850 if (!SIZE_ONLY && !need_class)
936ed897 4851 ANYOF_CLASS_ZERO(ret);
ffc61ed2 4852
936ed897 4853 need_class = 1;
ffc61ed2
JH
4854
4855 /* a bad range like a-\d, a-[:digit:] ? */
4856 if (range) {
73b437c8 4857 if (!SIZE_ONLY) {
e476b1b5 4858 if (ckWARN(WARN_REGEXP))
830247a4 4859 vWARN4(RExC_parse,
b45f050a 4860 "False [] range \"%*.*s\"",
830247a4
IZ
4861 RExC_parse - rangebegin,
4862 RExC_parse - rangebegin,
b45f050a 4863 rangebegin);
3568d838
JH
4864 if (prevvalue < 256) {
4865 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
4866 ANYOF_BITMAP_SET(ret, '-');
4867 }
4868 else {
4869 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4870 Perl_sv_catpvf(aTHX_ listsv,
3568d838 4871 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 4872 }
b8c5462f 4873 }
ffc61ed2
JH
4874
4875 range = 0; /* this was not a true range */
73b437c8 4876 }
ffc61ed2 4877
73b437c8 4878 if (!SIZE_ONLY) {
c49a72a9
NC
4879 const char *what = NULL;
4880 char yesno = 0;
4881
3568d838
JH
4882 if (namedclass > OOB_NAMEDCLASS)
4883 optimize_invert = FALSE;
e2962f66
JH
4884 /* Possible truncation here but in some 64-bit environments
4885 * the compiler gets heartburn about switch on 64-bit values.
4886 * A similar issue a little earlier when switching on value.
98f323fa 4887 * --jhi */
e2962f66 4888 switch ((I32)namedclass) {
73b437c8
JH
4889 case ANYOF_ALNUM:
4890 if (LOC)
936ed897 4891 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
4892 else {
4893 for (value = 0; value < 256; value++)
4894 if (isALNUM(value))
936ed897 4895 ANYOF_BITMAP_SET(ret, value);
73b437c8 4896 }
c49a72a9
NC
4897 yesno = '+';
4898 what = "Word";
73b437c8
JH
4899 break;
4900 case ANYOF_NALNUM:
4901 if (LOC)
936ed897 4902 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
4903 else {
4904 for (value = 0; value < 256; value++)
4905 if (!isALNUM(value))
936ed897 4906 ANYOF_BITMAP_SET(ret, value);
73b437c8 4907 }
c49a72a9
NC
4908 yesno = '!';
4909 what = "Word";
73b437c8 4910 break;
ffc61ed2 4911 case ANYOF_ALNUMC:
73b437c8 4912 if (LOC)
ffc61ed2 4913 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
4914 else {
4915 for (value = 0; value < 256; value++)
ffc61ed2 4916 if (isALNUMC(value))
936ed897 4917 ANYOF_BITMAP_SET(ret, value);
73b437c8 4918 }
c49a72a9
NC
4919 yesno = '+';
4920 what = "Alnum";
73b437c8
JH
4921 break;
4922 case ANYOF_NALNUMC:
4923 if (LOC)
936ed897 4924 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
4925 else {
4926 for (value = 0; value < 256; value++)
4927 if (!isALNUMC(value))
936ed897 4928 ANYOF_BITMAP_SET(ret, value);
73b437c8 4929 }
c49a72a9
NC
4930 yesno = '!';
4931 what = "Alnum";
73b437c8
JH
4932 break;
4933 case ANYOF_ALPHA:
4934 if (LOC)
936ed897 4935 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
4936 else {
4937 for (value = 0; value < 256; value++)
4938 if (isALPHA(value))
936ed897 4939 ANYOF_BITMAP_SET(ret, value);
73b437c8 4940 }
c49a72a9
NC
4941 yesno = '+';
4942 what = "Alpha";
73b437c8
JH
4943 break;
4944 case ANYOF_NALPHA:
4945 if (LOC)
936ed897 4946 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
4947 else {
4948 for (value = 0; value < 256; value++)
4949 if (!isALPHA(value))
936ed897 4950 ANYOF_BITMAP_SET(ret, value);
73b437c8 4951 }
c49a72a9
NC
4952 yesno = '!';
4953 what = "Alpha";
73b437c8
JH
4954 break;
4955 case ANYOF_ASCII:
4956 if (LOC)
936ed897 4957 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 4958 else {
c7f1f016 4959#ifndef EBCDIC
1ba5c669
JH
4960 for (value = 0; value < 128; value++)
4961 ANYOF_BITMAP_SET(ret, value);
4962#else /* EBCDIC */
ffbc6a93 4963 for (value = 0; value < 256; value++) {
3a3c4447
JH
4964 if (isASCII(value))
4965 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4966 }
1ba5c669 4967#endif /* EBCDIC */
73b437c8 4968 }
c49a72a9
NC
4969 yesno = '+';
4970 what = "ASCII";
73b437c8
JH
4971 break;
4972 case ANYOF_NASCII:
4973 if (LOC)
936ed897 4974 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 4975 else {
c7f1f016 4976#ifndef EBCDIC
1ba5c669
JH
4977 for (value = 128; value < 256; value++)
4978 ANYOF_BITMAP_SET(ret, value);
4979#else /* EBCDIC */
ffbc6a93 4980 for (value = 0; value < 256; value++) {
3a3c4447
JH
4981 if (!isASCII(value))
4982 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 4983 }
1ba5c669 4984#endif /* EBCDIC */
73b437c8 4985 }
c49a72a9
NC
4986 yesno = '!';
4987 what = "ASCII";
73b437c8 4988 break;
aaa51d5e
JF
4989 case ANYOF_BLANK:
4990 if (LOC)
4991 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
4992 else {
4993 for (value = 0; value < 256; value++)
4994 if (isBLANK(value))
4995 ANYOF_BITMAP_SET(ret, value);
4996 }
c49a72a9
NC
4997 yesno = '+';
4998 what = "Blank";
aaa51d5e
JF
4999 break;
5000 case ANYOF_NBLANK:
5001 if (LOC)
5002 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
5003 else {
5004 for (value = 0; value < 256; value++)
5005 if (!isBLANK(value))
5006 ANYOF_BITMAP_SET(ret, value);
5007 }
c49a72a9
NC
5008 yesno = '!';
5009 what = "Blank";
aaa51d5e 5010 break;
73b437c8
JH
5011 case ANYOF_CNTRL:
5012 if (LOC)
936ed897 5013 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
5014 else {
5015 for (value = 0; value < 256; value++)
5016 if (isCNTRL(value))
936ed897 5017 ANYOF_BITMAP_SET(ret, value);
73b437c8 5018 }
c49a72a9
NC
5019 yesno = '+';
5020 what = "Cntrl";
73b437c8
JH
5021 break;
5022 case ANYOF_NCNTRL:
5023 if (LOC)
936ed897 5024 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
5025 else {
5026 for (value = 0; value < 256; value++)
5027 if (!isCNTRL(value))
936ed897 5028 ANYOF_BITMAP_SET(ret, value);
73b437c8 5029 }
c49a72a9
NC
5030 yesno = '!';
5031 what = "Cntrl";
ffc61ed2
JH
5032 break;
5033 case ANYOF_DIGIT:
5034 if (LOC)
5035 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
5036 else {
5037 /* consecutive digits assumed */
5038 for (value = '0'; value <= '9'; value++)
5039 ANYOF_BITMAP_SET(ret, value);
5040 }
c49a72a9
NC
5041 yesno = '+';
5042 what = "Digit";
ffc61ed2
JH
5043 break;
5044 case ANYOF_NDIGIT:
5045 if (LOC)
5046 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
5047 else {
5048 /* consecutive digits assumed */
5049 for (value = 0; value < '0'; value++)
5050 ANYOF_BITMAP_SET(ret, value);
5051 for (value = '9' + 1; value < 256; value++)
5052 ANYOF_BITMAP_SET(ret, value);
5053 }
c49a72a9
NC
5054 yesno = '!';
5055 what = "Digit";
73b437c8
JH
5056 break;
5057 case ANYOF_GRAPH:
5058 if (LOC)
936ed897 5059 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
5060 else {
5061 for (value = 0; value < 256; value++)
5062 if (isGRAPH(value))
936ed897 5063 ANYOF_BITMAP_SET(ret, value);
73b437c8 5064 }
c49a72a9
NC
5065 yesno = '+';
5066 what = "Graph";
73b437c8
JH
5067 break;
5068 case ANYOF_NGRAPH:
5069 if (LOC)
936ed897 5070 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
5071 else {
5072 for (value = 0; value < 256; value++)
5073 if (!isGRAPH(value))
936ed897 5074 ANYOF_BITMAP_SET(ret, value);
73b437c8 5075 }
c49a72a9
NC
5076 yesno = '!';
5077 what = "Graph";
73b437c8
JH
5078 break;
5079 case ANYOF_LOWER:
5080 if (LOC)
936ed897 5081 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
5082 else {
5083 for (value = 0; value < 256; value++)
5084 if (isLOWER(value))
936ed897 5085 ANYOF_BITMAP_SET(ret, value);
73b437c8 5086 }
c49a72a9
NC
5087 yesno = '+';
5088 what = "Lower";
73b437c8
JH
5089 break;
5090 case ANYOF_NLOWER:
5091 if (LOC)
936ed897 5092 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
5093 else {
5094 for (value = 0; value < 256; value++)
5095 if (!isLOWER(value))
936ed897 5096 ANYOF_BITMAP_SET(ret, value);
73b437c8 5097 }
c49a72a9
NC
5098 yesno = '!';
5099 what = "Lower";
73b437c8
JH
5100 break;
5101 case ANYOF_PRINT:
5102 if (LOC)
936ed897 5103 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
5104 else {
5105 for (value = 0; value < 256; value++)
5106 if (isPRINT(value))
936ed897 5107 ANYOF_BITMAP_SET(ret, value);
73b437c8 5108 }
c49a72a9
NC
5109 yesno = '+';
5110 what = "Print";
73b437c8
JH
5111 break;
5112 case ANYOF_NPRINT:
5113 if (LOC)
936ed897 5114 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
5115 else {
5116 for (value = 0; value < 256; value++)
5117 if (!isPRINT(value))
936ed897 5118 ANYOF_BITMAP_SET(ret, value);
73b437c8 5119 }
c49a72a9
NC
5120 yesno = '!';
5121 what = "Print";
73b437c8 5122 break;
aaa51d5e
JF
5123 case ANYOF_PSXSPC:
5124 if (LOC)
5125 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
5126 else {
5127 for (value = 0; value < 256; value++)
5128 if (isPSXSPC(value))
5129 ANYOF_BITMAP_SET(ret, value);
5130 }
c49a72a9
NC
5131 yesno = '+';
5132 what = "Space";
aaa51d5e
JF
5133 break;
5134 case ANYOF_NPSXSPC:
5135 if (LOC)
5136 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
5137 else {
5138 for (value = 0; value < 256; value++)
5139 if (!isPSXSPC(value))
5140 ANYOF_BITMAP_SET(ret, value);
5141 }
c49a72a9
NC
5142 yesno = '!';
5143 what = "Space";
aaa51d5e 5144 break;
73b437c8
JH
5145 case ANYOF_PUNCT:
5146 if (LOC)
936ed897 5147 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
5148 else {
5149 for (value = 0; value < 256; value++)
5150 if (isPUNCT(value))
936ed897 5151 ANYOF_BITMAP_SET(ret, value);
73b437c8 5152 }
c49a72a9
NC
5153 yesno = '+';
5154 what = "Punct";
73b437c8
JH
5155 break;
5156 case ANYOF_NPUNCT:
5157 if (LOC)
936ed897 5158 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
5159 else {
5160 for (value = 0; value < 256; value++)
5161 if (!isPUNCT(value))
936ed897 5162 ANYOF_BITMAP_SET(ret, value);
73b437c8 5163 }
c49a72a9
NC
5164 yesno = '!';
5165 what = "Punct";
ffc61ed2
JH
5166 break;
5167 case ANYOF_SPACE:
5168 if (LOC)
5169 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
5170 else {
5171 for (value = 0; value < 256; value++)
5172 if (isSPACE(value))
5173 ANYOF_BITMAP_SET(ret, value);
5174 }
c49a72a9
NC
5175 yesno = '+';
5176 what = "SpacePerl";
ffc61ed2
JH
5177 break;
5178 case ANYOF_NSPACE:
5179 if (LOC)
5180 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
5181 else {
5182 for (value = 0; value < 256; value++)
5183 if (!isSPACE(value))
5184 ANYOF_BITMAP_SET(ret, value);
5185 }
c49a72a9
NC
5186 yesno = '!';
5187 what = "SpacePerl";
73b437c8
JH
5188 break;
5189 case ANYOF_UPPER:
5190 if (LOC)
936ed897 5191 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
5192 else {
5193 for (value = 0; value < 256; value++)
5194 if (isUPPER(value))
936ed897 5195 ANYOF_BITMAP_SET(ret, value);
73b437c8 5196 }
c49a72a9
NC
5197 yesno = '+';
5198 what = "Upper";
73b437c8
JH
5199 break;
5200 case ANYOF_NUPPER:
5201 if (LOC)
936ed897 5202 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
5203 else {
5204 for (value = 0; value < 256; value++)
5205 if (!isUPPER(value))
936ed897 5206 ANYOF_BITMAP_SET(ret, value);
73b437c8 5207 }
c49a72a9
NC
5208 yesno = '!';
5209 what = "Upper";
73b437c8
JH
5210 break;
5211 case ANYOF_XDIGIT:
5212 if (LOC)
936ed897 5213 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
5214 else {
5215 for (value = 0; value < 256; value++)
5216 if (isXDIGIT(value))
936ed897 5217 ANYOF_BITMAP_SET(ret, value);
73b437c8 5218 }
c49a72a9
NC
5219 yesno = '+';
5220 what = "XDigit";
73b437c8
JH
5221 break;
5222 case ANYOF_NXDIGIT:
5223 if (LOC)
936ed897 5224 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
5225 else {
5226 for (value = 0; value < 256; value++)
5227 if (!isXDIGIT(value))
936ed897 5228 ANYOF_BITMAP_SET(ret, value);
73b437c8 5229 }
c49a72a9
NC
5230 yesno = '!';
5231 what = "XDigit";
73b437c8 5232 break;
f81125e2
JP
5233 case ANYOF_MAX:
5234 /* this is to handle \p and \P */
5235 break;
73b437c8 5236 default:
b45f050a 5237 vFAIL("Invalid [::] class");
73b437c8 5238 break;
b8c5462f 5239 }
c49a72a9
NC
5240 if (what) {
5241 /* Strings such as "+utf8::isWord\n" */
5242 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
5243 }
b8c5462f 5244 if (LOC)
936ed897 5245 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 5246 continue;
a0d0e21e 5247 }
ffc61ed2
JH
5248 } /* end of namedclass \blah */
5249
a0d0e21e 5250 if (range) {
eb160463 5251 if (prevvalue > (IV)value) /* b-a */ {
b45f050a 5252 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
830247a4
IZ
5253 RExC_parse - rangebegin,
5254 RExC_parse - rangebegin,
b45f050a 5255 rangebegin);
3568d838 5256 range = 0; /* not a valid range */
73b437c8 5257 }
a0d0e21e
LW
5258 }
5259 else {
3568d838 5260 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
5261 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
5262 RExC_parse[1] != ']') {
5263 RExC_parse++;
ffc61ed2
JH
5264
5265 /* a bad range like \w-, [:word:]- ? */
5266 if (namedclass > OOB_NAMEDCLASS) {
e476b1b5 5267 if (ckWARN(WARN_REGEXP))
830247a4 5268 vWARN4(RExC_parse,
b45f050a 5269 "False [] range \"%*.*s\"",
830247a4
IZ
5270 RExC_parse - rangebegin,
5271 RExC_parse - rangebegin,
b45f050a 5272 rangebegin);
73b437c8 5273 if (!SIZE_ONLY)
936ed897 5274 ANYOF_BITMAP_SET(ret, '-');
73b437c8 5275 } else
ffc61ed2
JH
5276 range = 1; /* yeah, it's a range! */
5277 continue; /* but do it the next time */
a0d0e21e 5278 }
a687059c 5279 }
ffc61ed2 5280
93733859 5281 /* now is the next time */
ae5c130c 5282 if (!SIZE_ONLY) {
3568d838
JH
5283 IV i;
5284
5285 if (prevvalue < 256) {
1df70142 5286 const IV ceilvalue = value < 256 ? value : 255;
3568d838
JH
5287
5288#ifdef EBCDIC
1b2d223b
JH
5289 /* In EBCDIC [\x89-\x91] should include
5290 * the \x8e but [i-j] should not. */
5291 if (literal_endpoint == 2 &&
5292 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
5293 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 5294 {
3568d838
JH
5295 if (isLOWER(prevvalue)) {
5296 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5297 if (isLOWER(i))
5298 ANYOF_BITMAP_SET(ret, i);
5299 } else {
3568d838 5300 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
5301 if (isUPPER(i))
5302 ANYOF_BITMAP_SET(ret, i);
5303 }
8ada0baa 5304 }
ffc61ed2 5305 else
8ada0baa 5306#endif
a5961de5
JH
5307 for (i = prevvalue; i <= ceilvalue; i++)
5308 ANYOF_BITMAP_SET(ret, i);
3568d838 5309 }
a5961de5 5310 if (value > 255 || UTF) {
1df70142
AL
5311 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
5312 const UV natvalue = NATIVE_TO_UNI(value);
b08decb7 5313
ffc61ed2 5314 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 5315 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 5316 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
5317 prevnatvalue, natvalue);
5318 }
5319 else if (prevnatvalue == natvalue) {
5320 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 5321 if (FOLD) {
89ebb4a3 5322 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 5323 STRLEN foldlen;
1df70142 5324 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 5325
c840d2a2
JH
5326 /* If folding and foldable and a single
5327 * character, insert also the folded version
5328 * to the charclass. */
9e55ce06 5329 if (f != value) {
eb160463 5330 if (foldlen == (STRLEN)UNISKIP(f))
9e55ce06
JH
5331 Perl_sv_catpvf(aTHX_ listsv,
5332 "%04"UVxf"\n", f);
5333 else {
5334 /* Any multicharacter foldings
5335 * require the following transform:
5336 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
5337 * where E folds into "pq" and F folds
5338 * into "rst", all other characters
5339 * fold to single characters. We save
5340 * away these multicharacter foldings,
5341 * to be later saved as part of the
5342 * additional "s" data. */
5343 SV *sv;
5344
5345 if (!unicode_alternate)
5346 unicode_alternate = newAV();
5347 sv = newSVpvn((char*)foldbuf, foldlen);
5348 SvUTF8_on(sv);
5349 av_push(unicode_alternate, sv);
5350 }
5351 }
254ba52a 5352
60a8b682
JH
5353 /* If folding and the value is one of the Greek
5354 * sigmas insert a few more sigmas to make the
5355 * folding rules of the sigmas to work right.
5356 * Note that not all the possible combinations
5357 * are handled here: some of them are handled
9e55ce06
JH
5358 * by the standard folding rules, and some of
5359 * them (literal or EXACTF cases) are handled
5360 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
5361 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
5362 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5363 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 5364 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5365 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5366 }
5367 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
5368 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 5369 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
5370 }
5371 }
ffc61ed2 5372 }
1b2d223b
JH
5373#ifdef EBCDIC
5374 literal_endpoint = 0;
5375#endif
8ada0baa 5376 }
ffc61ed2
JH
5377
5378 range = 0; /* this range (if it was one) is done now */
a0d0e21e 5379 }
ffc61ed2 5380
936ed897 5381 if (need_class) {
4f66b38d 5382 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 5383 if (SIZE_ONLY)
830247a4 5384 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 5385 else
830247a4 5386 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 5387 }
ffc61ed2 5388
ae5c130c 5389 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
b8c5462f 5390 if (!SIZE_ONLY &&
ffc61ed2 5391 /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
5392 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
5393 ) {
a0ed51b3 5394 for (value = 0; value < 256; ++value) {
936ed897 5395 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 5396 UV fold = PL_fold[value];
ffc61ed2
JH
5397
5398 if (fold != value)
5399 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
5400 }
5401 }
936ed897 5402 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 5403 }
ffc61ed2 5404
ae5c130c 5405 /* optimize inverted simple patterns (e.g. [^a-z]) */
3568d838 5406 if (!SIZE_ONLY && optimize_invert &&
ffc61ed2
JH
5407 /* If the only flag is inversion. */
5408 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 5409 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 5410 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 5411 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 5412 }
a0d0e21e 5413
b81d288d 5414 if (!SIZE_ONLY) {
fde631ed 5415 AV *av = newAV();
ffc61ed2
JH
5416 SV *rv;
5417
9e55ce06 5418 /* The 0th element stores the character class description
6a0407ee 5419 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
5420 * to initialize the appropriate swash (which gets stored in
5421 * the 1st element), and also useful for dumping the regnode.
5422 * The 2nd element stores the multicharacter foldings,
6a0407ee 5423 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
5424 av_store(av, 0, listsv);
5425 av_store(av, 1, NULL);
9e55ce06 5426 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 5427 rv = newRV_noinc((SV*)av);
19860706 5428 n = add_data(pRExC_state, 1, "s");
830247a4 5429 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 5430 ARG_SET(ret, n);
a0ed51b3
LW
5431 }
5432
5433 return ret;
5434}
5435
76e3520e 5436STATIC char*
830247a4 5437S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 5438{
830247a4 5439 char* retval = RExC_parse++;
a0d0e21e 5440
4633a7c4 5441 for (;;) {
830247a4
IZ
5442 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
5443 RExC_parse[2] == '#') {
e994fd66
AE
5444 while (*RExC_parse != ')') {
5445 if (RExC_parse == RExC_end)
5446 FAIL("Sequence (?#... not terminated");
830247a4 5447 RExC_parse++;
e994fd66 5448 }
830247a4 5449 RExC_parse++;
4633a7c4
LW
5450 continue;
5451 }
e2509266 5452 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
5453 if (isSPACE(*RExC_parse)) {
5454 RExC_parse++;
748a9306
LW
5455 continue;
5456 }
830247a4 5457 else if (*RExC_parse == '#') {
e994fd66
AE
5458 while (RExC_parse < RExC_end)
5459 if (*RExC_parse++ == '\n') break;
748a9306
LW
5460 continue;
5461 }
748a9306 5462 }
4633a7c4 5463 return retval;
a0d0e21e 5464 }
a687059c
LW
5465}
5466
5467/*
c277df42 5468- reg_node - emit a node
a0d0e21e 5469*/
76e3520e 5470STATIC regnode * /* Location. */
830247a4 5471S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 5472{
c277df42 5473 register regnode *ptr;
504618e9 5474 regnode * const ret = RExC_emit;
a687059c 5475
c277df42 5476 if (SIZE_ONLY) {
830247a4
IZ
5477 SIZE_ALIGN(RExC_size);
5478 RExC_size += 1;
a0d0e21e
LW
5479 return(ret);
5480 }
a687059c 5481
c277df42 5482 NODE_ALIGN_FILL(ret);
a0d0e21e 5483 ptr = ret;
c277df42 5484 FILL_ADVANCE_NODE(ptr, op);
fac92740 5485 if (RExC_offsets) { /* MJD */
ccb2c380 5486 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
fac92740
MJD
5487 "reg_node", __LINE__,
5488 reg_name[op],
5489 RExC_emit - RExC_emit_start > RExC_offsets[0]
5490 ? "Overwriting end of array!\n" : "OK",
5491 RExC_emit - RExC_emit_start,
5492 RExC_parse - RExC_start,
5493 RExC_offsets[0]));
ccb2c380 5494 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740
MJD
5495 }
5496
830247a4 5497 RExC_emit = ptr;
a687059c 5498
a0d0e21e 5499 return(ret);
a687059c
LW
5500}
5501
5502/*
a0d0e21e
LW
5503- reganode - emit a node with an argument
5504*/
76e3520e 5505STATIC regnode * /* Location. */
830247a4 5506S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 5507{
c277df42 5508 register regnode *ptr;
504618e9 5509 regnode * const ret = RExC_emit;
fe14fcc3 5510
c277df42 5511 if (SIZE_ONLY) {
830247a4
IZ
5512 SIZE_ALIGN(RExC_size);
5513 RExC_size += 2;
a0d0e21e
LW
5514 return(ret);
5515 }
fe14fcc3 5516
c277df42 5517 NODE_ALIGN_FILL(ret);
a0d0e21e 5518 ptr = ret;
c277df42 5519 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 5520 if (RExC_offsets) { /* MJD */
ccb2c380 5521 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5522 "reganode",
ccb2c380
MP
5523 __LINE__,
5524 reg_name[op],
fac92740
MJD
5525 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
5526 "Overwriting end of array!\n" : "OK",
5527 RExC_emit - RExC_emit_start,
5528 RExC_parse - RExC_start,
5529 RExC_offsets[0]));
ccb2c380 5530 Set_Cur_Node_Offset;
fac92740
MJD
5531 }
5532
830247a4 5533 RExC_emit = ptr;
fe14fcc3 5534
a0d0e21e 5535 return(ret);
fe14fcc3
LW
5536}
5537
5538/*
cd439c50 5539- reguni - emit (if appropriate) a Unicode character
a0ed51b3
LW
5540*/
5541STATIC void
830247a4 5542S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
a0ed51b3 5543{
5e12f4fb 5544 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
5545}
5546
5547/*
a0d0e21e
LW
5548- reginsert - insert an operator in front of already-emitted operand
5549*
5550* Means relocating the operand.
5551*/
76e3520e 5552STATIC void
830247a4 5553S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
a687059c 5554{
c277df42
IZ
5555 register regnode *src;
5556 register regnode *dst;
5557 register regnode *place;
504618e9 5558 const int offset = regarglen[(U8)op];
b81d288d 5559
22c35a8c 5560/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
c277df42
IZ
5561
5562 if (SIZE_ONLY) {
830247a4 5563 RExC_size += NODE_STEP_REGNODE + offset;
a0d0e21e
LW
5564 return;
5565 }
a687059c 5566
830247a4
IZ
5567 src = RExC_emit;
5568 RExC_emit += NODE_STEP_REGNODE + offset;
5569 dst = RExC_emit;
fac92740 5570 while (src > opnd) {
c277df42 5571 StructCopy(--src, --dst, regnode);
fac92740 5572 if (RExC_offsets) { /* MJD 20010112 */
ccb2c380 5573 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
fac92740 5574 "reg_insert",
ccb2c380
MP
5575 __LINE__,
5576 reg_name[op],
fac92740
MJD
5577 dst - RExC_emit_start > RExC_offsets[0]
5578 ? "Overwriting end of array!\n" : "OK",
5579 src - RExC_emit_start,
5580 dst - RExC_emit_start,
5581 RExC_offsets[0]));
ccb2c380
MP
5582 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
5583 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
5584 }
5585 }
5586
a0d0e21e
LW
5587
5588 place = opnd; /* Op node, where operand used to be. */
fac92740 5589 if (RExC_offsets) { /* MJD */
ccb2c380 5590 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
fac92740 5591 "reginsert",
ccb2c380
MP
5592 __LINE__,
5593 reg_name[op],
fac92740
MJD
5594 place - RExC_emit_start > RExC_offsets[0]
5595 ? "Overwriting end of array!\n" : "OK",
5596 place - RExC_emit_start,
5597 RExC_parse - RExC_start,
5598 RExC_offsets[0]));
ccb2c380 5599 Set_Node_Offset(place, RExC_parse);
45948336 5600 Set_Node_Length(place, 1);
fac92740 5601 }
c277df42
IZ
5602 src = NEXTOPER(place);
5603 FILL_ADVANCE_NODE(place, op);
5604 Zero(src, offset, regnode);
a687059c
LW
5605}
5606
5607/*
c277df42 5608- regtail - set the next-pointer at the end of a node chain of p to val.
a0d0e21e 5609*/
76e3520e 5610STATIC void
830247a4 5611S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5612{
c277df42 5613 register regnode *scan;
a0d0e21e 5614
c277df42 5615 if (SIZE_ONLY)
a0d0e21e
LW
5616 return;
5617
5618 /* Find last node. */
5619 scan = p;
5620 for (;;) {
504618e9 5621 regnode * const temp = regnext(scan);
a0d0e21e
LW
5622 if (temp == NULL)
5623 break;
5624 scan = temp;
5625 }
a687059c 5626
c277df42
IZ
5627 if (reg_off_by_arg[OP(scan)]) {
5628 ARG_SET(scan, val - scan);
a0ed51b3
LW
5629 }
5630 else {
c277df42
IZ
5631 NEXT_OFF(scan) = val - scan;
5632 }
a687059c
LW
5633}
5634
5635/*
a0d0e21e
LW
5636- regoptail - regtail on operand of first argument; nop if operandless
5637*/
76e3520e 5638STATIC void
830247a4 5639S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
a687059c 5640{
a0d0e21e 5641 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
c277df42
IZ
5642 if (p == NULL || SIZE_ONLY)
5643 return;
22c35a8c 5644 if (PL_regkind[(U8)OP(p)] == BRANCH) {
830247a4 5645 regtail(pRExC_state, NEXTOPER(p), val);
a0ed51b3 5646 }
22c35a8c 5647 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
830247a4 5648 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
a0ed51b3
LW
5649 }
5650 else
a0d0e21e 5651 return;
a687059c
LW
5652}
5653
5654/*
5655 - regcurly - a little FSA that accepts {\d+,?\d*}
5656 */
79072805 5657STATIC I32
a3b680e6 5658S_regcurly(pTHX_ register const char *s)
a687059c
LW
5659{
5660 if (*s++ != '{')
5661 return FALSE;
f0fcb552 5662 if (!isDIGIT(*s))
a687059c 5663 return FALSE;
f0fcb552 5664 while (isDIGIT(*s))
a687059c
LW
5665 s++;
5666 if (*s == ',')
5667 s++;
f0fcb552 5668 while (isDIGIT(*s))
a687059c
LW
5669 s++;
5670 if (*s != '}')
5671 return FALSE;
5672 return TRUE;
5673}
5674
a687059c 5675
8fa7f367
JH
5676#ifdef DEBUGGING
5677
76e3520e 5678STATIC regnode *
cea2e8a9 5679S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
c277df42 5680{
f248d071 5681 register U8 op = EXACT; /* Arbitrary non-END op. */
155aba94 5682 register regnode *next;
c277df42
IZ
5683
5684 while (op != END && (!last || node < last)) {
5685 /* While that wasn't END last time... */
5686
5687 NODE_ALIGN(node);
5688 op = OP(node);
5689 if (op == CLOSE)
5690 l--;
5691 next = regnext(node);
5692 /* Where, what. */
5693 if (OP(node) == OPTIMIZED)
5694 goto after_print;
5695 regprop(sv, node);
b900a521 5696 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
f1dbda3d 5697 (int)(2*l + 1), "", SvPVX(sv));
c277df42
IZ
5698 if (next == NULL) /* Next ptr. */
5699 PerlIO_printf(Perl_debug_log, "(0)");
b81d288d 5700 else
b900a521 5701 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
c277df42
IZ
5702 (void)PerlIO_putc(Perl_debug_log, '\n');
5703 after_print:
22c35a8c 5704 if (PL_regkind[(U8)op] == BRANCHJ) {
b81d288d
AB
5705 register regnode *nnode = (OP(next) == LONGJMP
5706 ? regnext(next)
c277df42
IZ
5707 : next);
5708 if (last && nnode > last)
5709 nnode = last;
5710 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
a0ed51b3 5711 }
22c35a8c 5712 else if (PL_regkind[(U8)op] == BRANCH) {
c277df42 5713 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
a0ed51b3 5714 }
a3621e74 5715 else if ( PL_regkind[(U8)op] == TRIE ) {
e1ec3a88
AL
5716 const I32 n = ARG(node);
5717 const reg_trie_data *trie = (reg_trie_data*)PL_regdata->data[n];
5718 const I32 arry_len = av_len(trie->words)+1;
a3621e74 5719 I32 word_idx;
a3621e74 5720 PerlIO_printf(Perl_debug_log,
e4584336 5721 "%*s[Words:%d Chars Stored:%d Unique Chars:%d States:%"IVdf"%s]\n",
5d7488b2
AL
5722 (int)(2*(l+3)),
5723 "",
a3621e74 5724 trie->wordcount,
5d7488b2 5725 (int)trie->charcount,
a3621e74 5726 trie->uniquecharcount,
e4584336 5727 (IV)trie->laststate-1,
a3621e74
YO
5728 node->flags ? " EVAL mode" : "");
5729
5730 for (word_idx=0; word_idx < arry_len; word_idx++) {
5731 SV **elem_ptr=av_fetch(trie->words,word_idx,0);
5732 if (elem_ptr) {
5733 PerlIO_printf(Perl_debug_log, "%*s<%s%s%s>\n",
5734 (int)(2*(l+4)), "",
5735 PL_colors[0],
5736 SvPV_nolen(*elem_ptr),
5737 PL_colors[1]
5738 );
5739 /*
5740 if (next == NULL)
5741 PerlIO_printf(Perl_debug_log, "(0)\n");
5742 else
5743 PerlIO_printf(Perl_debug_log, "(%"IVdf")\n", (IV)(next - start));
5744 */
5745 }
5746
5747 }
5748
5749 node = NEXTOPER(node);
5750 node += regarglen[(U8)op];
5751
5752 }
a0ed51b3 5753 else if ( op == CURLY) { /* `next' might be very big: optimizer */
c277df42
IZ
5754 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5755 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
a0ed51b3 5756 }
22c35a8c 5757 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
c277df42
IZ
5758 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5759 next, sv, l + 1);
a0ed51b3
LW
5760 }
5761 else if ( op == PLUS || op == STAR) {
c277df42 5762 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
a0ed51b3
LW
5763 }
5764 else if (op == ANYOF) {
4f66b38d
HS
5765 /* arglen 1 + class block */
5766 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5767 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5768 node = NEXTOPER(node);
a0ed51b3 5769 }
22c35a8c 5770 else if (PL_regkind[(U8)op] == EXACT) {
c277df42 5771 /* Literal string, where present. */
cd439c50 5772 node += NODE_SZ_STR(node) - 1;
c277df42 5773 node = NEXTOPER(node);
a0ed51b3
LW
5774 }
5775 else {
c277df42
IZ
5776 node = NEXTOPER(node);
5777 node += regarglen[(U8)op];
5778 }
5779 if (op == CURLYX || op == OPEN)
5780 l++;
5781 else if (op == WHILEM)
5782 l--;
5783 }
5784 return node;
5785}
5786
8fa7f367
JH
5787#endif /* DEBUGGING */
5788
a687059c 5789/*
fd181c75 5790 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
5791 */
5792void
864dbfa3 5793Perl_regdump(pTHX_ regexp *r)
a687059c 5794{
35ff7856 5795#ifdef DEBUGGING
46fc3d4c 5796 SV *sv = sv_newmortal();
a687059c 5797
c277df42 5798 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
a0d0e21e
LW
5799
5800 /* Header fields of interest. */
c277df42 5801 if (r->anchored_substr)
7b0972df 5802 PerlIO_printf(Perl_debug_log,
b81d288d 5803 "anchored `%s%.*s%s'%s at %"IVdf" ",
3280af22 5804 PL_colors[0],
7b0972df 5805 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
b81d288d 5806 SvPVX(r->anchored_substr),
3280af22 5807 PL_colors[1],
c277df42 5808 SvTAIL(r->anchored_substr) ? "$" : "",
7b0972df 5809 (IV)r->anchored_offset);
33b8afdf
JH
5810 else if (r->anchored_utf8)
5811 PerlIO_printf(Perl_debug_log,
5812 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
5813 PL_colors[0],
5814 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
5815 SvPVX(r->anchored_utf8),
5816 PL_colors[1],
5817 SvTAIL(r->anchored_utf8) ? "$" : "",
5818 (IV)r->anchored_offset);
c277df42 5819 if (r->float_substr)
7b0972df 5820 PerlIO_printf(Perl_debug_log,
b81d288d 5821 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
3280af22 5822 PL_colors[0],
b81d288d 5823 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
2c2d71f5 5824 SvPVX(r->float_substr),
3280af22 5825 PL_colors[1],
c277df42 5826 SvTAIL(r->float_substr) ? "$" : "",
7b0972df 5827 (IV)r->float_min_offset, (UV)r->float_max_offset);
33b8afdf
JH
5828 else if (r->float_utf8)
5829 PerlIO_printf(Perl_debug_log,
5830 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
5831 PL_colors[0],
5832 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
5833 SvPVX(r->float_utf8),
5834 PL_colors[1],
5835 SvTAIL(r->float_utf8) ? "$" : "",
5836 (IV)r->float_min_offset, (UV)r->float_max_offset);
5837 if (r->check_substr || r->check_utf8)
b81d288d
AB
5838 PerlIO_printf(Perl_debug_log,
5839 r->check_substr == r->float_substr
33b8afdf 5840 && r->check_utf8 == r->float_utf8
c277df42
IZ
5841 ? "(checking floating" : "(checking anchored");
5842 if (r->reganch & ROPT_NOSCAN)
5843 PerlIO_printf(Perl_debug_log, " noscan");
5844 if (r->reganch & ROPT_CHECK_ALL)
5845 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 5846 if (r->check_substr || r->check_utf8)
c277df42
IZ
5847 PerlIO_printf(Perl_debug_log, ") ");
5848
46fc3d4c 5849 if (r->regstclass) {
5850 regprop(sv, r->regstclass);
5851 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
5852 }
774d564b 5853 if (r->reganch & ROPT_ANCH) {
5854 PerlIO_printf(Perl_debug_log, "anchored");
5855 if (r->reganch & ROPT_ANCH_BOL)
5856 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
5857 if (r->reganch & ROPT_ANCH_MBOL)
5858 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
5859 if (r->reganch & ROPT_ANCH_SBOL)
5860 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 5861 if (r->reganch & ROPT_ANCH_GPOS)
5862 PerlIO_printf(Perl_debug_log, "(GPOS)");
5863 PerlIO_putc(Perl_debug_log, ' ');
5864 }
c277df42
IZ
5865 if (r->reganch & ROPT_GPOS_SEEN)
5866 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 5867 if (r->reganch & ROPT_SKIP)
760ac839 5868 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 5869 if (r->reganch & ROPT_IMPLICIT)
760ac839 5870 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 5871 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
5872 if (r->reganch & ROPT_EVAL_SEEN)
5873 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 5874 PerlIO_printf(Perl_debug_log, "\n");
fac92740 5875 if (r->offsets) {
e4584336 5876 const U32 len = r->offsets[0];
a3621e74
YO
5877 GET_RE_DEBUG_FLAGS_DECL;
5878 DEBUG_OFFSETS_r({
1df70142 5879 U32 i;
e4584336
RB
5880 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
5881 for (i = 1; i <= len; i++)
5882 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
5883 (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
5884 PerlIO_printf(Perl_debug_log, "\n");
a3621e74 5885 });
fac92740 5886 }
17c3b450 5887#endif /* DEBUGGING */
a687059c
LW
5888}
5889
8fa7f367
JH
5890#ifdef DEBUGGING
5891
653099ff
GS
5892STATIC void
5893S_put_byte(pTHX_ SV *sv, int c)
5894{
7be5a6cf 5895 if (isCNTRL(c) || c == 255 || !isPRINT(c))
653099ff
GS
5896 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5897 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5898 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5899 else
5900 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5901}
5902
8fa7f367
JH
5903#endif /* DEBUGGING */
5904
a3621e74 5905
a687059c 5906/*
a0d0e21e
LW
5907- regprop - printable representation of opcode
5908*/
46fc3d4c 5909void
a3b680e6 5910Perl_regprop(pTHX_ SV *sv, const regnode *o)
a687059c 5911{
35ff7856 5912#ifdef DEBUGGING
9b155405 5913 register int k;
a0d0e21e 5914
54dc92de 5915 sv_setpvn(sv, "", 0);
9b155405 5916 if (OP(o) >= reg_num) /* regnode.type is unsigned */
830247a4
IZ
5917 /* It would be nice to FAIL() here, but this may be called from
5918 regexec.c, and it would be hard to supply pRExC_state. */
5919 Perl_croak(aTHX_ "Corrupted regexp opcode");
bfed75c6 5920 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405
IZ
5921
5922 k = PL_regkind[(U8)OP(o)];
5923
2a782b5b
JH
5924 if (k == EXACT) {
5925 SV *dsv = sv_2mortal(newSVpvn("", 0));
c728cb41
JH
5926 /* Using is_utf8_string() is a crude hack but it may
5927 * be the best for now since we have no flag "this EXACTish
5928 * node was UTF-8" --jhi */
1df70142
AL
5929 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
5930 const char *s = do_utf8 ?
c728cb41
JH
5931 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
5932 UNI_DISPLAY_REGEX) :
2a782b5b 5933 STRING(o);
e1ec3a88 5934 const int len = do_utf8 ?
2a782b5b
JH
5935 strlen(s) :
5936 STR_LEN(o);
5937 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
5938 PL_colors[0],
5939 len, s,
5940 PL_colors[1]);
a3621e74
YO
5941 } else if (k == TRIE) {/*
5942 this isn't always safe, as Pl_regdata may not be for this regex yet
5943 (depending on where its called from) so its being moved to dumpuntil
5944 I32 n = ARG(o);
5945 reg_trie_data *trie=(reg_trie_data*)PL_regdata->data[n];
5946 Perl_sv_catpvf(aTHX_ sv, " (W:%d L:%d C:%d S:%d)",
5947 trie->wordcount,
5948 trie->charcount,
5949 trie->uniquecharcount,
5950 trie->laststate);
5951 */
5952 } else if (k == CURLY) {
cb434fcc 5953 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
5954 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
5955 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 5956 }
2c2d71f5
JH
5957 else if (k == WHILEM && o->flags) /* Ordinal/of */
5958 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9b155405 5959 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
894356b3 5960 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9b155405 5961 else if (k == LOGICAL)
04ebc1ab 5962 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
5963 else if (k == ANYOF) {
5964 int i, rangestart = -1;
ffc61ed2 5965 U8 flags = ANYOF_FLAGS(o);
a6d05634 5966 const char * const anyofs[] = { /* Should be synchronized with
19860706 5967 * ANYOF_ #xdefines in regcomp.h */
653099ff
GS
5968 "\\w",
5969 "\\W",
5970 "\\s",
5971 "\\S",
5972 "\\d",
5973 "\\D",
5974 "[:alnum:]",
5975 "[:^alnum:]",
5976 "[:alpha:]",
5977 "[:^alpha:]",
5978 "[:ascii:]",
5979 "[:^ascii:]",
5980 "[:ctrl:]",
5981 "[:^ctrl:]",
5982 "[:graph:]",
5983 "[:^graph:]",
5984 "[:lower:]",
5985 "[:^lower:]",
5986 "[:print:]",
5987 "[:^print:]",
5988 "[:punct:]",
5989 "[:^punct:]",
5990 "[:upper:]",
aaa51d5e 5991 "[:^upper:]",
653099ff 5992 "[:xdigit:]",
aaa51d5e
JF
5993 "[:^xdigit:]",
5994 "[:space:]",
5995 "[:^space:]",
5996 "[:blank:]",
5997 "[:^blank:]"
653099ff
GS
5998 };
5999
19860706 6000 if (flags & ANYOF_LOCALE)
653099ff 6001 sv_catpv(sv, "{loc}");
19860706 6002 if (flags & ANYOF_FOLD)
653099ff
GS
6003 sv_catpv(sv, "{i}");
6004 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 6005 if (flags & ANYOF_INVERT)
653099ff 6006 sv_catpv(sv, "^");
ffc61ed2
JH
6007 for (i = 0; i <= 256; i++) {
6008 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
6009 if (rangestart == -1)
6010 rangestart = i;
6011 } else if (rangestart != -1) {
6012 if (i <= rangestart + 3)
6013 for (; rangestart < i; rangestart++)
653099ff 6014 put_byte(sv, rangestart);
ffc61ed2
JH
6015 else {
6016 put_byte(sv, rangestart);
6017 sv_catpv(sv, "-");
6018 put_byte(sv, i - 1);
653099ff 6019 }
ffc61ed2 6020 rangestart = -1;
653099ff 6021 }
847a199f 6022 }
ffc61ed2
JH
6023
6024 if (o->flags & ANYOF_CLASS)
6025 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
6026 if (ANYOF_CLASS_TEST(o,i))
6027 sv_catpv(sv, anyofs[i]);
6028
6029 if (flags & ANYOF_UNICODE)
6030 sv_catpv(sv, "{unicode}");
1aa99e6b 6031 else if (flags & ANYOF_UNICODE_ALL)
2a782b5b 6032 sv_catpv(sv, "{unicode_all}");
ffc61ed2
JH
6033
6034 {
6035 SV *lv;
9e55ce06 6036 SV *sw = regclass_swash(o, FALSE, &lv, 0);
b81d288d 6037
ffc61ed2
JH
6038 if (lv) {
6039 if (sw) {
89ebb4a3 6040 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 6041
ffc61ed2 6042 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 6043 uvchr_to_utf8(s, i);
ffc61ed2 6044
3568d838 6045 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
6046 if (rangestart == -1)
6047 rangestart = i;
6048 } else if (rangestart != -1) {
6049 U8 *p;
b81d288d 6050
ffc61ed2
JH
6051 if (i <= rangestart + 3)
6052 for (; rangestart < i; rangestart++) {
1df70142 6053 U8 *e;
2b9d42f0 6054 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6055 put_byte(sv, *p);
6056 }
6057 else {
1df70142 6058 U8 *e;
2b9d42f0 6059 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
ffc61ed2
JH
6060 put_byte(sv, *p);
6061 sv_catpv(sv, "-");
1df70142
AL
6062 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
6063 put_byte(sv, *p);
ffc61ed2
JH
6064 }
6065 rangestart = -1;
6066 }
19860706 6067 }
ffc61ed2
JH
6068
6069 sv_catpv(sv, "..."); /* et cetera */
19860706 6070 }
fde631ed 6071
ffc61ed2 6072 {
2e0de35c 6073 char *s = savesvpv(lv);
ffc61ed2 6074 char *origs = s;
b81d288d 6075
ffc61ed2 6076 while(*s && *s != '\n') s++;
b81d288d 6077
ffc61ed2 6078 if (*s == '\n') {
1df70142 6079 const char *t = ++s;
ffc61ed2
JH
6080
6081 while (*s) {
6082 if (*s == '\n')
6083 *s = ' ';
6084 s++;
6085 }
6086 if (s[-1] == ' ')
6087 s[-1] = 0;
6088
6089 sv_catpv(sv, t);
fde631ed 6090 }
b81d288d 6091
ffc61ed2 6092 Safefree(origs);
fde631ed
JH
6093 }
6094 }
653099ff 6095 }
ffc61ed2 6096
653099ff
GS
6097 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
6098 }
9b155405 6099 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
cea2e8a9 6100 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
17c3b450 6101#endif /* DEBUGGING */
35ff7856 6102}
a687059c 6103
cad2e5aa
JH
6104SV *
6105Perl_re_intuit_string(pTHX_ regexp *prog)
6106{ /* Assume that RE_INTUIT is set */
a3621e74
YO
6107 GET_RE_DEBUG_FLAGS_DECL;
6108 DEBUG_COMPILE_r(
cad2e5aa 6109 { STRLEN n_a;
4373e329 6110 const char *s = SvPV(prog->check_substr
33b8afdf 6111 ? prog->check_substr : prog->check_utf8, n_a);
cad2e5aa
JH
6112
6113 if (!PL_colorset) reginitcolors();
6114 PerlIO_printf(Perl_debug_log,
33b8afdf
JH
6115 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
6116 PL_colors[4],
6117 prog->check_substr ? "" : "utf8 ",
6118 PL_colors[5],PL_colors[0],
cad2e5aa
JH
6119 s,
6120 PL_colors[1],
6121 (strlen(s) > 60 ? "..." : ""));
6122 } );
6123
33b8afdf 6124 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
6125}
6126
2b69d0c2 6127void
864dbfa3 6128Perl_pregfree(pTHX_ struct regexp *r)
a687059c 6129{
27da23d5 6130 dVAR;
9e55ce06
JH
6131#ifdef DEBUGGING
6132 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
a3621e74 6133 SV *re_debug_flags=get_sv(RE_DEBUG_FLAGS,0);
9e55ce06 6134#endif
7821416a 6135
a3621e74 6136
7821416a
IZ
6137 if (!r || (--r->refcnt > 0))
6138 return;
a3621e74 6139 DEBUG_r(if (re_debug_flags && (SvIV(re_debug_flags) & RE_DEBUG_COMPILE)) {
e1ec3a88
AL
6140 const char *s = (r->reganch & ROPT_UTF8)
6141 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
9f369894 6142 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
e1ec3a88 6143 const int len = SvCUR(dsv);
9e55ce06
JH
6144 if (!PL_colorset)
6145 reginitcolors();
6146 PerlIO_printf(Perl_debug_log,
a3621e74 6147 "%sFreeing REx:%s %s%*.*s%s%s\n",
9e55ce06
JH
6148 PL_colors[4],PL_colors[5],PL_colors[0],
6149 len, len, s,
6150 PL_colors[1],
6151 len > 60 ? "..." : "");
6152 });
cad2e5aa 6153
c277df42 6154 if (r->precomp)
a0d0e21e 6155 Safefree(r->precomp);
fac92740
MJD
6156 if (r->offsets) /* 20010421 MJD */
6157 Safefree(r->offsets);
ed252734
NC
6158 RX_MATCH_COPY_FREE(r);
6159#ifdef PERL_COPY_ON_WRITE
6160 if (r->saved_copy)
6161 SvREFCNT_dec(r->saved_copy);
6162#endif
a193d654
GS
6163 if (r->substrs) {
6164 if (r->anchored_substr)
6165 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
6166 if (r->anchored_utf8)
6167 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
6168 if (r->float_substr)
6169 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
6170 if (r->float_utf8)
6171 SvREFCNT_dec(r->float_utf8);
2779dcf1 6172 Safefree(r->substrs);
a193d654 6173 }
c277df42
IZ
6174 if (r->data) {
6175 int n = r->data->count;
f3548bdc
DM
6176 PAD* new_comppad = NULL;
6177 PAD* old_comppad;
4026c95a 6178 PADOFFSET refcnt;
dfad63ad 6179
c277df42 6180 while (--n >= 0) {
261faec3 6181 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
6182 switch (r->data->what[n]) {
6183 case 's':
6184 SvREFCNT_dec((SV*)r->data->data[n]);
6185 break;
653099ff
GS
6186 case 'f':
6187 Safefree(r->data->data[n]);
6188 break;
dfad63ad
HS
6189 case 'p':
6190 new_comppad = (AV*)r->data->data[n];
6191 break;
c277df42 6192 case 'o':
dfad63ad 6193 if (new_comppad == NULL)
cea2e8a9 6194 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
6195 PAD_SAVE_LOCAL(old_comppad,
6196 /* Watch out for global destruction's random ordering. */
6197 (SvTYPE(new_comppad) == SVt_PVAV) ?
6198 new_comppad : Null(PAD *)
6199 );
b34c0dd4 6200 OP_REFCNT_LOCK;
4026c95a
SH
6201 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
6202 OP_REFCNT_UNLOCK;
6203 if (!refcnt)
9b978d73 6204 op_free((OP_4tree*)r->data->data[n]);
9b978d73 6205
f3548bdc 6206 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
6207 SvREFCNT_dec((SV*)new_comppad);
6208 new_comppad = NULL;
c277df42
IZ
6209 break;
6210 case 'n':
9e55ce06 6211 break;
a3621e74
YO
6212 case 't':
6213 {
6214 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
6215 U32 refcount;
6216 OP_REFCNT_LOCK;
6217 refcount = trie->refcount--;
6218 OP_REFCNT_UNLOCK;
6219 if ( !refcount ) {
6220 if (trie->charmap)
6221 Safefree(trie->charmap);
6222 if (trie->widecharmap)
6223 SvREFCNT_dec((SV*)trie->widecharmap);
6224 if (trie->states)
6225 Safefree(trie->states);
6226 if (trie->trans)
6227 Safefree(trie->trans);
6228#ifdef DEBUGGING
6229 if (trie->words)
6230 SvREFCNT_dec((SV*)trie->words);
6231 if (trie->revcharmap)
6232 SvREFCNT_dec((SV*)trie->revcharmap);
6233#endif
6234 Safefree(r->data->data[n]); /* do this last!!!! */
6235 }
6236 break;
6237 }
c277df42 6238 default:
830247a4 6239 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
6240 }
6241 }
6242 Safefree(r->data->what);
6243 Safefree(r->data);
a0d0e21e
LW
6244 }
6245 Safefree(r->startp);
6246 Safefree(r->endp);
6247 Safefree(r);
a687059c 6248}
c277df42
IZ
6249
6250/*
6251 - regnext - dig the "next" pointer out of a node
c277df42
IZ
6252 */
6253regnode *
864dbfa3 6254Perl_regnext(pTHX_ register regnode *p)
c277df42
IZ
6255{
6256 register I32 offset;
6257
3280af22 6258 if (p == &PL_regdummy)
c277df42
IZ
6259 return(NULL);
6260
6261 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
6262 if (offset == 0)
6263 return(NULL);
6264
c277df42 6265 return(p+offset);
c277df42
IZ
6266}
6267
01f988be 6268STATIC void
cea2e8a9 6269S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
6270{
6271 va_list args;
6272 STRLEN l1 = strlen(pat1);
6273 STRLEN l2 = strlen(pat2);
6274 char buf[512];
06bf62c7 6275 SV *msv;
73d840c0 6276 const char *message;
c277df42
IZ
6277
6278 if (l1 > 510)
6279 l1 = 510;
6280 if (l1 + l2 > 510)
6281 l2 = 510 - l1;
6282 Copy(pat1, buf, l1 , char);
6283 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
6284 buf[l1 + l2] = '\n';
6285 buf[l1 + l2 + 1] = '\0';
8736538c
AS
6286#ifdef I_STDARG
6287 /* ANSI variant takes additional second argument */
c277df42 6288 va_start(args, pat2);
8736538c
AS
6289#else
6290 va_start(args);
6291#endif
5a844595 6292 msv = vmess(buf, &args);
c277df42 6293 va_end(args);
06bf62c7 6294 message = SvPV(msv,l1);
c277df42
IZ
6295 if (l1 > 512)
6296 l1 = 512;
6297 Copy(message, buf, l1 , char);
197cf9b9 6298 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 6299 Perl_croak(aTHX_ "%s", buf);
c277df42 6300}
a0ed51b3
LW
6301
6302/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
6303
6304void
864dbfa3 6305Perl_save_re_context(pTHX)
b81d288d 6306{
830247a4 6307 SAVEI32(PL_reg_flags); /* from regexec.c */
a0ed51b3 6308 SAVEPPTR(PL_bostr);
a0ed51b3
LW
6309 SAVEPPTR(PL_reginput); /* String-input pointer. */
6310 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
6311 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
7766f137
GS
6312 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
6313 SAVEVPTR(PL_regendp); /* Ditto for endp. */
6314 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
a5db57d6 6315 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
a0ed51b3 6316 SAVEPPTR(PL_regtill); /* How far we are required to go. */
b81d288d 6317 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
a0ed51b3 6318 PL_reg_start_tmp = 0;
a0ed51b3
LW
6319 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
6320 PL_reg_start_tmpl = 0;
7766f137 6321 SAVEVPTR(PL_regdata);
a0ed51b3
LW
6322 SAVEI32(PL_reg_eval_set); /* from regexec.c */
6323 SAVEI32(PL_regnarrate); /* from regexec.c */
7766f137 6324 SAVEVPTR(PL_regprogram); /* from regexec.c */
a0ed51b3 6325 SAVEINT(PL_regindent); /* from regexec.c */
7766f137
GS
6326 SAVEVPTR(PL_regcc); /* from regexec.c */
6327 SAVEVPTR(PL_curcop);
7766f137
GS
6328 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
6329 SAVEVPTR(PL_reg_re); /* from regexec.c */
54b6e2fa
IZ
6330 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
6331 SAVESPTR(PL_reg_sv); /* from regexec.c */
9febdf04 6332 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
7766f137 6333 SAVEVPTR(PL_reg_magic); /* from regexec.c */
54b6e2fa 6334 SAVEI32(PL_reg_oldpos); /* from regexec.c */
7766f137
GS
6335 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
6336 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
a5db57d6
GS
6337 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
6338 PL_reg_oldsaved = Nullch;
6339 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
6340 PL_reg_oldsavedlen = 0;
ed252734
NC
6341#ifdef PERL_COPY_ON_WRITE
6342 SAVESPTR(PL_nrs);
6343 PL_nrs = Nullsv;
6344#endif
a5db57d6
GS
6345 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
6346 PL_reg_maxiter = 0;
6347 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
6348 PL_reg_leftiter = 0;
6349 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
6350 PL_reg_poscache = Nullch;
6351 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
6352 PL_reg_poscache_size = 0;
6353 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5fb7366e 6354 SAVEI32(PL_regnpar); /* () count. */
e49a9654 6355 SAVEI32(PL_regsize); /* from regexec.c */
ada6e8a9
AMS
6356
6357 {
6358 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
ada6e8a9 6359 REGEXP *rx;
ada6e8a9
AMS
6360
6361 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
1df70142 6362 U32 i;
ada6e8a9 6363 for (i = 1; i <= rx->nparens; i++) {
1df70142
AL
6364 GV *mgv;
6365 char digits[TYPE_CHARS(long)];
d994d9a1 6366 sprintf(digits, "%lu", (long)i);
ada6e8a9
AMS
6367 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
6368 save_scalar(mgv);
6369 }
6370 }
6371 }
6372
54b6e2fa 6373#ifdef DEBUGGING
b81d288d 6374 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
54b6e2fa 6375#endif
a0ed51b3 6376}
51371543 6377
51371543 6378static void
acfe0abc 6379clear_re(pTHX_ void *r)
51371543
GS
6380{
6381 ReREFCNT_dec((regexp *)r);
6382}
ffbc6a93 6383
241d1a3b
NC
6384/*
6385 * Local variables:
6386 * c-indentation-style: bsd
6387 * c-basic-offset: 4
6388 * indent-tabs-mode: t
6389 * End:
6390 *
37442d52
RGS
6391 * ex: set ts=8 sts=4 sw=4 noet:
6392 */