This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Change sprintf() to my_sprintf(), and use the returned length from
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
6bda09f9 120 regnode **parens; /* offsets of each paren */
1aa99e6b 121 I32 utf8;
6bda09f9 122 HV *charnames; /* cache of named sequences */
830247a4
IZ
123#if ADD_TO_REGEXEC
124 char *starttry; /* -Dr: where regtry was called. */
125#define RExC_starttry (pRExC_state->starttry)
126#endif
3dab1dad 127#ifdef DEBUGGING
be8e71aa 128 const char *lastparse;
3dab1dad
YO
129 I32 lastnum;
130#define RExC_lastparse (pRExC_state->lastparse)
131#define RExC_lastnum (pRExC_state->lastnum)
132#endif
830247a4
IZ
133} RExC_state_t;
134
e2509266 135#define RExC_flags (pRExC_state->flags)
830247a4
IZ
136#define RExC_precomp (pRExC_state->precomp)
137#define RExC_rx (pRExC_state->rx)
fac92740 138#define RExC_start (pRExC_state->start)
830247a4
IZ
139#define RExC_end (pRExC_state->end)
140#define RExC_parse (pRExC_state->parse)
141#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 142#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 143#define RExC_emit (pRExC_state->emit)
fac92740 144#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
145#define RExC_naughty (pRExC_state->naughty)
146#define RExC_sawback (pRExC_state->sawback)
147#define RExC_seen (pRExC_state->seen)
148#define RExC_size (pRExC_state->size)
149#define RExC_npar (pRExC_state->npar)
150#define RExC_extralen (pRExC_state->extralen)
151#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 153#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 154#define RExC_charnames (pRExC_state->charnames)
6bda09f9 155#define RExC_parens (pRExC_state->parens)
830247a4 156
a687059c
LW
157#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
158#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
159 ((*s) == '{' && regcurly(s)))
a687059c 160
35c8bce7
LW
161#ifdef SPSTART
162#undef SPSTART /* dratted cpp namespace... */
163#endif
a687059c
LW
164/*
165 * Flags to be passed up and down.
166 */
a687059c 167#define WORST 0 /* Worst case. */
821b33a5 168#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
169#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
170#define SPSTART 0x4 /* Starts with * or +. */
171#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 172
3dab1dad
YO
173#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
174
07be1b83
YO
175/* whether trie related optimizations are enabled */
176#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
177#define TRIE_STUDY_OPT
786e8c11 178#define FULL_TRIE_STUDY
07be1b83
YO
179#define TRIE_STCLASS
180#endif
1de06328
YO
181
182
183/* About scan_data_t.
184
185 During optimisation we recurse through the regexp program performing
186 various inplace (keyhole style) optimisations. In addition study_chunk
187 and scan_commit populate this data structure with information about
188 what strings MUST appear in the pattern. We look for the longest
189 string that must appear for at a fixed location, and we look for the
190 longest string that may appear at a floating location. So for instance
191 in the pattern:
192
193 /FOO[xX]A.*B[xX]BAR/
194
195 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
196 strings (because they follow a .* construct). study_chunk will identify
197 both FOO and BAR as being the longest fixed and floating strings respectively.
198
199 The strings can be composites, for instance
200
201 /(f)(o)(o)/
202
203 will result in a composite fixed substring 'foo'.
204
205 For each string some basic information is maintained:
206
207 - offset or min_offset
208 This is the position the string must appear at, or not before.
209 It also implicitly (when combined with minlenp) tells us how many
210 character must match before the string we are searching.
211 Likewise when combined with minlenp and the length of the string
212 tells us how many characters must appear after the string we have
213 found.
214
215 - max_offset
216 Only used for floating strings. This is the rightmost point that
217 the string can appear at. Ifset to I32 max it indicates that the
218 string can occur infinitely far to the right.
219
220 - minlenp
221 A pointer to the minimum length of the pattern that the string
222 was found inside. This is important as in the case of positive
223 lookahead or positive lookbehind we can have multiple patterns
224 involved. Consider
225
226 /(?=FOO).*F/
227
228 The minimum length of the pattern overall is 3, the minimum length
229 of the lookahead part is 3, but the minimum length of the part that
230 will actually match is 1. So 'FOO's minimum length is 3, but the
231 minimum length for the F is 1. This is important as the minimum length
232 is used to determine offsets in front of and behind the string being
233 looked for. Since strings can be composites this is the length of the
234 pattern at the time it was commited with a scan_commit. Note that
235 the length is calculated by study_chunk, so that the minimum lengths
236 are not known until the full pattern has been compiled, thus the
237 pointer to the value.
238
239 - lookbehind
240
241 In the case of lookbehind the string being searched for can be
242 offset past the start point of the final matching string.
243 If this value was just blithely removed from the min_offset it would
244 invalidate some of the calculations for how many chars must match
245 before or after (as they are derived from min_offset and minlen and
246 the length of the string being searched for).
247 When the final pattern is compiled and the data is moved from the
248 scan_data_t structure into the regexp structure the information
249 about lookbehind is factored in, with the information that would
250 have been lost precalculated in the end_shift field for the
251 associated string.
252
253 The fields pos_min and pos_delta are used to store the minimum offset
254 and the delta to the maximum offset at the current point in the pattern.
255
256*/
2c2d71f5
JH
257
258typedef struct scan_data_t {
1de06328
YO
259 /*I32 len_min; unused */
260 /*I32 len_delta; unused */
2c2d71f5
JH
261 I32 pos_min;
262 I32 pos_delta;
263 SV *last_found;
1de06328 264 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
265 I32 last_start_min;
266 I32 last_start_max;
1de06328
YO
267 SV **longest; /* Either &l_fixed, or &l_float. */
268 SV *longest_fixed; /* longest fixed string found in pattern */
269 I32 offset_fixed; /* offset where it starts */
270 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
271 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
272 SV *longest_float; /* longest floating string found in pattern */
273 I32 offset_float_min; /* earliest point in string it can appear */
274 I32 offset_float_max; /* latest point in string it can appear */
275 I32 *minlen_float; /* pointer to the minlen relevent to the string */
276 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
277 I32 flags;
278 I32 whilem_c;
cb434fcc 279 I32 *last_closep;
653099ff 280 struct regnode_charclass_class *start_class;
2c2d71f5
JH
281} scan_data_t;
282
a687059c 283/*
e50aee73 284 * Forward declarations for pregcomp()'s friends.
a687059c 285 */
a0d0e21e 286
27da23d5 287static const scan_data_t zero_scan_data =
1de06328 288 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
289
290#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
291#define SF_BEFORE_SEOL 0x0001
292#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
293#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
294#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
295
09b7f37c
CB
296#ifdef NO_UNARY_PLUS
297# define SF_FIX_SHIFT_EOL (0+2)
298# define SF_FL_SHIFT_EOL (0+4)
299#else
300# define SF_FIX_SHIFT_EOL (+2)
301# define SF_FL_SHIFT_EOL (+4)
302#endif
c277df42
IZ
303
304#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
305#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
306
307#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
308#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
309#define SF_IS_INF 0x0040
310#define SF_HAS_PAR 0x0080
311#define SF_IN_PAR 0x0100
312#define SF_HAS_EVAL 0x0200
313#define SCF_DO_SUBSTR 0x0400
653099ff
GS
314#define SCF_DO_STCLASS_AND 0x0800
315#define SCF_DO_STCLASS_OR 0x1000
316#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 317#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 318
786e8c11
YO
319#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
320
07be1b83 321
eb160463 322#define UTF (RExC_utf8 != 0)
e2509266
JH
323#define LOC ((RExC_flags & PMf_LOCALE) != 0)
324#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 325
ffc61ed2 326#define OOB_UNICODE 12345678
93733859 327#define OOB_NAMEDCLASS -1
b8c5462f 328
a0ed51b3
LW
329#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
330#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
331
8615cb43 332
b45f050a
JF
333/* length of regex to show in messages that don't mark a position within */
334#define RegexLengthToShowInErrorMessages 127
335
336/*
337 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
338 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
339 * op/pragma/warn/regcomp.
340 */
7253e4e3
RK
341#define MARKER1 "<-- HERE" /* marker as it appears in the description */
342#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 343
7253e4e3 344#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
345
346/*
347 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
348 * arg. Show regex, up to a maximum length. If it's too long, chop and add
349 * "...".
350 */
ccb2c380 351#define FAIL(msg) STMT_START { \
bfed75c6 352 const char *ellipses = ""; \
ccb2c380
MP
353 IV len = RExC_end - RExC_precomp; \
354 \
355 if (!SIZE_ONLY) \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 if (len > RegexLengthToShowInErrorMessages) { \
358 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
359 len = RegexLengthToShowInErrorMessages - 10; \
360 ellipses = "..."; \
361 } \
362 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
363 msg, (int)len, RExC_precomp, ellipses); \
364} STMT_END
8615cb43 365
b45f050a 366/*
b45f050a
JF
367 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
368 */
ccb2c380 369#define Simple_vFAIL(m) STMT_START { \
a28509cc 370 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
371 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
373} STMT_END
b45f050a
JF
374
375/*
376 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
377 */
ccb2c380
MP
378#define vFAIL(m) STMT_START { \
379 if (!SIZE_ONLY) \
380 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
381 Simple_vFAIL(m); \
382} STMT_END
b45f050a
JF
383
384/*
385 * Like Simple_vFAIL(), but accepts two arguments.
386 */
ccb2c380 387#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 388 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
389 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
390 (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
b45f050a
JF
392
393/*
394 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
395 */
ccb2c380
MP
396#define vFAIL2(m,a1) STMT_START { \
397 if (!SIZE_ONLY) \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 Simple_vFAIL2(m, a1); \
400} STMT_END
b45f050a
JF
401
402
403/*
404 * Like Simple_vFAIL(), but accepts three arguments.
405 */
ccb2c380 406#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 407 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
408 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
409 (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
b45f050a
JF
411
412/*
413 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
414 */
ccb2c380
MP
415#define vFAIL3(m,a1,a2) STMT_START { \
416 if (!SIZE_ONLY) \
417 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
418 Simple_vFAIL3(m, a1, a2); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Like Simple_vFAIL(), but accepts four arguments.
423 */
ccb2c380 424#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a 429
ccb2c380 430#define vWARN(loc,m) STMT_START { \
a28509cc 431 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
432 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
433 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
434} STMT_END
435
436#define vWARNdep(loc,m) STMT_START { \
a28509cc 437 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
438 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
439 "%s" REPORT_LOCATION, \
440 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
441} STMT_END
442
443
444#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 445 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
446 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
447 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
449
450#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
453 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
459 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
460} STMT_END
461
462#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 463 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
464 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
465 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
466} STMT_END
9d1d55b5 467
8615cb43 468
cd439c50 469/* Allow for side effects in s */
ccb2c380
MP
470#define REGC(c,s) STMT_START { \
471 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
472} STMT_END
cd439c50 473
fac92740
MJD
474/* Macros for recording node offsets. 20001227 mjd@plover.com
475 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
476 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
477 * Element 0 holds the number n.
07be1b83 478 * Position is 1 indexed.
fac92740
MJD
479 */
480
ccb2c380
MP
481#define Set_Node_Offset_To_R(node,byte) STMT_START { \
482 if (! SIZE_ONLY) { \
483 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 484 __LINE__, (node), (int)(byte))); \
ccb2c380 485 if((node) < 0) { \
551405c4 486 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
487 } else { \
488 RExC_offsets[2*(node)-1] = (byte); \
489 } \
490 } \
491} STMT_END
492
493#define Set_Node_Offset(node,byte) \
494 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
495#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
496
497#define Set_Node_Length_To_R(node,len) STMT_START { \
498 if (! SIZE_ONLY) { \
499 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 500 __LINE__, (int)(node), (int)(len))); \
ccb2c380 501 if((node) < 0) { \
551405c4 502 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
503 } else { \
504 RExC_offsets[2*(node)] = (len); \
505 } \
506 } \
507} STMT_END
508
509#define Set_Node_Length(node,len) \
510 Set_Node_Length_To_R((node)-RExC_emit_start, len)
511#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
512#define Set_Node_Cur_Length(node) \
513 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
514
515/* Get offsets and lengths */
516#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
517#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
518
07be1b83
YO
519#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
520 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
521 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
522} STMT_END
523
524
525#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
526#define EXPERIMENTAL_INPLACESCAN
527#endif
528
1de06328 529#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 530DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
531 PerlIO_printf(Perl_debug_log, \
532 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
533 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
534 (int)(depth)*2, "", \
535 (IV)((data)->pos_min), \
536 (IV)((data)->pos_delta), \
537 (IV)((data)->flags), \
538 (IV)((data)->whilem_c), \
539 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
540 ); \
541 if ((data)->last_found) \
542 PerlIO_printf(Perl_debug_log, \
543 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
544 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
545 SvPVX_const((data)->last_found), \
546 (IV)((data)->last_end), \
547 (IV)((data)->last_start_min), \
548 (IV)((data)->last_start_max), \
549 ((data)->longest && \
550 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
551 SvPVX_const((data)->longest_fixed), \
552 (IV)((data)->offset_fixed), \
553 ((data)->longest && \
554 (data)->longest==&((data)->longest_float)) ? "*" : "", \
555 SvPVX_const((data)->longest_float), \
556 (IV)((data)->offset_float_min), \
557 (IV)((data)->offset_float_max) \
558 ); \
559 PerlIO_printf(Perl_debug_log,"\n"); \
560});
561
acfe0abc 562static void clear_re(pTHX_ void *r);
4327152a 563
653099ff 564/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 565 Update the longest found anchored substring and the longest found
653099ff
GS
566 floating substrings if needed. */
567
4327152a 568STATIC void
1de06328 569S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 570{
e1ec3a88
AL
571 const STRLEN l = CHR_SVLEN(data->last_found);
572 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 573 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 574
c277df42 575 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 576 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
577 if (*data->longest == data->longest_fixed) {
578 data->offset_fixed = l ? data->last_start_min : data->pos_min;
579 if (data->flags & SF_BEFORE_EOL)
b81d288d 580 data->flags
c277df42
IZ
581 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
582 else
583 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
584 data->minlen_fixed=minlenp;
585 data->lookbehind_fixed=0;
a0ed51b3
LW
586 }
587 else {
c277df42 588 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
589 data->offset_float_max = (l
590 ? data->last_start_max
c277df42 591 : data->pos_min + data->pos_delta);
9051bda5
HS
592 if ((U32)data->offset_float_max > (U32)I32_MAX)
593 data->offset_float_max = I32_MAX;
c277df42 594 if (data->flags & SF_BEFORE_EOL)
b81d288d 595 data->flags
c277df42
IZ
596 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
597 else
598 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
599 data->minlen_float=minlenp;
600 data->lookbehind_float=0;
c277df42
IZ
601 }
602 }
603 SvCUR_set(data->last_found, 0);
0eda9292 604 {
a28509cc 605 SV * const sv = data->last_found;
097eb12c
AL
606 if (SvUTF8(sv) && SvMAGICAL(sv)) {
607 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
608 if (mg)
609 mg->mg_len = 0;
610 }
0eda9292 611 }
c277df42
IZ
612 data->last_end = -1;
613 data->flags &= ~SF_BEFORE_EOL;
1de06328 614 DEBUG_STUDYDATA(data,0);
c277df42
IZ
615}
616
653099ff
GS
617/* Can match anything (initialization) */
618STATIC void
097eb12c 619S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 620{
653099ff 621 ANYOF_CLASS_ZERO(cl);
f8bef550 622 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 623 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
624 if (LOC)
625 cl->flags |= ANYOF_LOCALE;
626}
627
628/* Can match anything (initialization) */
629STATIC int
5f66b61c 630S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
631{
632 int value;
633
aaa51d5e 634 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
635 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
636 return 1;
1aa99e6b
IH
637 if (!(cl->flags & ANYOF_UNICODE_ALL))
638 return 0;
10edeb5d 639 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 640 return 0;
653099ff
GS
641 return 1;
642}
643
644/* Can match anything (initialization) */
645STATIC void
097eb12c 646S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 647{
8ecf7187 648 Zero(cl, 1, struct regnode_charclass_class);
653099ff 649 cl->type = ANYOF;
830247a4 650 cl_anything(pRExC_state, cl);
653099ff
GS
651}
652
653STATIC void
097eb12c 654S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 655{
8ecf7187 656 Zero(cl, 1, struct regnode_charclass_class);
653099ff 657 cl->type = ANYOF;
830247a4 658 cl_anything(pRExC_state, cl);
653099ff
GS
659 if (LOC)
660 cl->flags |= ANYOF_LOCALE;
661}
662
663/* 'And' a given class with another one. Can create false positives */
664/* We assume that cl is not inverted */
665STATIC void
5f66b61c 666S_cl_and(struct regnode_charclass_class *cl,
a28509cc 667 const struct regnode_charclass_class *and_with)
653099ff 668{
653099ff
GS
669 if (!(and_with->flags & ANYOF_CLASS)
670 && !(cl->flags & ANYOF_CLASS)
671 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
672 && !(and_with->flags & ANYOF_FOLD)
673 && !(cl->flags & ANYOF_FOLD)) {
674 int i;
675
676 if (and_with->flags & ANYOF_INVERT)
677 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
678 cl->bitmap[i] &= ~and_with->bitmap[i];
679 else
680 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
681 cl->bitmap[i] &= and_with->bitmap[i];
682 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
683 if (!(and_with->flags & ANYOF_EOS))
684 cl->flags &= ~ANYOF_EOS;
1aa99e6b 685
14ebb1a2
JH
686 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
687 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
688 cl->flags &= ~ANYOF_UNICODE_ALL;
689 cl->flags |= ANYOF_UNICODE;
690 ARG_SET(cl, ARG(and_with));
691 }
14ebb1a2
JH
692 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
693 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 694 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
695 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
696 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 697 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
698}
699
700/* 'OR' a given class with another one. Can create false positives */
701/* We assume that cl is not inverted */
702STATIC void
097eb12c 703S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 704{
653099ff
GS
705 if (or_with->flags & ANYOF_INVERT) {
706 /* We do not use
707 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
708 * <= (B1 | !B2) | (CL1 | !CL2)
709 * which is wasteful if CL2 is small, but we ignore CL2:
710 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
711 * XXXX Can we handle case-fold? Unclear:
712 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
713 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
714 */
715 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
716 && !(or_with->flags & ANYOF_FOLD)
717 && !(cl->flags & ANYOF_FOLD) ) {
718 int i;
719
720 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
721 cl->bitmap[i] |= ~or_with->bitmap[i];
722 } /* XXXX: logic is complicated otherwise */
723 else {
830247a4 724 cl_anything(pRExC_state, cl);
653099ff
GS
725 }
726 } else {
727 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
728 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 729 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
730 || (cl->flags & ANYOF_FOLD)) ) {
731 int i;
732
733 /* OR char bitmap and class bitmap separately */
734 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
735 cl->bitmap[i] |= or_with->bitmap[i];
736 if (or_with->flags & ANYOF_CLASS) {
737 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
738 cl->classflags[i] |= or_with->classflags[i];
739 cl->flags |= ANYOF_CLASS;
740 }
741 }
742 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 743 cl_anything(pRExC_state, cl);
653099ff
GS
744 }
745 }
746 if (or_with->flags & ANYOF_EOS)
747 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
748
749 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
750 ARG(cl) != ARG(or_with)) {
751 cl->flags |= ANYOF_UNICODE_ALL;
752 cl->flags &= ~ANYOF_UNICODE;
753 }
754 if (or_with->flags & ANYOF_UNICODE_ALL) {
755 cl->flags |= ANYOF_UNICODE_ALL;
756 cl->flags &= ~ANYOF_UNICODE;
757 }
653099ff
GS
758}
759
a3621e74
YO
760#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
761#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
762#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
763#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
764
3dab1dad
YO
765
766#ifdef DEBUGGING
07be1b83 767/*
3dab1dad
YO
768 dump_trie(trie)
769 dump_trie_interim_list(trie,next_alloc)
770 dump_trie_interim_table(trie,next_alloc)
771
772 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
773 The _interim_ variants are used for debugging the interim
774 tables that are used to generate the final compressed
775 representation which is what dump_trie expects.
776
3dab1dad
YO
777 Part of the reason for their existance is to provide a form
778 of documentation as to how the different representations function.
07be1b83
YO
779
780*/
3dab1dad
YO
781
782/*
783 dump_trie(trie)
784 Dumps the final compressed table form of the trie to Perl_debug_log.
785 Used for debugging make_trie().
786*/
787
788STATIC void
789S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
790{
791 U32 state;
ab3bbdeb
YO
792 SV *sv=sv_newmortal();
793 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
794 GET_RE_DEBUG_FLAGS_DECL;
795
ab3bbdeb 796
3dab1dad
YO
797 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
798 (int)depth * 2 + 2,"",
799 "Match","Base","Ofs" );
800
801 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 802 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 803 if ( tmp ) {
ab3bbdeb
YO
804 PerlIO_printf( Perl_debug_log, "%*s",
805 colwidth,
ddc5bc0f 806 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
807 PL_colors[0], PL_colors[1],
808 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
809 PERL_PV_ESCAPE_FIRSTCHAR
810 )
811 );
3dab1dad
YO
812 }
813 }
814 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
815 (int)depth * 2 + 2,"");
816
817 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 818 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "\n");
820
786e8c11 821 for( state = 1 ; state < trie->laststate ; state++ ) {
be8e71aa 822 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
823
824 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
825
826 if ( trie->states[ state ].wordnum ) {
827 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
828 } else {
829 PerlIO_printf( Perl_debug_log, "%6s", "" );
830 }
831
832 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
833
834 if ( base ) {
835 U32 ofs = 0;
836
837 while( ( base + ofs < trie->uniquecharcount ) ||
838 ( base + ofs - trie->uniquecharcount < trie->lasttrans
839 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
840 ofs++;
841
842 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
843
844 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
845 if ( ( base + ofs >= trie->uniquecharcount ) &&
846 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
847 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
848 {
ab3bbdeb
YO
849 PerlIO_printf( Perl_debug_log, "%*"UVXf,
850 colwidth,
3dab1dad
YO
851 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
852 } else {
ab3bbdeb 853 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
854 }
855 }
856
857 PerlIO_printf( Perl_debug_log, "]");
858
859 }
860 PerlIO_printf( Perl_debug_log, "\n" );
861 }
862}
863/*
864 dump_trie_interim_list(trie,next_alloc)
865 Dumps a fully constructed but uncompressed trie in list form.
866 List tries normally only are used for construction when the number of
867 possible chars (trie->uniquecharcount) is very high.
868 Used for debugging make_trie().
869*/
870STATIC void
871S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
872{
873 U32 state;
ab3bbdeb
YO
874 SV *sv=sv_newmortal();
875 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
876 GET_RE_DEBUG_FLAGS_DECL;
877 /* print out the table precompression. */
ab3bbdeb
YO
878 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
879 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
880 "------:-----+-----------------\n" );
3dab1dad
YO
881
882 for( state=1 ; state < next_alloc ; state ++ ) {
883 U16 charid;
884
ab3bbdeb 885 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
886 (int)depth * 2 + 2,"", (UV)state );
887 if ( ! trie->states[ state ].wordnum ) {
888 PerlIO_printf( Perl_debug_log, "%5s| ","");
889 } else {
890 PerlIO_printf( Perl_debug_log, "W%4x| ",
891 trie->states[ state ].wordnum
892 );
893 }
894 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 895 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
896 if ( tmp ) {
897 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
898 colwidth,
ddc5bc0f 899 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
900 PL_colors[0], PL_colors[1],
901 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
902 PERL_PV_ESCAPE_FIRSTCHAR
903 ) ,
3dab1dad
YO
904 TRIE_LIST_ITEM(state,charid).forid,
905 (UV)TRIE_LIST_ITEM(state,charid).newstate
906 );
907 }
ab3bbdeb
YO
908 }
909 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
910 }
911}
912
913/*
914 dump_trie_interim_table(trie,next_alloc)
915 Dumps a fully constructed but uncompressed trie in table form.
916 This is the normal DFA style state transition table, with a few
917 twists to facilitate compression later.
918 Used for debugging make_trie().
919*/
920STATIC void
921S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
922{
923 U32 state;
924 U16 charid;
ab3bbdeb
YO
925 SV *sv=sv_newmortal();
926 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
927 GET_RE_DEBUG_FLAGS_DECL;
928
929 /*
930 print out the table precompression so that we can do a visual check
931 that they are identical.
932 */
933
934 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
935
936 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 937 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 938 if ( tmp ) {
ab3bbdeb
YO
939 PerlIO_printf( Perl_debug_log, "%*s",
940 colwidth,
ddc5bc0f 941 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
942 PL_colors[0], PL_colors[1],
943 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
944 PERL_PV_ESCAPE_FIRSTCHAR
945 )
946 );
3dab1dad
YO
947 }
948 }
949
950 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
951
952 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 953 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
954 }
955
956 PerlIO_printf( Perl_debug_log, "\n" );
957
958 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
959
960 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
961 (int)depth * 2 + 2,"",
962 (UV)TRIE_NODENUM( state ) );
963
964 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
965 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
966 if (v)
967 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
968 else
969 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
970 }
971 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
972 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
973 } else {
974 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
975 trie->states[ TRIE_NODENUM( state ) ].wordnum );
976 }
977 }
07be1b83 978}
3dab1dad
YO
979
980#endif
981
786e8c11
YO
982/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
983 startbranch: the first branch in the whole branch sequence
984 first : start branch of sequence of branch-exact nodes.
985 May be the same as startbranch
986 last : Thing following the last branch.
987 May be the same as tail.
988 tail : item following the branch sequence
989 count : words in the sequence
990 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
991 depth : indent depth
3dab1dad 992
786e8c11 993Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 994
786e8c11
YO
995A trie is an N'ary tree where the branches are determined by digital
996decomposition of the key. IE, at the root node you look up the 1st character and
997follow that branch repeat until you find the end of the branches. Nodes can be
998marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 999
786e8c11 1000 /he|she|his|hers/
72f13be8 1001
786e8c11
YO
1002would convert into the following structure. Numbers represent states, letters
1003following numbers represent valid transitions on the letter from that state, if
1004the number is in square brackets it represents an accepting state, otherwise it
1005will be in parenthesis.
07be1b83 1006
786e8c11
YO
1007 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1008 | |
1009 | (2)
1010 | |
1011 (1) +-i->(6)-+-s->[7]
1012 |
1013 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1014
786e8c11
YO
1015 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1016
1017This shows that when matching against the string 'hers' we will begin at state 1
1018read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1019then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1020is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1021single traverse. We store a mapping from accepting to state to which word was
1022matched, and then when we have multiple possibilities we try to complete the
1023rest of the regex in the order in which they occured in the alternation.
1024
1025The only prior NFA like behaviour that would be changed by the TRIE support is
1026the silent ignoring of duplicate alternations which are of the form:
1027
1028 / (DUPE|DUPE) X? (?{ ... }) Y /x
1029
1030Thus EVAL blocks follwing a trie may be called a different number of times with
1031and without the optimisation. With the optimisations dupes will be silently
1032ignored. This inconsistant behaviour of EVAL type nodes is well established as
1033the following demonstrates:
1034
1035 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1036
1037which prints out 'word' three times, but
1038
1039 'words'=~/(word|word|word)(?{ print $1 })S/
1040
1041which doesnt print it out at all. This is due to other optimisations kicking in.
1042
1043Example of what happens on a structural level:
1044
1045The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1046
1047 1: CURLYM[1] {1,32767}(18)
1048 5: BRANCH(8)
1049 6: EXACT <ac>(16)
1050 8: BRANCH(11)
1051 9: EXACT <ad>(16)
1052 11: BRANCH(14)
1053 12: EXACT <ab>(16)
1054 16: SUCCEED(0)
1055 17: NOTHING(18)
1056 18: END(0)
1057
1058This would be optimizable with startbranch=5, first=5, last=16, tail=16
1059and should turn into:
1060
1061 1: CURLYM[1] {1,32767}(18)
1062 5: TRIE(16)
1063 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1064 <ac>
1065 <ad>
1066 <ab>
1067 16: SUCCEED(0)
1068 17: NOTHING(18)
1069 18: END(0)
1070
1071Cases where tail != last would be like /(?foo|bar)baz/:
1072
1073 1: BRANCH(4)
1074 2: EXACT <foo>(8)
1075 4: BRANCH(7)
1076 5: EXACT <bar>(8)
1077 7: TAIL(8)
1078 8: EXACT <baz>(10)
1079 10: END(0)
1080
1081which would be optimizable with startbranch=1, first=1, last=7, tail=8
1082and would end up looking like:
1083
1084 1: TRIE(8)
1085 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1086 <foo>
1087 <bar>
1088 7: TAIL(8)
1089 8: EXACT <baz>(10)
1090 10: END(0)
1091
1092 d = uvuni_to_utf8_flags(d, uv, 0);
1093
1094is the recommended Unicode-aware way of saying
1095
1096 *(d++) = uv;
1097*/
1098
1099#define TRIE_STORE_REVCHAR \
1100 STMT_START { \
1101 SV *tmp = Perl_newSVpvf_nocontext( "%c", (int)uvc ); \
1102 if (UTF) SvUTF8_on(tmp); \
1103 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1104 } STMT_END
1105
1106#define TRIE_READ_CHAR STMT_START { \
1107 wordlen++; \
1108 if ( UTF ) { \
1109 if ( folder ) { \
1110 if ( foldlen > 0 ) { \
1111 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1112 foldlen -= len; \
1113 scan += len; \
1114 len = 0; \
1115 } else { \
1116 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1117 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1118 foldlen -= UNISKIP( uvc ); \
1119 scan = foldbuf + UNISKIP( uvc ); \
1120 } \
1121 } else { \
1122 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1123 } \
1124 } else { \
1125 uvc = (U32)*uc; \
1126 len = 1; \
1127 } \
1128} STMT_END
1129
1130
1131
1132#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1133 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1134 TRIE_LIST_LEN( state ) *= 2; \
1135 Renew( trie->states[ state ].trans.list, \
1136 TRIE_LIST_LEN( state ), reg_trie_trans_le ); \
1137 } \
1138 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1139 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1140 TRIE_LIST_CUR( state )++; \
1141} STMT_END
07be1b83 1142
786e8c11
YO
1143#define TRIE_LIST_NEW(state) STMT_START { \
1144 Newxz( trie->states[ state ].trans.list, \
1145 4, reg_trie_trans_le ); \
1146 TRIE_LIST_CUR( state ) = 1; \
1147 TRIE_LIST_LEN( state ) = 4; \
1148} STMT_END
07be1b83 1149
786e8c11
YO
1150#define TRIE_HANDLE_WORD(state) STMT_START { \
1151 U16 dupe= trie->states[ state ].wordnum; \
1152 regnode * const noper_next = regnext( noper ); \
1153 \
1154 if (trie->wordlen) \
1155 trie->wordlen[ curword ] = wordlen; \
1156 DEBUG_r({ \
1157 /* store the word for dumping */ \
1158 SV* tmp; \
1159 if (OP(noper) != NOTHING) \
1160 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1161 else \
1162 tmp = newSVpvn( "", 0 ); \
1163 if ( UTF ) SvUTF8_on( tmp ); \
1164 av_push( trie->words, tmp ); \
1165 }); \
1166 \
1167 curword++; \
1168 \
1169 if ( noper_next < tail ) { \
1170 if (!trie->jump) \
1171 Newxz( trie->jump, word_count + 1, U16); \
1172 trie->jump[curword] = (U16)(tail - noper_next); \
1173 if (!jumper) \
1174 jumper = noper_next; \
1175 if (!nextbranch) \
1176 nextbranch= regnext(cur); \
1177 } \
1178 \
1179 if ( dupe ) { \
1180 /* So it's a dupe. This means we need to maintain a */\
1181 /* linked-list from the first to the next. */\
1182 /* we only allocate the nextword buffer when there */\
1183 /* a dupe, so first time we have to do the allocation */\
1184 if (!trie->nextword) \
1185 Newxz( trie->nextword, word_count + 1, U16); \
1186 while ( trie->nextword[dupe] ) \
1187 dupe= trie->nextword[dupe]; \
1188 trie->nextword[dupe]= curword; \
1189 } else { \
1190 /* we haven't inserted this word yet. */ \
1191 trie->states[ state ].wordnum = curword; \
1192 } \
1193} STMT_END
07be1b83 1194
3dab1dad 1195
786e8c11
YO
1196#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1197 ( ( base + charid >= ucharcount \
1198 && base + charid < ubound \
1199 && state == trie->trans[ base - ucharcount + charid ].check \
1200 && trie->trans[ base - ucharcount + charid ].next ) \
1201 ? trie->trans[ base - ucharcount + charid ].next \
1202 : ( state==1 ? special : 0 ) \
1203 )
3dab1dad 1204
786e8c11
YO
1205#define MADE_TRIE 1
1206#define MADE_JUMP_TRIE 2
1207#define MADE_EXACT_TRIE 4
3dab1dad 1208
a3621e74 1209STATIC I32
786e8c11 1210S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1211{
27da23d5 1212 dVAR;
a3621e74
YO
1213 /* first pass, loop through and scan words */
1214 reg_trie_data *trie;
1215 regnode *cur;
9f7f3913 1216 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1217 STRLEN len = 0;
1218 UV uvc = 0;
1219 U16 curword = 0;
1220 U32 next_alloc = 0;
786e8c11
YO
1221 regnode *jumper = NULL;
1222 regnode *nextbranch = NULL;
a3621e74 1223 /* we just use folder as a flag in utf8 */
e1ec3a88 1224 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1225 ? PL_fold
1226 : ( flags == EXACTFL
1227 ? PL_fold_locale
1228 : NULL
1229 )
1230 );
1231
e1ec3a88 1232 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1233 SV *re_trie_maxbuff;
3dab1dad
YO
1234#ifndef DEBUGGING
1235 /* these are only used during construction but are useful during
8e11feef 1236 * debugging so we store them in the struct when debugging.
8e11feef 1237 */
3dab1dad 1238 STRLEN trie_charcount=0;
3dab1dad
YO
1239 AV *trie_revcharmap;
1240#endif
a3621e74 1241 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1242#ifndef DEBUGGING
1243 PERL_UNUSED_ARG(depth);
1244#endif
a3621e74 1245
a02a5408 1246 Newxz( trie, 1, reg_trie_data );
a3621e74 1247 trie->refcount = 1;
3dab1dad 1248 trie->startstate = 1;
786e8c11 1249 trie->wordcount = word_count;
a3621e74 1250 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1251 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1252 if (!(UTF && folder))
1253 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1254 DEBUG_r({
1255 trie->words = newAV();
a3621e74 1256 });
3dab1dad 1257 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1258
0111c4fd 1259 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1260 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1261 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1262 }
3dab1dad
YO
1263 DEBUG_OPTIMISE_r({
1264 PerlIO_printf( Perl_debug_log,
786e8c11 1265 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1266 (int)depth * 2 + 2, "",
1267 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1268 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1269 (int)depth);
3dab1dad 1270 });
a3621e74
YO
1271 /* -- First loop and Setup --
1272
1273 We first traverse the branches and scan each word to determine if it
1274 contains widechars, and how many unique chars there are, this is
1275 important as we have to build a table with at least as many columns as we
1276 have unique chars.
1277
1278 We use an array of integers to represent the character codes 0..255
1279 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1280 native representation of the character value as the key and IV's for the
1281 coded index.
1282
1283 *TODO* If we keep track of how many times each character is used we can
1284 remap the columns so that the table compression later on is more
1285 efficient in terms of memory by ensuring most common value is in the
1286 middle and the least common are on the outside. IMO this would be better
1287 than a most to least common mapping as theres a decent chance the most
1288 common letter will share a node with the least common, meaning the node
1289 will not be compressable. With a middle is most common approach the worst
1290 case is when we have the least common nodes twice.
1291
1292 */
1293
a3621e74 1294 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1295 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1296 const U8 *uc = (U8*)STRING( noper );
a28509cc 1297 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1298 STRLEN foldlen = 0;
1299 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1300 const U8 *scan = (U8*)NULL;
07be1b83 1301 U32 wordlen = 0; /* required init */
3dab1dad 1302 STRLEN chars=0;
a3621e74 1303
3dab1dad
YO
1304 if (OP(noper) == NOTHING) {
1305 trie->minlen= 0;
1306 continue;
1307 }
1308 if (trie->bitmap) {
1309 TRIE_BITMAP_SET(trie,*uc);
1310 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1311 }
a3621e74 1312 for ( ; uc < e ; uc += len ) {
3dab1dad 1313 TRIE_CHARCOUNT(trie)++;
a3621e74 1314 TRIE_READ_CHAR;
3dab1dad 1315 chars++;
a3621e74
YO
1316 if ( uvc < 256 ) {
1317 if ( !trie->charmap[ uvc ] ) {
1318 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1319 if ( folder )
1320 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1321 TRIE_STORE_REVCHAR;
a3621e74
YO
1322 }
1323 } else {
1324 SV** svpp;
1325 if ( !trie->widecharmap )
1326 trie->widecharmap = newHV();
1327
1328 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1329
1330 if ( !svpp )
e4584336 1331 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1332
1333 if ( !SvTRUE( *svpp ) ) {
1334 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1335 TRIE_STORE_REVCHAR;
a3621e74
YO
1336 }
1337 }
1338 }
3dab1dad
YO
1339 if( cur == first ) {
1340 trie->minlen=chars;
1341 trie->maxlen=chars;
1342 } else if (chars < trie->minlen) {
1343 trie->minlen=chars;
1344 } else if (chars > trie->maxlen) {
1345 trie->maxlen=chars;
1346 }
1347
a3621e74
YO
1348 } /* end first pass */
1349 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1350 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1351 (int)depth * 2 + 2,"",
85c3142d 1352 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1353 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1354 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1355 );
786e8c11 1356 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1357
1358 /*
1359 We now know what we are dealing with in terms of unique chars and
1360 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1361 representation using a flat table will take. If it's over a reasonable
1362 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1363 conservative but potentially much slower representation using an array
1364 of lists.
1365
1366 At the end we convert both representations into the same compressed
1367 form that will be used in regexec.c for matching with. The latter
1368 is a form that cannot be used to construct with but has memory
1369 properties similar to the list form and access properties similar
1370 to the table form making it both suitable for fast searches and
1371 small enough that its feasable to store for the duration of a program.
1372
1373 See the comment in the code where the compressed table is produced
1374 inplace from the flat tabe representation for an explanation of how
1375 the compression works.
1376
1377 */
1378
1379
3dab1dad 1380 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1381 /*
1382 Second Pass -- Array Of Lists Representation
1383
1384 Each state will be represented by a list of charid:state records
1385 (reg_trie_trans_le) the first such element holds the CUR and LEN
1386 points of the allocated array. (See defines above).
1387
1388 We build the initial structure using the lists, and then convert
1389 it into the compressed table form which allows faster lookups
1390 (but cant be modified once converted).
a3621e74
YO
1391 */
1392
a3621e74
YO
1393 STRLEN transcount = 1;
1394
3dab1dad 1395 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1396 TRIE_LIST_NEW(1);
1397 next_alloc = 2;
1398
1399 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1400
c445ea15
AL
1401 regnode * const noper = NEXTOPER( cur );
1402 U8 *uc = (U8*)STRING( noper );
1403 const U8 * const e = uc + STR_LEN( noper );
1404 U32 state = 1; /* required init */
1405 U16 charid = 0; /* sanity init */
1406 U8 *scan = (U8*)NULL; /* sanity init */
1407 STRLEN foldlen = 0; /* required init */
07be1b83 1408 U32 wordlen = 0; /* required init */
c445ea15
AL
1409 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1410
3dab1dad 1411 if (OP(noper) != NOTHING) {
786e8c11 1412 for ( ; uc < e ; uc += len ) {
c445ea15 1413
786e8c11 1414 TRIE_READ_CHAR;
c445ea15 1415
786e8c11
YO
1416 if ( uvc < 256 ) {
1417 charid = trie->charmap[ uvc ];
c445ea15 1418 } else {
786e8c11
YO
1419 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1420 if ( !svpp ) {
1421 charid = 0;
1422 } else {
1423 charid=(U16)SvIV( *svpp );
1424 }
c445ea15 1425 }
786e8c11
YO
1426 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1427 if ( charid ) {
a3621e74 1428
786e8c11
YO
1429 U16 check;
1430 U32 newstate = 0;
a3621e74 1431
786e8c11
YO
1432 charid--;
1433 if ( !trie->states[ state ].trans.list ) {
1434 TRIE_LIST_NEW( state );
c445ea15 1435 }
786e8c11
YO
1436 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1437 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1438 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1439 break;
1440 }
1441 }
1442 if ( ! newstate ) {
1443 newstate = next_alloc++;
1444 TRIE_LIST_PUSH( state, charid, newstate );
1445 transcount++;
1446 }
1447 state = newstate;
1448 } else {
1449 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1450 }
a28509cc 1451 }
c445ea15 1452 }
3dab1dad 1453 TRIE_HANDLE_WORD(state);
a3621e74
YO
1454
1455 } /* end second pass */
1456
786e8c11 1457 trie->laststate = next_alloc;
a3621e74
YO
1458 Renew( trie->states, next_alloc, reg_trie_state );
1459
3dab1dad
YO
1460 /* and now dump it out before we compress it */
1461 DEBUG_TRIE_COMPILE_MORE_r(
1462 dump_trie_interim_list(trie,next_alloc,depth+1)
a3621e74 1463 );
a3621e74 1464
a02a5408 1465 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1466 {
1467 U32 state;
a3621e74
YO
1468 U32 tp = 0;
1469 U32 zp = 0;
1470
1471
1472 for( state=1 ; state < next_alloc ; state ++ ) {
1473 U32 base=0;
1474
1475 /*
1476 DEBUG_TRIE_COMPILE_MORE_r(
1477 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1478 );
1479 */
1480
1481 if (trie->states[state].trans.list) {
1482 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1483 U16 maxid=minid;
a28509cc 1484 U16 idx;
a3621e74
YO
1485
1486 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1487 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1488 if ( forid < minid ) {
1489 minid=forid;
1490 } else if ( forid > maxid ) {
1491 maxid=forid;
1492 }
a3621e74
YO
1493 }
1494 if ( transcount < tp + maxid - minid + 1) {
1495 transcount *= 2;
1496 Renew( trie->trans, transcount, reg_trie_trans );
1497 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1498 }
1499 base = trie->uniquecharcount + tp - minid;
1500 if ( maxid == minid ) {
1501 U32 set = 0;
1502 for ( ; zp < tp ; zp++ ) {
1503 if ( ! trie->trans[ zp ].next ) {
1504 base = trie->uniquecharcount + zp - minid;
1505 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1506 trie->trans[ zp ].check = state;
1507 set = 1;
1508 break;
1509 }
1510 }
1511 if ( !set ) {
1512 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1513 trie->trans[ tp ].check = state;
1514 tp++;
1515 zp = tp;
1516 }
1517 } else {
1518 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1519 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1520 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1521 trie->trans[ tid ].check = state;
1522 }
1523 tp += ( maxid - minid + 1 );
1524 }
1525 Safefree(trie->states[ state ].trans.list);
1526 }
1527 /*
1528 DEBUG_TRIE_COMPILE_MORE_r(
1529 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1530 );
1531 */
1532 trie->states[ state ].trans.base=base;
1533 }
cc601c31 1534 trie->lasttrans = tp + 1;
a3621e74
YO
1535 }
1536 } else {
1537 /*
1538 Second Pass -- Flat Table Representation.
1539
1540 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1541 We know that we will need Charcount+1 trans at most to store the data
1542 (one row per char at worst case) So we preallocate both structures
1543 assuming worst case.
1544
1545 We then construct the trie using only the .next slots of the entry
1546 structs.
1547
1548 We use the .check field of the first entry of the node temporarily to
1549 make compression both faster and easier by keeping track of how many non
1550 zero fields are in the node.
1551
1552 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1553 transition.
1554
1555 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1556 number representing the first entry of the node, and state as a
1557 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1558 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1559 are 2 entrys per node. eg:
1560
1561 A B A B
1562 1. 2 4 1. 3 7
1563 2. 0 3 3. 0 5
1564 3. 0 0 5. 0 0
1565 4. 0 0 7. 0 0
1566
1567 The table is internally in the right hand, idx form. However as we also
1568 have to deal with the states array which is indexed by nodenum we have to
1569 use TRIE_NODENUM() to convert.
1570
1571 */
1572
3dab1dad
YO
1573
1574 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1575 reg_trie_trans );
3dab1dad 1576 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1577 next_alloc = trie->uniquecharcount + 1;
1578
3dab1dad 1579
a3621e74
YO
1580 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1581
c445ea15 1582 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1583 const U8 *uc = (U8*)STRING( noper );
1584 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1585
1586 U32 state = 1; /* required init */
1587
1588 U16 charid = 0; /* sanity init */
1589 U32 accept_state = 0; /* sanity init */
1590 U8 *scan = (U8*)NULL; /* sanity init */
1591
1592 STRLEN foldlen = 0; /* required init */
07be1b83 1593 U32 wordlen = 0; /* required init */
a3621e74
YO
1594 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1595
3dab1dad 1596 if ( OP(noper) != NOTHING ) {
786e8c11 1597 for ( ; uc < e ; uc += len ) {
a3621e74 1598
786e8c11 1599 TRIE_READ_CHAR;
a3621e74 1600
786e8c11
YO
1601 if ( uvc < 256 ) {
1602 charid = trie->charmap[ uvc ];
1603 } else {
1604 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1605 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1606 }
786e8c11
YO
1607 if ( charid ) {
1608 charid--;
1609 if ( !trie->trans[ state + charid ].next ) {
1610 trie->trans[ state + charid ].next = next_alloc;
1611 trie->trans[ state ].check++;
1612 next_alloc += trie->uniquecharcount;
1613 }
1614 state = trie->trans[ state + charid ].next;
1615 } else {
1616 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1617 }
1618 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1619 }
a3621e74 1620 }
3dab1dad
YO
1621 accept_state = TRIE_NODENUM( state );
1622 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1623
1624 } /* end second pass */
1625
3dab1dad
YO
1626 /* and now dump it out before we compress it */
1627 DEBUG_TRIE_COMPILE_MORE_r(
1628 dump_trie_interim_table(trie,next_alloc,depth+1)
1629 );
a3621e74 1630
a3621e74
YO
1631 {
1632 /*
1633 * Inplace compress the table.*
1634
1635 For sparse data sets the table constructed by the trie algorithm will
1636 be mostly 0/FAIL transitions or to put it another way mostly empty.
1637 (Note that leaf nodes will not contain any transitions.)
1638
1639 This algorithm compresses the tables by eliminating most such
1640 transitions, at the cost of a modest bit of extra work during lookup:
1641
1642 - Each states[] entry contains a .base field which indicates the
1643 index in the state[] array wheres its transition data is stored.
1644
1645 - If .base is 0 there are no valid transitions from that node.
1646
1647 - If .base is nonzero then charid is added to it to find an entry in
1648 the trans array.
1649
1650 -If trans[states[state].base+charid].check!=state then the
1651 transition is taken to be a 0/Fail transition. Thus if there are fail
1652 transitions at the front of the node then the .base offset will point
1653 somewhere inside the previous nodes data (or maybe even into a node
1654 even earlier), but the .check field determines if the transition is
1655 valid.
1656
786e8c11 1657 XXX - wrong maybe?
a3621e74
YO
1658 The following process inplace converts the table to the compressed
1659 table: We first do not compress the root node 1,and mark its all its
1660 .check pointers as 1 and set its .base pointer as 1 as well. This
1661 allows to do a DFA construction from the compressed table later, and
1662 ensures that any .base pointers we calculate later are greater than
1663 0.
1664
1665 - We set 'pos' to indicate the first entry of the second node.
1666
1667 - We then iterate over the columns of the node, finding the first and
1668 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1669 and set the .check pointers accordingly, and advance pos
1670 appropriately and repreat for the next node. Note that when we copy
1671 the next pointers we have to convert them from the original
1672 NODEIDX form to NODENUM form as the former is not valid post
1673 compression.
1674
1675 - If a node has no transitions used we mark its base as 0 and do not
1676 advance the pos pointer.
1677
1678 - If a node only has one transition we use a second pointer into the
1679 structure to fill in allocated fail transitions from other states.
1680 This pointer is independent of the main pointer and scans forward
1681 looking for null transitions that are allocated to a state. When it
1682 finds one it writes the single transition into the "hole". If the
786e8c11 1683 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1684
1685 - Once compressed we can Renew/realloc the structures to release the
1686 excess space.
1687
1688 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1689 specifically Fig 3.47 and the associated pseudocode.
1690
1691 demq
1692 */
a3b680e6 1693 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1694 U32 state, charid;
a3621e74 1695 U32 pos = 0, zp=0;
786e8c11 1696 trie->laststate = laststate;
a3621e74
YO
1697
1698 for ( state = 1 ; state < laststate ; state++ ) {
1699 U8 flag = 0;
a28509cc
AL
1700 const U32 stateidx = TRIE_NODEIDX( state );
1701 const U32 o_used = trie->trans[ stateidx ].check;
1702 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1703 trie->trans[ stateidx ].check = 0;
1704
1705 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1706 if ( flag || trie->trans[ stateidx + charid ].next ) {
1707 if ( trie->trans[ stateidx + charid ].next ) {
1708 if (o_used == 1) {
1709 for ( ; zp < pos ; zp++ ) {
1710 if ( ! trie->trans[ zp ].next ) {
1711 break;
1712 }
1713 }
1714 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1715 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1716 trie->trans[ zp ].check = state;
1717 if ( ++zp > pos ) pos = zp;
1718 break;
1719 }
1720 used--;
1721 }
1722 if ( !flag ) {
1723 flag = 1;
1724 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1725 }
1726 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1727 trie->trans[ pos ].check = state;
1728 pos++;
1729 }
1730 }
1731 }
cc601c31 1732 trie->lasttrans = pos + 1;
a3621e74
YO
1733 Renew( trie->states, laststate + 1, reg_trie_state);
1734 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1735 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1736 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1737 (int)depth * 2 + 2,"",
1738 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1739 (IV)next_alloc,
1740 (IV)pos,
a3621e74
YO
1741 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1742 );
1743
1744 } /* end table compress */
1745 }
cc601c31
YO
1746 /* resize the trans array to remove unused space */
1747 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1748
3dab1dad
YO
1749 /* and now dump out the compressed format */
1750 DEBUG_TRIE_COMPILE_r(
1751 dump_trie(trie,depth+1)
1752 );
07be1b83 1753
3dab1dad
YO
1754 { /* Modify the program and insert the new TRIE node*/
1755 regnode *convert;
1756 U8 nodetype =(U8)(flags & 0xFF);
1757 char *str=NULL;
786e8c11 1758
07be1b83 1759#ifdef DEBUGGING
a5ca303d 1760 regnode *optimize;
b57a0404
JH
1761 U32 mjd_offset = 0;
1762 U32 mjd_nodelen = 0;
07be1b83 1763#endif
a3621e74 1764 /*
3dab1dad
YO
1765 This means we convert either the first branch or the first Exact,
1766 depending on whether the thing following (in 'last') is a branch
1767 or not and whther first is the startbranch (ie is it a sub part of
1768 the alternation or is it the whole thing.)
1769 Assuming its a sub part we conver the EXACT otherwise we convert
1770 the whole branch sequence, including the first.
a3621e74 1771 */
3dab1dad
YO
1772 /* Find the node we are going to overwrite */
1773 if ( first == startbranch && OP( last ) != BRANCH ) {
07be1b83 1774 /* whole branch chain */
3dab1dad 1775 convert = first;
07be1b83
YO
1776 DEBUG_r({
1777 const regnode *nop = NEXTOPER( convert );
1778 mjd_offset= Node_Offset((nop));
1779 mjd_nodelen= Node_Length((nop));
1780 });
1781 } else {
1782 /* branch sub-chain */
3dab1dad
YO
1783 convert = NEXTOPER( first );
1784 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1785 DEBUG_r({
1786 mjd_offset= Node_Offset((convert));
1787 mjd_nodelen= Node_Length((convert));
1788 });
1789 }
1790 DEBUG_OPTIMISE_r(
1791 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1792 (int)depth * 2 + 2, "",
786e8c11 1793 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1794 );
a3621e74 1795
3dab1dad
YO
1796 /* But first we check to see if there is a common prefix we can
1797 split out as an EXACT and put in front of the TRIE node. */
1798 trie->startstate= 1;
786e8c11 1799 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad
YO
1800 U32 state;
1801 DEBUG_OPTIMISE_r(
8e11feef
RGS
1802 PerlIO_printf(Perl_debug_log, "%*sLaststate:%"UVuf"\n",
1803 (int)depth * 2 + 2, "",
786e8c11 1804 (UV)trie->laststate)
8e11feef 1805 );
786e8c11 1806 for ( state = 1 ; state < trie->laststate-1 ; state++ ) {
a3621e74 1807 U32 ofs = 0;
8e11feef
RGS
1808 I32 idx = -1;
1809 U32 count = 0;
1810 const U32 base = trie->states[ state ].trans.base;
a3621e74 1811
3dab1dad 1812 if ( trie->states[state].wordnum )
8e11feef 1813 count = 1;
a3621e74 1814
8e11feef 1815 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1816 if ( ( base + ofs >= trie->uniquecharcount ) &&
1817 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1818 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1819 {
3dab1dad 1820 if ( ++count > 1 ) {
8e11feef 1821 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1822 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1823 if ( state == 1 ) break;
3dab1dad
YO
1824 if ( count == 2 ) {
1825 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1826 DEBUG_OPTIMISE_r(
8e11feef
RGS
1827 PerlIO_printf(Perl_debug_log,
1828 "%*sNew Start State=%"UVuf" Class: [",
1829 (int)depth * 2 + 2, "",
786e8c11 1830 (UV)state));
be8e71aa
YO
1831 if (idx >= 0) {
1832 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1833 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1834
3dab1dad 1835 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1836 if ( folder )
1837 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1838 DEBUG_OPTIMISE_r(
07be1b83 1839 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1840 );
8e11feef
RGS
1841 }
1842 }
1843 TRIE_BITMAP_SET(trie,*ch);
1844 if ( folder )
1845 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1846 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1847 }
1848 idx = ofs;
1849 }
3dab1dad
YO
1850 }
1851 if ( count == 1 ) {
1852 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1853 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1854 DEBUG_OPTIMISE_r(
8e11feef
RGS
1855 PerlIO_printf( Perl_debug_log,
1856 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1857 (int)depth * 2 + 2, "",
786e8c11 1858 (UV)state, (UV)idx, ch)
3dab1dad
YO
1859 );
1860 if ( state==1 ) {
1861 OP( convert ) = nodetype;
1862 str=STRING(convert);
1863 STR_LEN(convert)=0;
1864 }
1865 *str++=*ch;
1866 STR_LEN(convert)++;
a3621e74 1867
8e11feef 1868 } else {
f9049ba1 1869#ifdef DEBUGGING
8e11feef
RGS
1870 if (state>1)
1871 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1872#endif
8e11feef
RGS
1873 break;
1874 }
1875 }
3dab1dad 1876 if (str) {
8e11feef 1877 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1878 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1879 trie->startstate = state;
07be1b83
YO
1880 trie->minlen -= (state - 1);
1881 trie->maxlen -= (state - 1);
1882 DEBUG_r({
1883 regnode *fix = convert;
1884 mjd_nodelen++;
1885 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1886 while( ++fix < n ) {
1887 Set_Node_Offset_Length(fix, 0, 0);
1888 }
1889 });
8e11feef
RGS
1890 if (trie->maxlen) {
1891 convert = n;
1892 } else {
3dab1dad 1893 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1894 DEBUG_r(optimize= n);
3dab1dad
YO
1895 }
1896 }
1897 }
a5ca303d
YO
1898 if (!jumper)
1899 jumper = last;
3dab1dad 1900 if ( trie->maxlen ) {
8e11feef
RGS
1901 NEXT_OFF( convert ) = (U16)(tail - convert);
1902 ARG_SET( convert, data_slot );
786e8c11
YO
1903 /* Store the offset to the first unabsorbed branch in
1904 jump[0], which is otherwise unused by the jump logic.
1905 We use this when dumping a trie and during optimisation. */
1906 if (trie->jump)
1907 trie->jump[0] = (U16)(tail - nextbranch);
a5ca303d 1908
786e8c11
YO
1909 /* XXXX */
1910 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1911 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1912 {
1913 OP( convert ) = TRIEC;
1914 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1915 Safefree(trie->bitmap);
1916 trie->bitmap= NULL;
1917 } else
1918 OP( convert ) = TRIE;
a3621e74 1919
3dab1dad
YO
1920 /* store the type in the flags */
1921 convert->flags = nodetype;
a5ca303d
YO
1922 DEBUG_r({
1923 optimize = convert
1924 + NODE_STEP_REGNODE
1925 + regarglen[ OP( convert ) ];
1926 });
1927 /* XXX We really should free up the resource in trie now,
1928 as we won't use them - (which resources?) dmq */
3dab1dad 1929 }
a3621e74
YO
1930 /* needed for dumping*/
1931 DEBUG_r({
07be1b83
YO
1932 regnode *opt = convert;
1933 while (++opt<optimize) {
1934 Set_Node_Offset_Length(opt,0,0);
1935 }
786e8c11
YO
1936 /*
1937 Try to clean up some of the debris left after the
1938 optimisation.
a3621e74 1939 */
786e8c11 1940 while( optimize < jumper ) {
07be1b83 1941 mjd_nodelen += Node_Length((optimize));
a3621e74 1942 OP( optimize ) = OPTIMIZED;
07be1b83 1943 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1944 optimize++;
1945 }
07be1b83 1946 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1947 });
1948 } /* end node insert */
07be1b83 1949#ifndef DEBUGGING
6e8b4190 1950 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1951#endif
786e8c11
YO
1952 return trie->jump
1953 ? MADE_JUMP_TRIE
1954 : trie->startstate>1
1955 ? MADE_EXACT_TRIE
1956 : MADE_TRIE;
1957}
1958
1959STATIC void
1960S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
1961{
1962/* The Trie is constructed and compressed now so we can build a fail array now if its needed
1963
1964 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
1965 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
1966 ISBN 0-201-10088-6
1967
1968 We find the fail state for each state in the trie, this state is the longest proper
1969 suffix of the current states 'word' that is also a proper prefix of another word in our
1970 trie. State 1 represents the word '' and is the thus the default fail state. This allows
1971 the DFA not to have to restart after its tried and failed a word at a given point, it
1972 simply continues as though it had been matching the other word in the first place.
1973 Consider
1974 'abcdgu'=~/abcdefg|cdgu/
1975 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
1976 fail, which would bring use to the state representing 'd' in the second word where we would
1977 try 'g' and succeed, prodceding to match 'cdgu'.
1978 */
1979 /* add a fail transition */
1980 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
1981 U32 *q;
1982 const U32 ucharcount = trie->uniquecharcount;
1983 const U32 numstates = trie->laststate;
1984 const U32 ubound = trie->lasttrans + ucharcount;
1985 U32 q_read = 0;
1986 U32 q_write = 0;
1987 U32 charid;
1988 U32 base = trie->states[ 1 ].trans.base;
1989 U32 *fail;
1990 reg_ac_data *aho;
1991 const U32 data_slot = add_data( pRExC_state, 1, "T" );
1992 GET_RE_DEBUG_FLAGS_DECL;
1993#ifndef DEBUGGING
1994 PERL_UNUSED_ARG(depth);
1995#endif
1996
1997
1998 ARG_SET( stclass, data_slot );
1999 Newxz( aho, 1, reg_ac_data );
2000 RExC_rx->data->data[ data_slot ] = (void*)aho;
2001 aho->trie=trie;
2002 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
2003 (trie->laststate+1)*sizeof(reg_trie_state));
2004 Newxz( q, numstates, U32);
2005 Newxz( aho->fail, numstates, U32 );
2006 aho->refcount = 1;
2007 fail = aho->fail;
2008 /* initialize fail[0..1] to be 1 so that we always have
2009 a valid final fail state */
2010 fail[ 0 ] = fail[ 1 ] = 1;
2011
2012 for ( charid = 0; charid < ucharcount ; charid++ ) {
2013 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2014 if ( newstate ) {
2015 q[ q_write ] = newstate;
2016 /* set to point at the root */
2017 fail[ q[ q_write++ ] ]=1;
2018 }
2019 }
2020 while ( q_read < q_write) {
2021 const U32 cur = q[ q_read++ % numstates ];
2022 base = trie->states[ cur ].trans.base;
2023
2024 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2025 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2026 if (ch_state) {
2027 U32 fail_state = cur;
2028 U32 fail_base;
2029 do {
2030 fail_state = fail[ fail_state ];
2031 fail_base = aho->states[ fail_state ].trans.base;
2032 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2033
2034 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2035 fail[ ch_state ] = fail_state;
2036 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2037 {
2038 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2039 }
2040 q[ q_write++ % numstates] = ch_state;
2041 }
2042 }
2043 }
2044 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2045 when we fail in state 1, this allows us to use the
2046 charclass scan to find a valid start char. This is based on the principle
2047 that theres a good chance the string being searched contains lots of stuff
2048 that cant be a start char.
2049 */
2050 fail[ 0 ] = fail[ 1 ] = 0;
2051 DEBUG_TRIE_COMPILE_r({
2052 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable: 0", (int)(depth * 2), "");
2053 for( q_read=1; q_read<numstates; q_read++ ) {
2054 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2055 }
2056 PerlIO_printf(Perl_debug_log, "\n");
2057 });
2058 Safefree(q);
2059 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2060}
2061
786e8c11 2062
a3621e74 2063/*
5d1c421c
JH
2064 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2065 * These need to be revisited when a newer toolchain becomes available.
2066 */
2067#if defined(__sparc64__) && defined(__GNUC__)
2068# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2069# undef SPARC64_GCC_WORKAROUND
2070# define SPARC64_GCC_WORKAROUND 1
2071# endif
2072#endif
2073
07be1b83
YO
2074#define DEBUG_PEEP(str,scan,depth) \
2075 DEBUG_OPTIMISE_r({ \
2076 SV * const mysv=sv_newmortal(); \
2077 regnode *Next = regnext(scan); \
2078 regprop(RExC_rx, mysv, scan); \
2079 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s [%d]\n", \
2080 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2081 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2082 });
2083
1de06328
YO
2084
2085
2086
2087
07be1b83
YO
2088#define JOIN_EXACT(scan,min,flags) \
2089 if (PL_regkind[OP(scan)] == EXACT) \
2090 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2091
be8e71aa 2092STATIC U32
07be1b83
YO
2093S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2094 /* Merge several consecutive EXACTish nodes into one. */
2095 regnode *n = regnext(scan);
2096 U32 stringok = 1;
2097 regnode *next = scan + NODE_SZ_STR(scan);
2098 U32 merged = 0;
2099 U32 stopnow = 0;
2100#ifdef DEBUGGING
2101 regnode *stop = scan;
72f13be8 2102 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2103#else
d47053eb
RGS
2104 PERL_UNUSED_ARG(depth);
2105#endif
2106#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2107 PERL_UNUSED_ARG(flags);
2108 PERL_UNUSED_ARG(val);
07be1b83 2109#endif
07be1b83
YO
2110 DEBUG_PEEP("join",scan,depth);
2111
2112 /* Skip NOTHING, merge EXACT*. */
2113 while (n &&
2114 ( PL_regkind[OP(n)] == NOTHING ||
2115 (stringok && (OP(n) == OP(scan))))
2116 && NEXT_OFF(n)
2117 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2118
2119 if (OP(n) == TAIL || n > next)
2120 stringok = 0;
2121 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2122 DEBUG_PEEP("skip:",n,depth);
2123 NEXT_OFF(scan) += NEXT_OFF(n);
2124 next = n + NODE_STEP_REGNODE;
2125#ifdef DEBUGGING
2126 if (stringok)
2127 stop = n;
2128#endif
2129 n = regnext(n);
2130 }
2131 else if (stringok) {
786e8c11 2132 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2133 regnode * const nnext = regnext(n);
2134
2135 DEBUG_PEEP("merg",n,depth);
2136
2137 merged++;
2138 if (oldl + STR_LEN(n) > U8_MAX)
2139 break;
2140 NEXT_OFF(scan) += NEXT_OFF(n);
2141 STR_LEN(scan) += STR_LEN(n);
2142 next = n + NODE_SZ_STR(n);
2143 /* Now we can overwrite *n : */
2144 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2145#ifdef DEBUGGING
2146 stop = next - 1;
2147#endif
2148 n = nnext;
2149 if (stopnow) break;
2150 }
2151
d47053eb
RGS
2152#ifdef EXPERIMENTAL_INPLACESCAN
2153 if (flags && !NEXT_OFF(n)) {
2154 DEBUG_PEEP("atch", val, depth);
2155 if (reg_off_by_arg[OP(n)]) {
2156 ARG_SET(n, val - n);
2157 }
2158 else {
2159 NEXT_OFF(n) = val - n;
2160 }
2161 stopnow = 1;
2162 }
07be1b83
YO
2163#endif
2164 }
2165
2166 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2167 /*
2168 Two problematic code points in Unicode casefolding of EXACT nodes:
2169
2170 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2171 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2172
2173 which casefold to
2174
2175 Unicode UTF-8
2176
2177 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2178 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2179
2180 This means that in case-insensitive matching (or "loose matching",
2181 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2182 length of the above casefolded versions) can match a target string
2183 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2184 This would rather mess up the minimum length computation.
2185
2186 What we'll do is to look for the tail four bytes, and then peek
2187 at the preceding two bytes to see whether we need to decrease
2188 the minimum length by four (six minus two).
2189
2190 Thanks to the design of UTF-8, there cannot be false matches:
2191 A sequence of valid UTF-8 bytes cannot be a subsequence of
2192 another valid sequence of UTF-8 bytes.
2193
2194 */
2195 char * const s0 = STRING(scan), *s, *t;
2196 char * const s1 = s0 + STR_LEN(scan) - 1;
2197 char * const s2 = s1 - 4;
e294cc5d
JH
2198#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2199 const char t0[] = "\xaf\x49\xaf\x42";
2200#else
07be1b83 2201 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2202#endif
07be1b83
YO
2203 const char * const t1 = t0 + 3;
2204
2205 for (s = s0 + 2;
2206 s < s2 && (t = ninstr(s, s1, t0, t1));
2207 s = t + 4) {
e294cc5d
JH
2208#ifdef EBCDIC
2209 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2210 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2211#else
07be1b83
YO
2212 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2213 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2214#endif
07be1b83
YO
2215 *min -= 4;
2216 }
2217 }
2218
2219#ifdef DEBUGGING
2220 /* Allow dumping */
2221 n = scan + NODE_SZ_STR(scan);
2222 while (n <= stop) {
2223 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2224 OP(n) = OPTIMIZED;
2225 NEXT_OFF(n) = 0;
2226 }
2227 n++;
2228 }
2229#endif
2230 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2231 return stopnow;
2232}
2233
653099ff
GS
2234/* REx optimizer. Converts nodes into quickier variants "in place".
2235 Finds fixed substrings. */
2236
a0288114 2237/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2238 to the position after last scanned or to NULL. */
2239
07be1b83
YO
2240
2241
76e3520e 2242STATIC I32
1de06328
YO
2243S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2244 I32 *minlenp, I32 *deltap,
9a957fbc 2245 regnode *last, scan_data_t *data, U32 flags, U32 depth)
c277df42
IZ
2246 /* scanp: Start here (read-write). */
2247 /* deltap: Write maxlen-minlen here. */
2248 /* last: Stop before this one. */
2249{
97aff369 2250 dVAR;
c277df42
IZ
2251 I32 min = 0, pars = 0, code;
2252 regnode *scan = *scanp, *next;
2253 I32 delta = 0;
2254 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2255 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2256 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2257 scan_data_t data_fake;
653099ff 2258 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
a3621e74 2259 SV *re_trie_maxbuff = NULL;
786e8c11
YO
2260 regnode *first_non_open = scan;
2261
a3621e74
YO
2262
2263 GET_RE_DEBUG_FLAGS_DECL;
13a24bad
YO
2264#ifdef DEBUGGING
2265 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2266#endif
786e8c11
YO
2267 if ( depth == 0 ) {
2268 while (first_non_open && OP(first_non_open) == OPEN)
2269 first_non_open=regnext(first_non_open);
2270 }
2271
b81d288d 2272
c277df42
IZ
2273 while (scan && OP(scan) != END && scan < last) {
2274 /* Peephole optimizer: */
1de06328 2275 DEBUG_STUDYDATA(data,depth);
07be1b83 2276 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2277 JOIN_EXACT(scan,&min,0);
a3621e74 2278
653099ff
GS
2279 /* Follow the next-chain of the current node and optimize
2280 away all the NOTHINGs from it. */
c277df42 2281 if (OP(scan) != CURLYX) {
a3b680e6 2282 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2283 ? I32_MAX
2284 /* I32 may be smaller than U16 on CRAYs! */
2285 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2286 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2287 int noff;
2288 regnode *n = scan;
b81d288d 2289
c277df42
IZ
2290 /* Skip NOTHING and LONGJMP. */
2291 while ((n = regnext(n))
3dab1dad 2292 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2293 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2294 && off + noff < max)
2295 off += noff;
2296 if (reg_off_by_arg[OP(scan)])
2297 ARG(scan) = off;
b81d288d 2298 else
c277df42
IZ
2299 NEXT_OFF(scan) = off;
2300 }
a3621e74 2301
07be1b83 2302
3dab1dad 2303
653099ff
GS
2304 /* The principal pseudo-switch. Cannot be a switch, since we
2305 look into several different things. */
b81d288d 2306 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2307 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2308 next = regnext(scan);
2309 code = OP(scan);
a3621e74 2310 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2311
2312 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2313 /* NOTE - There is similar code to this block below for handling
2314 TRIE nodes on a re-study. If you change stuff here check there
2315 too. */
c277df42 2316 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2317 struct regnode_charclass_class accum;
d4c19fe8 2318 regnode * const startbranch=scan;
c277df42 2319
653099ff 2320 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2321 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2322 if (flags & SCF_DO_STCLASS)
830247a4 2323 cl_init_zero(pRExC_state, &accum);
a3621e74 2324
c277df42 2325 while (OP(scan) == code) {
830247a4 2326 I32 deltanext, minnext, f = 0, fake;
653099ff 2327 struct regnode_charclass_class this_class;
c277df42
IZ
2328
2329 num++;
2330 data_fake.flags = 0;
b81d288d 2331 if (data) {
2c2d71f5 2332 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2333 data_fake.last_closep = data->last_closep;
2334 }
2335 else
2336 data_fake.last_closep = &fake;
c277df42
IZ
2337 next = regnext(scan);
2338 scan = NEXTOPER(scan);
2339 if (code != BRANCH)
2340 scan = NEXTOPER(scan);
653099ff 2341 if (flags & SCF_DO_STCLASS) {
830247a4 2342 cl_init(pRExC_state, &this_class);
653099ff
GS
2343 data_fake.start_class = &this_class;
2344 f = SCF_DO_STCLASS_AND;
b81d288d 2345 }
e1901655
IZ
2346 if (flags & SCF_WHILEM_VISITED_POS)
2347 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2348
653099ff 2349 /* we suppose the run is continuous, last=next...*/
1de06328 2350 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
a3621e74 2351 next, &data_fake, f,depth+1);
b81d288d 2352 if (min1 > minnext)
c277df42
IZ
2353 min1 = minnext;
2354 if (max1 < minnext + deltanext)
2355 max1 = minnext + deltanext;
2356 if (deltanext == I32_MAX)
aca2d497 2357 is_inf = is_inf_internal = 1;
c277df42
IZ
2358 scan = next;
2359 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2360 pars++;
3dab1dad
YO
2361 if (data) {
2362 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2363 data->flags |= SF_HAS_EVAL;
2c2d71f5 2364 data->whilem_c = data_fake.whilem_c;
3dab1dad 2365 }
653099ff 2366 if (flags & SCF_DO_STCLASS)
830247a4 2367 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2368 if (code == SUSPEND)
c277df42
IZ
2369 break;
2370 }
2371 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2372 min1 = 0;
2373 if (flags & SCF_DO_SUBSTR) {
2374 data->pos_min += min1;
2375 data->pos_delta += max1 - min1;
2376 if (max1 != min1 || is_inf)
2377 data->longest = &(data->longest_float);
2378 }
2379 min += min1;
2380 delta += max1 - min1;
653099ff 2381 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2382 cl_or(pRExC_state, data->start_class, &accum);
653099ff
GS
2383 if (min1) {
2384 cl_and(data->start_class, &and_with);
2385 flags &= ~SCF_DO_STCLASS;
2386 }
2387 }
2388 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2389 if (min1) {
2390 cl_and(data->start_class, &accum);
653099ff 2391 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2392 }
2393 else {
b81d288d 2394 /* Switch to OR mode: cache the old value of
de0c8cb8
GS
2395 * data->start_class */
2396 StructCopy(data->start_class, &and_with,
2397 struct regnode_charclass_class);
2398 flags &= ~SCF_DO_STCLASS_AND;
2399 StructCopy(&accum, data->start_class,
2400 struct regnode_charclass_class);
2401 flags |= SCF_DO_STCLASS_OR;
2402 data->start_class->flags |= ANYOF_EOS;
2403 }
653099ff 2404 }
a3621e74 2405
786e8c11 2406 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2407 /* demq.
2408
2409 Assuming this was/is a branch we are dealing with: 'scan' now
2410 points at the item that follows the branch sequence, whatever
2411 it is. We now start at the beginning of the sequence and look
2412 for subsequences of
2413
786e8c11
YO
2414 BRANCH->EXACT=>x1
2415 BRANCH->EXACT=>x2
2416 tail
a3621e74
YO
2417
2418 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2419
2420 If we can find such a subseqence we need to turn the first
2421 element into a trie and then add the subsequent branch exact
2422 strings to the trie.
2423
2424 We have two cases
2425
786e8c11 2426 1. patterns where the whole set of branch can be converted.
a3621e74 2427
786e8c11 2428 2. patterns where only a subset can be converted.
a3621e74
YO
2429
2430 In case 1 we can replace the whole set with a single regop
2431 for the trie. In case 2 we need to keep the start and end
2432 branchs so
2433
2434 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2435 becomes BRANCH TRIE; BRANCH X;
2436
786e8c11
YO
2437 There is an additional case, that being where there is a
2438 common prefix, which gets split out into an EXACT like node
2439 preceding the TRIE node.
2440
2441 If x(1..n)==tail then we can do a simple trie, if not we make
2442 a "jump" trie, such that when we match the appropriate word
2443 we "jump" to the appopriate tail node. Essentailly we turn
2444 a nested if into a case structure of sorts.
a3621e74
YO
2445
2446 */
786e8c11 2447
3dab1dad 2448 int made=0;
0111c4fd
RGS
2449 if (!re_trie_maxbuff) {
2450 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2451 if (!SvIOK(re_trie_maxbuff))
2452 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2453 }
786e8c11 2454 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2455 regnode *cur;
2456 regnode *first = (regnode *)NULL;
2457 regnode *last = (regnode *)NULL;
2458 regnode *tail = scan;
2459 U8 optype = 0;
2460 U32 count=0;
2461
2462#ifdef DEBUGGING
c445ea15 2463 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2464#endif
2465 /* var tail is used because there may be a TAIL
2466 regop in the way. Ie, the exacts will point to the
2467 thing following the TAIL, but the last branch will
2468 point at the TAIL. So we advance tail. If we
2469 have nested (?:) we may have to move through several
2470 tails.
2471 */
2472
2473 while ( OP( tail ) == TAIL ) {
2474 /* this is the TAIL generated by (?:) */
2475 tail = regnext( tail );
2476 }
2477
3dab1dad 2478
a3621e74 2479 DEBUG_OPTIMISE_r({
32fc9b6a 2480 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2481 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2482 (int)depth * 2 + 2, "",
2483 "Looking for TRIE'able sequences. Tail node is: ",
2484 SvPV_nolen_const( mysv )
a3621e74
YO
2485 );
2486 });
3dab1dad 2487
a3621e74
YO
2488 /*
2489
2490 step through the branches, cur represents each
2491 branch, noper is the first thing to be matched
2492 as part of that branch and noper_next is the
2493 regnext() of that node. if noper is an EXACT
2494 and noper_next is the same as scan (our current
2495 position in the regex) then the EXACT branch is
2496 a possible optimization target. Once we have
2497 two or more consequetive such branches we can
2498 create a trie of the EXACT's contents and stich
2499 it in place. If the sequence represents all of
2500 the branches we eliminate the whole thing and
2501 replace it with a single TRIE. If it is a
2502 subsequence then we need to stitch it in. This
2503 means the first branch has to remain, and needs
2504 to be repointed at the item on the branch chain
2505 following the last branch optimized. This could
2506 be either a BRANCH, in which case the
2507 subsequence is internal, or it could be the
2508 item following the branch sequence in which
2509 case the subsequence is at the end.
2510
2511 */
2512
2513 /* dont use tail as the end marker for this traverse */
2514 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2515 regnode * const noper = NEXTOPER( cur );
be981c67 2516#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2517 regnode * const noper_next = regnext( noper );
be981c67 2518#endif
a3621e74 2519
a3621e74 2520 DEBUG_OPTIMISE_r({
32fc9b6a 2521 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2522 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2523 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2524
32fc9b6a 2525 regprop(RExC_rx, mysv, noper);
a3621e74 2526 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2527 SvPV_nolen_const(mysv));
a3621e74
YO
2528
2529 if ( noper_next ) {
32fc9b6a 2530 regprop(RExC_rx, mysv, noper_next );
a3621e74 2531 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2532 SvPV_nolen_const(mysv));
a3621e74 2533 }
3dab1dad
YO
2534 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2535 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2536 });
3dab1dad
YO
2537 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2538 : PL_regkind[ OP( noper ) ] == EXACT )
2539 || OP(noper) == NOTHING )
786e8c11
YO
2540#ifdef NOJUMPTRIE
2541 && noper_next == tail
2542#endif
2543 && count < U16_MAX)
a3621e74
YO
2544 {
2545 count++;
3dab1dad
YO
2546 if ( !first || optype == NOTHING ) {
2547 if (!first) first = cur;
a3621e74
YO
2548 optype = OP( noper );
2549 } else {
a3621e74 2550 last = cur;
a3621e74
YO
2551 }
2552 } else {
2553 if ( last ) {
786e8c11
YO
2554 make_trie( pRExC_state,
2555 startbranch, first, cur, tail, count,
2556 optype, depth+1 );
a3621e74 2557 }
3dab1dad 2558 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2559#ifdef NOJUMPTRIE
2560 && noper_next == tail
2561#endif
2562 ){
a3621e74
YO
2563 count = 1;
2564 first = cur;
2565 optype = OP( noper );
2566 } else {
2567 count = 0;
2568 first = NULL;
2569 optype = 0;
2570 }
2571 last = NULL;
2572 }
2573 }
2574 DEBUG_OPTIMISE_r({
32fc9b6a 2575 regprop(RExC_rx, mysv, cur);
a3621e74 2576 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2577 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2578 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2579
2580 });
2581 if ( last ) {
786e8c11 2582 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2583#ifdef TRIE_STUDY_OPT
786e8c11
YO
2584 if ( ((made == MADE_EXACT_TRIE &&
2585 startbranch == first)
2586 || ( first_non_open == first )) &&
2587 depth==0 )
2588 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2589#endif
07be1b83 2590 }
a3621e74 2591 }
3dab1dad
YO
2592
2593 } /* do trie */
786e8c11 2594
a0ed51b3 2595 }
a3621e74 2596 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2597 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2598 } else /* single branch is optimized. */
c277df42
IZ
2599 scan = NEXTOPER(scan);
2600 continue;
a0ed51b3
LW
2601 }
2602 else if (OP(scan) == EXACT) {
cd439c50 2603 I32 l = STR_LEN(scan);
c445ea15 2604 UV uc;
a0ed51b3 2605 if (UTF) {
a3b680e6 2606 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2607 l = utf8_length(s, s + l);
9041c2e3 2608 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2609 } else {
2610 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2611 }
2612 min += l;
c277df42 2613 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2614 /* The code below prefers earlier match for fixed
2615 offset, later match for variable offset. */
2616 if (data->last_end == -1) { /* Update the start info. */
2617 data->last_start_min = data->pos_min;
2618 data->last_start_max = is_inf
b81d288d 2619 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2620 }
cd439c50 2621 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2622 if (UTF)
2623 SvUTF8_on(data->last_found);
0eda9292 2624 {
9a957fbc 2625 SV * const sv = data->last_found;
a28509cc 2626 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2627 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2628 if (mg && mg->mg_len >= 0)
5e43f467
JH
2629 mg->mg_len += utf8_length((U8*)STRING(scan),
2630 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2631 }
c277df42
IZ
2632 data->last_end = data->pos_min + l;
2633 data->pos_min += l; /* As in the first entry. */
2634 data->flags &= ~SF_BEFORE_EOL;
2635 }
653099ff
GS
2636 if (flags & SCF_DO_STCLASS_AND) {
2637 /* Check whether it is compatible with what we know already! */
2638 int compat = 1;
2639
1aa99e6b 2640 if (uc >= 0x100 ||
516a5887 2641 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2642 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2643 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2644 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2645 )
653099ff
GS
2646 compat = 0;
2647 ANYOF_CLASS_ZERO(data->start_class);
2648 ANYOF_BITMAP_ZERO(data->start_class);
2649 if (compat)
1aa99e6b 2650 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2651 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2652 if (uc < 0x100)
2653 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2654 }
2655 else if (flags & SCF_DO_STCLASS_OR) {
2656 /* false positive possible if the class is case-folded */
1aa99e6b 2657 if (uc < 0x100)
9b877dbb
IH
2658 ANYOF_BITMAP_SET(data->start_class, uc);
2659 else
2660 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff
GS
2661 data->start_class->flags &= ~ANYOF_EOS;
2662 cl_and(data->start_class, &and_with);
2663 }
2664 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2665 }
3dab1dad 2666 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2667 I32 l = STR_LEN(scan);
1aa99e6b 2668 UV uc = *((U8*)STRING(scan));
653099ff
GS
2669
2670 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2671 if (flags & SCF_DO_SUBSTR) {
2672 assert(data);
1de06328 2673 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2674 }
a0ed51b3 2675 if (UTF) {
6136c704 2676 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2677 l = utf8_length(s, s + l);
9041c2e3 2678 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2679 }
2680 min += l;
ecaa9b9c 2681 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2682 data->pos_min += l;
653099ff
GS
2683 if (flags & SCF_DO_STCLASS_AND) {
2684 /* Check whether it is compatible with what we know already! */
2685 int compat = 1;
2686
1aa99e6b 2687 if (uc >= 0x100 ||
516a5887 2688 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2689 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2690 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2691 compat = 0;
2692 ANYOF_CLASS_ZERO(data->start_class);
2693 ANYOF_BITMAP_ZERO(data->start_class);
2694 if (compat) {
1aa99e6b 2695 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2696 data->start_class->flags &= ~ANYOF_EOS;
2697 data->start_class->flags |= ANYOF_FOLD;
2698 if (OP(scan) == EXACTFL)
2699 data->start_class->flags |= ANYOF_LOCALE;
2700 }
2701 }
2702 else if (flags & SCF_DO_STCLASS_OR) {
2703 if (data->start_class->flags & ANYOF_FOLD) {
2704 /* false positive possible if the class is case-folded.
2705 Assume that the locale settings are the same... */
1aa99e6b
IH
2706 if (uc < 0x100)
2707 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2708 data->start_class->flags &= ~ANYOF_EOS;
2709 }
2710 cl_and(data->start_class, &and_with);
2711 }
2712 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2713 }
6bda09f9
YO
2714 else if (OP(scan)==RECURSE) {
2715 ARG2L_SET( scan, RExC_parens[ARG(scan)-1] - scan );
2716 }
bfed75c6 2717 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2718 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2719 I32 f = flags, pos_before = 0;
d4c19fe8 2720 regnode * const oscan = scan;
653099ff
GS
2721 struct regnode_charclass_class this_class;
2722 struct regnode_charclass_class *oclass = NULL;
727f22e3 2723 I32 next_is_eval = 0;
653099ff 2724
3dab1dad 2725 switch (PL_regkind[OP(scan)]) {
653099ff 2726 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2727 scan = NEXTOPER(scan);
2728 goto finish;
2729 case PLUS:
653099ff 2730 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2731 next = NEXTOPER(scan);
653099ff 2732 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2733 mincount = 1;
2734 maxcount = REG_INFTY;
c277df42
IZ
2735 next = regnext(scan);
2736 scan = NEXTOPER(scan);
2737 goto do_curly;
2738 }
2739 }
2740 if (flags & SCF_DO_SUBSTR)
2741 data->pos_min++;
2742 min++;
2743 /* Fall through. */
2744 case STAR:
653099ff
GS
2745 if (flags & SCF_DO_STCLASS) {
2746 mincount = 0;
b81d288d 2747 maxcount = REG_INFTY;
653099ff
GS
2748 next = regnext(scan);
2749 scan = NEXTOPER(scan);
2750 goto do_curly;
2751 }
b81d288d 2752 is_inf = is_inf_internal = 1;
c277df42
IZ
2753 scan = regnext(scan);
2754 if (flags & SCF_DO_SUBSTR) {
1de06328 2755 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2756 data->longest = &(data->longest_float);
2757 }
2758 goto optimize_curly_tail;
2759 case CURLY:
b81d288d 2760 mincount = ARG1(scan);
c277df42
IZ
2761 maxcount = ARG2(scan);
2762 next = regnext(scan);
cb434fcc
IZ
2763 if (OP(scan) == CURLYX) {
2764 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2765 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2766 }
c277df42 2767 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2768 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2769 do_curly:
2770 if (flags & SCF_DO_SUBSTR) {
1de06328 2771 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2772 pos_before = data->pos_min;
2773 }
2774 if (data) {
2775 fl = data->flags;
2776 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2777 if (is_inf)
2778 data->flags |= SF_IS_INF;
2779 }
653099ff 2780 if (flags & SCF_DO_STCLASS) {
830247a4 2781 cl_init(pRExC_state, &this_class);
653099ff
GS
2782 oclass = data->start_class;
2783 data->start_class = &this_class;
2784 f |= SCF_DO_STCLASS_AND;
2785 f &= ~SCF_DO_STCLASS_OR;
2786 }
e1901655
IZ
2787 /* These are the cases when once a subexpression
2788 fails at a particular position, it cannot succeed
2789 even after backtracking at the enclosing scope.
b81d288d 2790
e1901655
IZ
2791 XXXX what if minimal match and we are at the
2792 initial run of {n,m}? */
2793 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2794 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2795
c277df42 2796 /* This will finish on WHILEM, setting scan, or on NULL: */
1de06328 2797 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, last, data,
a3621e74
YO
2798 (mincount == 0
2799 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2800
2801 if (flags & SCF_DO_STCLASS)
2802 data->start_class = oclass;
2803 if (mincount == 0 || minnext == 0) {
2804 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2805 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2806 }
2807 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2808 /* Switch to OR mode: cache the old value of
653099ff
GS
2809 * data->start_class */
2810 StructCopy(data->start_class, &and_with,
2811 struct regnode_charclass_class);
2812 flags &= ~SCF_DO_STCLASS_AND;
2813 StructCopy(&this_class, data->start_class,
2814 struct regnode_charclass_class);
2815 flags |= SCF_DO_STCLASS_OR;
2816 data->start_class->flags |= ANYOF_EOS;
2817 }
2818 } else { /* Non-zero len */
2819 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2820 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2821 cl_and(data->start_class, &and_with);
2822 }
2823 else if (flags & SCF_DO_STCLASS_AND)
2824 cl_and(data->start_class, &this_class);
2825 flags &= ~SCF_DO_STCLASS;
2826 }
c277df42
IZ
2827 if (!scan) /* It was not CURLYX, but CURLY. */
2828 scan = next;
041457d9
DM
2829 if ( /* ? quantifier ok, except for (?{ ... }) */
2830 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2831 && (minnext == 0) && (deltanext == 0)
99799961 2832 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2833 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2834 && ckWARN(WARN_REGEXP))
b45f050a 2835 {
830247a4 2836 vWARN(RExC_parse,
b45f050a
JF
2837 "Quantifier unexpected on zero-length expression");
2838 }
2839
c277df42 2840 min += minnext * mincount;
b81d288d 2841 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2842 && (minnext + deltanext) > 0)
2843 || deltanext == I32_MAX);
aca2d497 2844 is_inf |= is_inf_internal;
c277df42
IZ
2845 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2846
2847 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2848 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2849 && data->flags & SF_IN_PAR
2850 && !(data->flags & SF_HAS_EVAL)
2851 && !deltanext && minnext == 1 ) {
2852 /* Try to optimize to CURLYN. */
2853 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2854 regnode * const nxt1 = nxt;
497b47a8
JH
2855#ifdef DEBUGGING
2856 regnode *nxt2;
2857#endif
c277df42
IZ
2858
2859 /* Skip open. */
2860 nxt = regnext(nxt);
bfed75c6 2861 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2862 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2863 && STR_LEN(nxt) == 1))
c277df42 2864 goto nogo;
497b47a8 2865#ifdef DEBUGGING
c277df42 2866 nxt2 = nxt;
497b47a8 2867#endif
c277df42 2868 nxt = regnext(nxt);
b81d288d 2869 if (OP(nxt) != CLOSE)
c277df42
IZ
2870 goto nogo;
2871 /* Now we know that nxt2 is the only contents: */
eb160463 2872 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2873 OP(oscan) = CURLYN;
2874 OP(nxt1) = NOTHING; /* was OPEN. */
2875#ifdef DEBUGGING
2876 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2877 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2878 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2879 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2880 OP(nxt + 1) = OPTIMIZED; /* was count. */
2881 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2882#endif
c277df42 2883 }
c277df42
IZ
2884 nogo:
2885
2886 /* Try optimization CURLYX => CURLYM. */
b81d288d 2887 if ( OP(oscan) == CURLYX && data
c277df42 2888 && !(data->flags & SF_HAS_PAR)
c277df42 2889 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2890 && !deltanext /* atom is fixed width */
2891 && minnext != 0 /* CURLYM can't handle zero width */
2892 ) {
c277df42
IZ
2893 /* XXXX How to optimize if data == 0? */
2894 /* Optimize to a simpler form. */
2895 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2896 regnode *nxt2;
2897
2898 OP(oscan) = CURLYM;
2899 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2900 && (OP(nxt2) != WHILEM))
c277df42
IZ
2901 nxt = nxt2;
2902 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2903 /* Need to optimize away parenths. */
2904 if (data->flags & SF_IN_PAR) {
2905 /* Set the parenth number. */
2906 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2907
b81d288d 2908 if (OP(nxt) != CLOSE)
b45f050a 2909 FAIL("Panic opt close");
eb160463 2910 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2911 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2912 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2913#ifdef DEBUGGING
2914 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2915 OP(nxt + 1) = OPTIMIZED; /* was count. */
2916 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2917 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2918#endif
c277df42
IZ
2919#if 0
2920 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2921 regnode *nnxt = regnext(nxt1);
b81d288d 2922
c277df42
IZ
2923 if (nnxt == nxt) {
2924 if (reg_off_by_arg[OP(nxt1)])
2925 ARG_SET(nxt1, nxt2 - nxt1);
2926 else if (nxt2 - nxt1 < U16_MAX)
2927 NEXT_OFF(nxt1) = nxt2 - nxt1;
2928 else
2929 OP(nxt) = NOTHING; /* Cannot beautify */
2930 }
2931 nxt1 = nnxt;
2932 }
2933#endif
2934 /* Optimize again: */
1de06328 2935 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
a3621e74 2936 NULL, 0,depth+1);
a0ed51b3
LW
2937 }
2938 else
c277df42 2939 oscan->flags = 0;
c277df42 2940 }
e1901655
IZ
2941 else if ((OP(oscan) == CURLYX)
2942 && (flags & SCF_WHILEM_VISITED_POS)
2943 /* See the comment on a similar expression above.
2944 However, this time it not a subexpression
2945 we care about, but the expression itself. */
2946 && (maxcount == REG_INFTY)
2947 && data && ++data->whilem_c < 16) {
2948 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
2949 /* Find WHILEM (as in regexec.c) */
2950 regnode *nxt = oscan + NEXT_OFF(oscan);
2951
2952 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
2953 nxt += ARG(nxt);
eb160463
GS
2954 PREVOPER(nxt)->flags = (U8)(data->whilem_c
2955 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 2956 }
b81d288d 2957 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
2958 pars++;
2959 if (flags & SCF_DO_SUBSTR) {
c445ea15 2960 SV *last_str = NULL;
c277df42
IZ
2961 int counted = mincount != 0;
2962
2963 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
2964#if defined(SPARC64_GCC_WORKAROUND)
2965 I32 b = 0;
2966 STRLEN l = 0;
cfd0369c 2967 const char *s = NULL;
5d1c421c
JH
2968 I32 old = 0;
2969
2970 if (pos_before >= data->last_start_min)
2971 b = pos_before;
2972 else
2973 b = data->last_start_min;
2974
2975 l = 0;
cfd0369c 2976 s = SvPV_const(data->last_found, l);
5d1c421c
JH
2977 old = b - data->last_start_min;
2978
2979#else
b81d288d 2980 I32 b = pos_before >= data->last_start_min
c277df42
IZ
2981 ? pos_before : data->last_start_min;
2982 STRLEN l;
d4c19fe8 2983 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 2984 I32 old = b - data->last_start_min;
5d1c421c 2985#endif
a0ed51b3
LW
2986
2987 if (UTF)
2988 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 2989
a0ed51b3 2990 l -= old;
c277df42 2991 /* Get the added string: */
79cb57f6 2992 last_str = newSVpvn(s + old, l);
0e933229
IH
2993 if (UTF)
2994 SvUTF8_on(last_str);
c277df42
IZ
2995 if (deltanext == 0 && pos_before == b) {
2996 /* What was added is a constant string */
2997 if (mincount > 1) {
2998 SvGROW(last_str, (mincount * l) + 1);
b81d288d 2999 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3000 SvPVX_const(last_str), l, mincount - 1);
b162af07 3001 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3002 /* Add additional parts. */
b81d288d 3003 SvCUR_set(data->last_found,
c277df42
IZ
3004 SvCUR(data->last_found) - l);
3005 sv_catsv(data->last_found, last_str);
0eda9292
JH
3006 {
3007 SV * sv = data->last_found;
3008 MAGIC *mg =
3009 SvUTF8(sv) && SvMAGICAL(sv) ?
3010 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3011 if (mg && mg->mg_len >= 0)
3012 mg->mg_len += CHR_SVLEN(last_str);
3013 }
c277df42
IZ
3014 data->last_end += l * (mincount - 1);
3015 }
2a8d9689
HS
3016 } else {
3017 /* start offset must point into the last copy */
3018 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3019 data->last_start_max += is_inf ? I32_MAX
3020 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3021 }
3022 }
3023 /* It is counted once already... */
3024 data->pos_min += minnext * (mincount - counted);
3025 data->pos_delta += - counted * deltanext +
3026 (minnext + deltanext) * maxcount - minnext * mincount;
3027 if (mincount != maxcount) {
653099ff
GS
3028 /* Cannot extend fixed substrings found inside
3029 the group. */
1de06328 3030 scan_commit(pRExC_state,data,minlenp);
c277df42 3031 if (mincount && last_str) {
d4c19fe8
AL
3032 SV * const sv = data->last_found;
3033 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3034 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3035
3036 if (mg)
3037 mg->mg_len = -1;
3038 sv_setsv(sv, last_str);
c277df42 3039 data->last_end = data->pos_min;
b81d288d 3040 data->last_start_min =
a0ed51b3 3041 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3042 data->last_start_max = is_inf
3043 ? I32_MAX
c277df42 3044 : data->pos_min + data->pos_delta
a0ed51b3 3045 - CHR_SVLEN(last_str);
c277df42
IZ
3046 }
3047 data->longest = &(data->longest_float);
3048 }
aca2d497 3049 SvREFCNT_dec(last_str);
c277df42 3050 }
405ff068 3051 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3052 data->flags |= SF_HAS_EVAL;
3053 optimize_curly_tail:
c277df42 3054 if (OP(oscan) != CURLYX) {
3dab1dad 3055 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3056 && NEXT_OFF(next))
3057 NEXT_OFF(oscan) += NEXT_OFF(next);
3058 }
c277df42 3059 continue;
653099ff 3060 default: /* REF and CLUMP only? */
c277df42 3061 if (flags & SCF_DO_SUBSTR) {
1de06328 3062 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3063 data->longest = &(data->longest_float);
3064 }
aca2d497 3065 is_inf = is_inf_internal = 1;
653099ff 3066 if (flags & SCF_DO_STCLASS_OR)
830247a4 3067 cl_anything(pRExC_state, data->start_class);
653099ff 3068 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3069 break;
3070 }
a0ed51b3 3071 }
bfed75c6 3072 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3073 int value = 0;
653099ff 3074
c277df42 3075 if (flags & SCF_DO_SUBSTR) {
1de06328 3076 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3077 data->pos_min++;
3078 }
3079 min++;
653099ff
GS
3080 if (flags & SCF_DO_STCLASS) {
3081 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3082
3083 /* Some of the logic below assumes that switching
3084 locale on will only add false positives. */
3dab1dad 3085 switch (PL_regkind[OP(scan)]) {
653099ff 3086 case SANY:
653099ff
GS
3087 default:
3088 do_default:
3089 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3090 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3091 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3092 break;
3093 case REG_ANY:
3094 if (OP(scan) == SANY)
3095 goto do_default;
3096 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3097 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3098 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3099 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3100 }
3101 if (flags & SCF_DO_STCLASS_AND || !value)
3102 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3103 break;
3104 case ANYOF:
3105 if (flags & SCF_DO_STCLASS_AND)
3106 cl_and(data->start_class,
3107 (struct regnode_charclass_class*)scan);
3108 else
830247a4 3109 cl_or(pRExC_state, data->start_class,
653099ff
GS
3110 (struct regnode_charclass_class*)scan);
3111 break;
3112 case ALNUM:
3113 if (flags & SCF_DO_STCLASS_AND) {
3114 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3115 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3116 for (value = 0; value < 256; value++)
3117 if (!isALNUM(value))
3118 ANYOF_BITMAP_CLEAR(data->start_class, value);
3119 }
3120 }
3121 else {
3122 if (data->start_class->flags & ANYOF_LOCALE)
3123 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3124 else {
3125 for (value = 0; value < 256; value++)
3126 if (isALNUM(value))
b81d288d 3127 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3128 }
3129 }
3130 break;
3131 case ALNUML:
3132 if (flags & SCF_DO_STCLASS_AND) {
3133 if (data->start_class->flags & ANYOF_LOCALE)
3134 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3135 }
3136 else {
3137 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3138 data->start_class->flags |= ANYOF_LOCALE;
3139 }
3140 break;
3141 case NALNUM:
3142 if (flags & SCF_DO_STCLASS_AND) {
3143 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3144 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3145 for (value = 0; value < 256; value++)
3146 if (isALNUM(value))
3147 ANYOF_BITMAP_CLEAR(data->start_class, value);
3148 }
3149 }
3150 else {
3151 if (data->start_class->flags & ANYOF_LOCALE)
3152 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3153 else {
3154 for (value = 0; value < 256; value++)
3155 if (!isALNUM(value))
b81d288d 3156 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3157 }
3158 }
3159 break;
3160 case NALNUML:
3161 if (flags & SCF_DO_STCLASS_AND) {
3162 if (data->start_class->flags & ANYOF_LOCALE)
3163 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3164 }
3165 else {
3166 data->start_class->flags |= ANYOF_LOCALE;
3167 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3168 }
3169 break;
3170 case SPACE:
3171 if (flags & SCF_DO_STCLASS_AND) {
3172 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3173 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3174 for (value = 0; value < 256; value++)
3175 if (!isSPACE(value))
3176 ANYOF_BITMAP_CLEAR(data->start_class, value);
3177 }
3178 }
3179 else {
3180 if (data->start_class->flags & ANYOF_LOCALE)
3181 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3182 else {
3183 for (value = 0; value < 256; value++)
3184 if (isSPACE(value))
b81d288d 3185 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3186 }
3187 }
3188 break;
3189 case SPACEL:
3190 if (flags & SCF_DO_STCLASS_AND) {
3191 if (data->start_class->flags & ANYOF_LOCALE)
3192 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3193 }
3194 else {
3195 data->start_class->flags |= ANYOF_LOCALE;
3196 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3197 }
3198 break;
3199 case NSPACE:
3200 if (flags & SCF_DO_STCLASS_AND) {
3201 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3202 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3203 for (value = 0; value < 256; value++)
3204 if (isSPACE(value))
3205 ANYOF_BITMAP_CLEAR(data->start_class, value);
3206 }
3207 }
3208 else {
3209 if (data->start_class->flags & ANYOF_LOCALE)
3210 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3211 else {
3212 for (value = 0; value < 256; value++)
3213 if (!isSPACE(value))
b81d288d 3214 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3215 }
3216 }
3217 break;
3218 case NSPACEL:
3219 if (flags & SCF_DO_STCLASS_AND) {
3220 if (data->start_class->flags & ANYOF_LOCALE) {
3221 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3222 for (value = 0; value < 256; value++)
3223 if (!isSPACE(value))
3224 ANYOF_BITMAP_CLEAR(data->start_class, value);
3225 }
3226 }
3227 else {
3228 data->start_class->flags |= ANYOF_LOCALE;
3229 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3230 }
3231 break;
3232 case DIGIT:
3233 if (flags & SCF_DO_STCLASS_AND) {
3234 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3235 for (value = 0; value < 256; value++)
3236 if (!isDIGIT(value))
3237 ANYOF_BITMAP_CLEAR(data->start_class, value);
3238 }
3239 else {
3240 if (data->start_class->flags & ANYOF_LOCALE)
3241 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3242 else {
3243 for (value = 0; value < 256; value++)
3244 if (isDIGIT(value))
b81d288d 3245 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3246 }
3247 }
3248 break;
3249 case NDIGIT:
3250 if (flags & SCF_DO_STCLASS_AND) {
3251 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3252 for (value = 0; value < 256; value++)
3253 if (isDIGIT(value))
3254 ANYOF_BITMAP_CLEAR(data->start_class, value);
3255 }
3256 else {
3257 if (data->start_class->flags & ANYOF_LOCALE)
3258 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3259 else {
3260 for (value = 0; value < 256; value++)
3261 if (!isDIGIT(value))
b81d288d 3262 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3263 }
3264 }
3265 break;
3266 }
3267 if (flags & SCF_DO_STCLASS_OR)
3268 cl_and(data->start_class, &and_with);
3269 flags &= ~SCF_DO_STCLASS;
3270 }
a0ed51b3 3271 }
3dab1dad 3272 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3273 data->flags |= (OP(scan) == MEOL
3274 ? SF_BEFORE_MEOL
3275 : SF_BEFORE_SEOL);
a0ed51b3 3276 }
3dab1dad 3277 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3278 /* Lookbehind, or need to calculate parens/evals/stclass: */
3279 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3280 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3281 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3282 || OP(scan) == UNLESSM )
3283 {
3284 /* Negative Lookahead/lookbehind
3285 In this case we can't do fixed string optimisation.
3286 */
3287
3288 I32 deltanext, minnext, fake = 0;
3289 regnode *nscan;
3290 struct regnode_charclass_class intrnl;
3291 int f = 0;
3292
3293 data_fake.flags = 0;
3294 if (data) {
3295 data_fake.whilem_c = data->whilem_c;
3296 data_fake.last_closep = data->last_closep;
a0ed51b3 3297 }
1de06328
YO
3298 else
3299 data_fake.last_closep = &fake;
3300 if ( flags & SCF_DO_STCLASS && !scan->flags
3301 && OP(scan) == IFMATCH ) { /* Lookahead */
3302 cl_init(pRExC_state, &intrnl);
3303 data_fake.start_class = &intrnl;
3304 f |= SCF_DO_STCLASS_AND;
c277df42 3305 }
1de06328
YO
3306 if (flags & SCF_WHILEM_VISITED_POS)
3307 f |= SCF_WHILEM_VISITED_POS;
3308 next = regnext(scan);
3309 nscan = NEXTOPER(NEXTOPER(scan));
3310 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext, last, &data_fake, f,depth+1);
3311 if (scan->flags) {
3312 if (deltanext) {
3313 vFAIL("Variable length lookbehind not implemented");
3314 }
3315 else if (minnext > (I32)U8_MAX) {
3316 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3317 }
3318 scan->flags = (U8)minnext;
3319 }
3320 if (data) {
3321 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3322 pars++;
3323 if (data_fake.flags & SF_HAS_EVAL)
3324 data->flags |= SF_HAS_EVAL;
3325 data->whilem_c = data_fake.whilem_c;
3326 }
3327 if (f & SCF_DO_STCLASS_AND) {
3328 const int was = (data->start_class->flags & ANYOF_EOS);
3329
3330 cl_and(data->start_class, &intrnl);
3331 if (was)
3332 data->start_class->flags |= ANYOF_EOS;
3333 }
be8e71aa 3334 }
1de06328
YO
3335#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3336 else {
3337 /* Positive Lookahead/lookbehind
3338 In this case we can do fixed string optimisation,
3339 but we must be careful about it. Note in the case of
3340 lookbehind the positions will be offset by the minimum
3341 length of the pattern, something we won't know about
3342 until after the recurse.
3343 */
3344 I32 deltanext, fake = 0;
3345 regnode *nscan;
3346 struct regnode_charclass_class intrnl;
3347 int f = 0;
3348 /* We use SAVEFREEPV so that when the full compile
3349 is finished perl will clean up the allocated
3350 minlens when its all done. This was we don't
3351 have to worry about freeing them when we know
3352 they wont be used, which would be a pain.
3353 */
3354 I32 *minnextp;
3355 Newx( minnextp, 1, I32 );
3356 SAVEFREEPV(minnextp);
3357
3358 if (data) {
3359 StructCopy(data, &data_fake, scan_data_t);
3360 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3361 f |= SCF_DO_SUBSTR;
3362 if (scan->flags)
3363 scan_commit(pRExC_state, &data_fake,minlenp);
3364 data_fake.last_found=newSVsv(data->last_found);
3365 }
3366 }
3367 else
3368 data_fake.last_closep = &fake;
3369 data_fake.flags = 0;
3370 if (is_inf)
3371 data_fake.flags |= SF_IS_INF;
3372 if ( flags & SCF_DO_STCLASS && !scan->flags
3373 && OP(scan) == IFMATCH ) { /* Lookahead */
3374 cl_init(pRExC_state, &intrnl);
3375 data_fake.start_class = &intrnl;
3376 f |= SCF_DO_STCLASS_AND;
3377 }
3378 if (flags & SCF_WHILEM_VISITED_POS)
3379 f |= SCF_WHILEM_VISITED_POS;
3380 next = regnext(scan);
3381 nscan = NEXTOPER(NEXTOPER(scan));
3382
3383 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext, last, &data_fake, f,depth+1);
3384 if (scan->flags) {
3385 if (deltanext) {
3386 vFAIL("Variable length lookbehind not implemented");
3387 }
3388 else if (*minnextp > (I32)U8_MAX) {
3389 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3390 }
3391 scan->flags = (U8)*minnextp;
3392 }
3393
3394 *minnextp += min;
3395
3396
3397 if (f & SCF_DO_STCLASS_AND) {
3398 const int was = (data->start_class->flags & ANYOF_EOS);
3399
3400 cl_and(data->start_class, &intrnl);
3401 if (was)
3402 data->start_class->flags |= ANYOF_EOS;
3403 }
3404 if (data) {
3405 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3406 pars++;
3407 if (data_fake.flags & SF_HAS_EVAL)
3408 data->flags |= SF_HAS_EVAL;
3409 data->whilem_c = data_fake.whilem_c;
3410 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3411 if (RExC_rx->minlen<*minnextp)
3412 RExC_rx->minlen=*minnextp;
3413 scan_commit(pRExC_state, &data_fake, minnextp);
3414 SvREFCNT_dec(data_fake.last_found);
3415
3416 if ( data_fake.minlen_fixed != minlenp )
3417 {
3418 data->offset_fixed= data_fake.offset_fixed;
3419 data->minlen_fixed= data_fake.minlen_fixed;
3420 data->lookbehind_fixed+= scan->flags;
3421 }
3422 if ( data_fake.minlen_float != minlenp )
3423 {
3424 data->minlen_float= data_fake.minlen_float;
3425 data->offset_float_min=data_fake.offset_float_min;
3426 data->offset_float_max=data_fake.offset_float_max;
3427 data->lookbehind_float+= scan->flags;
3428 }
3429 }
3430 }
3431
653099ff 3432
653099ff 3433 }
1de06328 3434#endif
a0ed51b3
LW
3435 }
3436 else if (OP(scan) == OPEN) {
c277df42 3437 pars++;
a0ed51b3 3438 }
cb434fcc 3439 else if (OP(scan) == CLOSE) {
eb160463 3440 if ((I32)ARG(scan) == is_par) {
cb434fcc 3441 next = regnext(scan);
c277df42 3442
cb434fcc
IZ
3443 if ( next && (OP(next) != WHILEM) && next < last)
3444 is_par = 0; /* Disable optimization */
3445 }
3446 if (data)
3447 *(data->last_closep) = ARG(scan);
a0ed51b3
LW
3448 }
3449 else if (OP(scan) == EVAL) {
c277df42
IZ
3450 if (data)
3451 data->flags |= SF_HAS_EVAL;
3452 }
96776eda 3453 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
0f5d15d6 3454 if (flags & SCF_DO_SUBSTR) {
1de06328 3455 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3456 data->longest = &(data->longest_float);
3457 }
3458 is_inf = is_inf_internal = 1;
653099ff 3459 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3460 cl_anything(pRExC_state, data->start_class);
96776eda 3461 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3462 }
786e8c11
YO
3463#ifdef TRIE_STUDY_OPT
3464#ifdef FULL_TRIE_STUDY
3465 else if (PL_regkind[OP(scan)] == TRIE) {
3466 /* NOTE - There is similar code to this block above for handling
3467 BRANCH nodes on the initial study. If you change stuff here
3468 check there too. */
3469 regnode *tail= regnext(scan);
3470 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3471 I32 max1 = 0, min1 = I32_MAX;
3472 struct regnode_charclass_class accum;
3473
3474 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3475 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3476 if (flags & SCF_DO_STCLASS)
3477 cl_init_zero(pRExC_state, &accum);
3478
3479 if (!trie->jump) {
3480 min1= trie->minlen;
3481 max1= trie->maxlen;
3482 } else {
3483 const regnode *nextbranch= NULL;
3484 U32 word;
3485
3486 for ( word=1 ; word <= trie->wordcount ; word++)
3487 {
3488 I32 deltanext=0, minnext=0, f = 0, fake;
3489 struct regnode_charclass_class this_class;
3490
3491 data_fake.flags = 0;
3492 if (data) {
3493 data_fake.whilem_c = data->whilem_c;
3494 data_fake.last_closep = data->last_closep;
3495 }
3496 else
3497 data_fake.last_closep = &fake;
3498
3499 if (flags & SCF_DO_STCLASS) {
3500 cl_init(pRExC_state, &this_class);
3501 data_fake.start_class = &this_class;
3502 f = SCF_DO_STCLASS_AND;
3503 }
3504 if (flags & SCF_WHILEM_VISITED_POS)
3505 f |= SCF_WHILEM_VISITED_POS;
3506
3507 if (trie->jump[word]) {
3508 if (!nextbranch)
3509 nextbranch = tail - trie->jump[0];
3510 scan= tail - trie->jump[word];
3511 /* We go from the jump point to the branch that follows
3512 it. Note this means we need the vestigal unused branches
3513 even though they arent otherwise used.
3514 */
1de06328 3515 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
786e8c11
YO
3516 (regnode *)nextbranch, &data_fake, f,depth+1);
3517 }
3518 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3519 nextbranch= regnext((regnode*)nextbranch);
3520
3521 if (min1 > (I32)(minnext + trie->minlen))
3522 min1 = minnext + trie->minlen;
3523 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3524 max1 = minnext + deltanext + trie->maxlen;
3525 if (deltanext == I32_MAX)
3526 is_inf = is_inf_internal = 1;
3527
3528 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3529 pars++;
3530
3531 if (data) {
3532 if (data_fake.flags & SF_HAS_EVAL)
3533 data->flags |= SF_HAS_EVAL;
3534 data->whilem_c = data_fake.whilem_c;
3535 }
3536 if (flags & SCF_DO_STCLASS)
3537 cl_or(pRExC_state, &accum, &this_class);
3538 }
3539 }
3540 if (flags & SCF_DO_SUBSTR) {
3541 data->pos_min += min1;
3542 data->pos_delta += max1 - min1;
3543 if (max1 != min1 || is_inf)
3544 data->longest = &(data->longest_float);
3545 }
3546 min += min1;
3547 delta += max1 - min1;
3548 if (flags & SCF_DO_STCLASS_OR) {
3549 cl_or(pRExC_state, data->start_class, &accum);
3550 if (min1) {
3551 cl_and(data->start_class, &and_with);
3552 flags &= ~SCF_DO_STCLASS;
3553 }
3554 }
3555 else if (flags & SCF_DO_STCLASS_AND) {
3556 if (min1) {
3557 cl_and(data->start_class, &accum);
3558 flags &= ~SCF_DO_STCLASS;
3559 }
3560 else {
3561 /* Switch to OR mode: cache the old value of
3562 * data->start_class */
3563 StructCopy(data->start_class, &and_with,
3564 struct regnode_charclass_class);
3565 flags &= ~SCF_DO_STCLASS_AND;
3566 StructCopy(&accum, data->start_class,
3567 struct regnode_charclass_class);
3568 flags |= SCF_DO_STCLASS_OR;
3569 data->start_class->flags |= ANYOF_EOS;
3570 }
3571 }
3572 scan= tail;
3573 continue;
3574 }
3575#else
3576 else if (PL_regkind[OP(scan)] == TRIE) {
3577 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3578 U8*bang=NULL;
3579
3580 min += trie->minlen;
3581 delta += (trie->maxlen - trie->minlen);
3582 flags &= ~SCF_DO_STCLASS; /* xxx */
3583 if (flags & SCF_DO_SUBSTR) {
1de06328 3584 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3585 data->pos_min += trie->minlen;
3586 data->pos_delta += (trie->maxlen - trie->minlen);
3587 if (trie->maxlen != trie->minlen)
3588 data->longest = &(data->longest_float);
3589 }
3590 if (trie->jump) /* no more substrings -- for now /grr*/
3591 flags &= ~SCF_DO_SUBSTR;
3592 }
3593#endif /* old or new */
3594#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3595 /* Else: zero-length, ignore. */
3596 scan = regnext(scan);
3597 }
3598
3599 finish:
3600 *scanp = scan;
aca2d497 3601 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3602 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3603 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3604 if (is_par > (I32)U8_MAX)
c277df42
IZ
3605 is_par = 0;
3606 if (is_par && pars==1 && data) {
3607 data->flags |= SF_IN_PAR;
3608 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3609 }
3610 else if (pars && data) {
c277df42
IZ
3611 data->flags |= SF_HAS_PAR;
3612 data->flags &= ~SF_IN_PAR;
3613 }
653099ff
GS
3614 if (flags & SCF_DO_STCLASS_OR)
3615 cl_and(data->start_class, &and_with);
786e8c11
YO
3616 if (flags & SCF_TRIE_RESTUDY)
3617 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3618
3619 DEBUG_STUDYDATA(data,depth);
3620
c277df42
IZ
3621 return min;
3622}
3623
76e3520e 3624STATIC I32
5f66b61c 3625S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3626{
830247a4 3627 if (RExC_rx->data) {
b81d288d
AB
3628 Renewc(RExC_rx->data,
3629 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
c277df42 3630 char, struct reg_data);
830247a4
IZ
3631 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
3632 RExC_rx->data->count += n;
a0ed51b3
LW
3633 }
3634 else {
a02a5408 3635 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3636 char, struct reg_data);
a02a5408 3637 Newx(RExC_rx->data->what, n, U8);
830247a4 3638 RExC_rx->data->count = n;
c277df42 3639 }
830247a4
IZ
3640 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3641 return RExC_rx->data->count - n;
c277df42
IZ
3642}
3643
76234dfb 3644#ifndef PERL_IN_XSUB_RE
d88dccdf 3645void
864dbfa3 3646Perl_reginitcolors(pTHX)
d88dccdf 3647{
97aff369 3648 dVAR;
1df70142 3649 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3650 if (s) {
1df70142
AL
3651 char *t = savepv(s);
3652 int i = 0;
3653 PL_colors[0] = t;
d88dccdf 3654 while (++i < 6) {
1df70142
AL
3655 t = strchr(t, '\t');
3656 if (t) {
3657 *t = '\0';
3658 PL_colors[i] = ++t;
d88dccdf
IZ
3659 }
3660 else
1df70142 3661 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3662 }
3663 } else {
1df70142 3664 int i = 0;
b81d288d 3665 while (i < 6)
06b5626a 3666 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3667 }
3668 PL_colorset = 1;
3669}
76234dfb 3670#endif
8615cb43 3671
07be1b83 3672
786e8c11
YO
3673#ifdef TRIE_STUDY_OPT
3674#define CHECK_RESTUDY_GOTO \
3675 if ( \
3676 (data.flags & SCF_TRIE_RESTUDY) \
3677 && ! restudied++ \
3678 ) goto reStudy
3679#else
3680#define CHECK_RESTUDY_GOTO
3681#endif
f9f4320a 3682
a687059c 3683/*
e50aee73 3684 - pregcomp - compile a regular expression into internal code
a687059c
LW
3685 *
3686 * We can't allocate space until we know how big the compiled form will be,
3687 * but we can't compile it (and thus know how big it is) until we've got a
3688 * place to put the code. So we cheat: we compile it twice, once with code
3689 * generation turned off and size counting turned on, and once "for real".
3690 * This also means that we don't allocate space until we are sure that the
3691 * thing really will compile successfully, and we never have to move the
3692 * code and thus invalidate pointers into it. (Note that it has to be in
3693 * one piece because free() must be able to free it all.) [NB: not true in perl]
3694 *
3695 * Beware that the optimization-preparation code in here knows about some
3696 * of the structure of the compiled regexp. [I'll say.]
3697 */
f9f4320a
YO
3698#ifndef PERL_IN_XSUB_RE
3699#define CORE_ONLY_BLOCK(c) {c}{
3700#define RE_ENGINE_PTR &PL_core_reg_engine
3701#else
3702#define CORE_ONLY_BLOCK(c) {
3703extern const struct regexp_engine my_reg_engine;
3704#define RE_ENGINE_PTR &my_reg_engine
3705#endif
3706#define END_BLOCK }
3707
a687059c 3708regexp *
864dbfa3 3709Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3710{
97aff369 3711 dVAR;
f9f4320a
YO
3712 GET_RE_DEBUG_FLAGS_DECL;
3713 DEBUG_r(if (!PL_colorset) reginitcolors());
3714 CORE_ONLY_BLOCK(
3715 /* Dispatch a request to compile a regexp to correct
3716 regexp engine. */
3717 HV * const table = GvHV(PL_hintgv);
3718 if (table) {
3719 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
3720 if (ptr && SvIOK(*ptr)) {
3721 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3722 DEBUG_COMPILE_r({
8d8756e7 3723 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3724 SvIV(*ptr));
3725 });
f2f78491 3726 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a
YO
3727 }
3728 })
a0d0e21e 3729 register regexp *r;
c277df42 3730 regnode *scan;
c277df42 3731 regnode *first;
a0d0e21e 3732 I32 flags;
a0d0e21e
LW
3733 I32 minlen = 0;
3734 I32 sawplus = 0;
3735 I32 sawopen = 0;
2c2d71f5 3736 scan_data_t data;
830247a4 3737 RExC_state_t RExC_state;
be8e71aa 3738 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3739#ifdef TRIE_STUDY_OPT
3740 int restudied= 0;
3741 RExC_state_t copyRExC_state;
3742#endif
a0d0e21e 3743 if (exp == NULL)
c277df42 3744 FAIL("NULL regexp argument");
a0d0e21e 3745
a5961de5 3746 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3747
5cfc7842 3748 RExC_precomp = exp;
a3621e74 3749 DEBUG_COMPILE_r({
ab3bbdeb
YO
3750 SV *dsv= sv_newmortal();
3751 RE_PV_QUOTED_DECL(s, RExC_utf8,
3752 dsv, RExC_precomp, (xend - exp), 60);
3753 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3754 PL_colors[4],PL_colors[5],s);
a5961de5 3755 });
e2509266 3756 RExC_flags = pm->op_pmflags;
830247a4 3757 RExC_sawback = 0;
bbce6d69 3758
830247a4
IZ
3759 RExC_seen = 0;
3760 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3761 RExC_seen_evals = 0;
3762 RExC_extralen = 0;
c277df42 3763
bbce6d69 3764 /* First pass: determine size, legality. */
830247a4 3765 RExC_parse = exp;
fac92740 3766 RExC_start = exp;
830247a4
IZ
3767 RExC_end = xend;
3768 RExC_naughty = 0;
3769 RExC_npar = 1;
3770 RExC_size = 0L;
3771 RExC_emit = &PL_regdummy;
3772 RExC_whilem_seen = 0;
fc8cd66c 3773 RExC_charnames = NULL;
6bda09f9 3774 RExC_parens= NULL;
fc8cd66c 3775
85ddcde9
JH
3776#if 0 /* REGC() is (currently) a NOP at the first pass.
3777 * Clever compilers notice this and complain. --jhi */
830247a4 3778 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3779#endif
3dab1dad
YO
3780 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3781 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3782 RExC_precomp = NULL;
a0d0e21e
LW
3783 return(NULL);
3784 }
3dab1dad
YO
3785 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Required "));
3786 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" nodes ", (IV)RExC_size));
3787 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nStarting second pass (creation)\n"));
07be1b83
YO
3788 DEBUG_PARSE_r({
3789 RExC_lastnum=0;
3790 RExC_lastparse=NULL;
3791 });
c277df42 3792
07be1b83 3793
c277df42
IZ
3794 /* Small enough for pointer-storage convention?
3795 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3796 if (RExC_size >= 0x10000L && RExC_extralen)
3797 RExC_size += RExC_extralen;
c277df42 3798 else
830247a4
IZ
3799 RExC_extralen = 0;
3800 if (RExC_whilem_seen > 15)
3801 RExC_whilem_seen = 15;
a0d0e21e 3802
f9f4320a
YO
3803 /* Allocate space and zero-initialize. Note, the two step process
3804 of zeroing when in debug mode, thus anything assigned has to
3805 happen after that */
a02a5408 3806 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3807 char, regexp);
a0d0e21e 3808 if (r == NULL)
b45f050a 3809 FAIL("Regexp out of space");
0f79a09d
GS
3810#ifdef DEBUGGING
3811 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3812 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3813#endif
f9f4320a
YO
3814 /* initialization begins here */
3815 r->engine= RE_ENGINE_PTR;
c277df42 3816 r->refcnt = 1;
bbce6d69 3817 r->prelen = xend - exp;
5cfc7842 3818 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3819 r->subbeg = NULL;
f8c7b90f 3820#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3821 r->saved_copy = NULL;
ed252734 3822#endif
cf93c79d 3823 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3824 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3825 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3826
3827 r->substrs = 0; /* Useful during FAIL. */
3828 r->startp = 0; /* Useful during FAIL. */
4327152a 3829
6bda09f9
YO
3830 r->endp = 0;
3831 if (RExC_seen & REG_SEEN_RECURSE) {
3832 Newx(RExC_parens, RExC_npar,regnode *);
3833 SAVEFREEPV(RExC_parens);
3834 }
3835
3836 /* Useful during FAIL. */
a02a5408 3837 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3838 if (r->offsets) {
2af232bd 3839 r->offsets[0] = RExC_size;
fac92740 3840 }
a3621e74 3841 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3842 "%s %"UVuf" bytes for offset annotations.\n",
3843 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3844 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 3845
830247a4 3846 RExC_rx = r;
bbce6d69 3847
3848 /* Second pass: emit code. */
e2509266 3849 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
3850 RExC_parse = exp;
3851 RExC_end = xend;
3852 RExC_naughty = 0;
3853 RExC_npar = 1;
fac92740 3854 RExC_emit_start = r->program;
830247a4 3855 RExC_emit = r->program;
2cd61cdb 3856 /* Store the count of eval-groups for security checks: */
786e8c11 3857 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 3858 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 3859 r->data = 0;
3dab1dad 3860 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 3861 return(NULL);
6bda09f9 3862
07be1b83
YO
3863 /* XXXX To minimize changes to RE engine we always allocate
3864 3-units-long substrs field. */
3865 Newx(r->substrs, 1, struct reg_substr_data);
a0d0e21e 3866
07be1b83 3867reStudy:
1de06328 3868 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83
YO
3869 Zero(r->substrs, 1, struct reg_substr_data);
3870 StructCopy(&zero_scan_data, &data, scan_data_t);
a3621e74 3871
07be1b83
YO
3872#ifdef TRIE_STUDY_OPT
3873 if ( restudied ) {
3874 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
3875 RExC_state=copyRExC_state;
1de06328 3876 if (data.last_found) {
07be1b83 3877 SvREFCNT_dec(data.longest_fixed);
07be1b83 3878 SvREFCNT_dec(data.longest_float);
07be1b83 3879 SvREFCNT_dec(data.last_found);
1de06328 3880 }
07be1b83
YO
3881 } else {
3882 copyRExC_state=RExC_state;
3883 }
3884#endif
fc8cd66c 3885
a0d0e21e 3886 /* Dig out information for optimizations. */
cf93c79d 3887 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 3888 pm->op_pmflags = RExC_flags;
a0ed51b3 3889 if (UTF)
5ff6fc6d 3890 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 3891 r->regstclass = NULL;
830247a4 3892 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 3893 r->reganch |= ROPT_NAUGHTY;
c277df42 3894 scan = r->program + 1; /* First BRANCH. */
2779dcf1 3895
1de06328
YO
3896 /* testing for BRANCH here tells us whether there is "must appear"
3897 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 3898 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 3899 I32 fake;
c5254dd6 3900 STRLEN longest_float_length, longest_fixed_length;
07be1b83 3901 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 3902 int stclass_flag;
07be1b83 3903 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
3904
3905 first = scan;
c277df42 3906 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 3907 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 3908 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 3909 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
3910 /* for now we can't handle lookbehind IFMATCH*/
3911 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
3912 (OP(first) == PLUS) ||
3913 (OP(first) == MINMOD) ||
653099ff 3914 /* An {n,m} with n>0 */
07be1b83
YO
3915 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
3916 {
786e8c11 3917
a0d0e21e
LW
3918 if (OP(first) == PLUS)
3919 sawplus = 1;
3920 else
3dab1dad 3921 first += regarglen[OP(first)];
07be1b83
YO
3922 if (OP(first) == IFMATCH) {
3923 first = NEXTOPER(first);
3924 first += EXTRA_STEP_2ARGS;
7c167cea 3925 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 3926 first = NEXTOPER(first);
a687059c
LW
3927 }
3928
a0d0e21e
LW
3929 /* Starting-point info. */
3930 again:
786e8c11 3931 DEBUG_PEEP("first:",first,0);
07be1b83 3932 /* Ignore EXACT as we deal with it later. */
3dab1dad 3933 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 3934 if (OP(first) == EXACT)
6f207bd3 3935 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 3936 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
3937 r->regstclass = first;
3938 }
07be1b83 3939#ifdef TRIE_STCLASS
786e8c11 3940 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
3941 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
3942 {
786e8c11 3943 regnode *trie_op;
07be1b83 3944 /* this can happen only on restudy */
786e8c11
YO
3945 if ( OP(first) == TRIE ) {
3946 struct regnode_1 *trieop;
3947 Newxz(trieop,1,struct regnode_1);
3948 StructCopy(first,trieop,struct regnode_1);
3949 trie_op=(regnode *)trieop;
3950 } else {
3951 struct regnode_charclass *trieop;
3952 Newxz(trieop,1,struct regnode_charclass);
3953 StructCopy(first,trieop,struct regnode_charclass);
3954 trie_op=(regnode *)trieop;
3955 }
1de06328 3956 OP(trie_op)+=2;
786e8c11
YO
3957 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
3958 r->regstclass = trie_op;
07be1b83
YO
3959 }
3960#endif
bfed75c6 3961 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 3962 r->regstclass = first;
3dab1dad
YO
3963 else if (PL_regkind[OP(first)] == BOUND ||
3964 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 3965 r->regstclass = first;
3dab1dad 3966 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
3967 r->reganch |= (OP(first) == MBOL
3968 ? ROPT_ANCH_MBOL
3969 : (OP(first) == SBOL
3970 ? ROPT_ANCH_SBOL
3971 : ROPT_ANCH_BOL));
a0d0e21e 3972 first = NEXTOPER(first);
774d564b 3973 goto again;
3974 }
3975 else if (OP(first) == GPOS) {
3976 r->reganch |= ROPT_ANCH_GPOS;
3977 first = NEXTOPER(first);
3978 goto again;
a0d0e21e 3979 }
e09294f4 3980 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 3981 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
3982 !(r->reganch & ROPT_ANCH) )
3983 {
3984 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
3985 const int type =
3986 (OP(NEXTOPER(first)) == REG_ANY)
3987 ? ROPT_ANCH_MBOL
3988 : ROPT_ANCH_SBOL;
cad2e5aa 3989 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 3990 first = NEXTOPER(first);
774d564b 3991 goto again;
a0d0e21e 3992 }
b81d288d 3993 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 3994 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
3995 /* x+ must match at the 1st pos of run of x's */
3996 r->reganch |= ROPT_SKIP;
a0d0e21e 3997
c277df42 3998 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa
YO
3999#ifdef TRIE_STUDY_OPT
4000 DEBUG_COMPILE_r(
4001 if (!restudied)
4002 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4003 (IV)(first - scan + 1))
4004 );
4005#else
4006 DEBUG_COMPILE_r(
4007 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4008 (IV)(first - scan + 1))
4009 );
4010#endif
4011
4012
a0d0e21e
LW
4013 /*
4014 * If there's something expensive in the r.e., find the
4015 * longest literal string that must appear and make it the
4016 * regmust. Resolve ties in favor of later strings, since
4017 * the regstart check works with the beginning of the r.e.
4018 * and avoiding duplication strengthens checking. Not a
4019 * strong reason, but sufficient in the absence of others.
4020 * [Now we resolve ties in favor of the earlier string if
c277df42 4021 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4022 * earlier string may buy us something the later one won't.]
4023 */
a0d0e21e 4024 minlen = 0;
a687059c 4025
396482e1
GA
4026 data.longest_fixed = newSVpvs("");
4027 data.longest_float = newSVpvs("");
4028 data.last_found = newSVpvs("");
c277df42
IZ
4029 data.longest = &(data.longest_fixed);
4030 first = scan;
653099ff 4031 if (!r->regstclass) {
830247a4 4032 cl_init(pRExC_state, &ch_class);
653099ff
GS
4033 data.start_class = &ch_class;
4034 stclass_flag = SCF_DO_STCLASS_AND;
4035 } else /* XXXX Check for BOUND? */
4036 stclass_flag = 0;
cb434fcc 4037 data.last_closep = &last_close;
653099ff 4038
1de06328 4039 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
a3621e74 4040 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4041
07be1b83 4042
786e8c11
YO
4043 CHECK_RESTUDY_GOTO;
4044
4045
830247a4 4046 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4047 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4048 && !RExC_seen_zerolen
4049 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4050 r->reganch |= ROPT_CHECK_ALL;
1de06328 4051 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4052 SvREFCNT_dec(data.last_found);
4053
1de06328
YO
4054 /* Note that code very similar to this but for anchored string
4055 follows immediately below, changes may need to be made to both.
4056 Be careful.
4057 */
a0ed51b3 4058 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4059 if (longest_float_length
c277df42
IZ
4060 || (data.flags & SF_FL_BEFORE_EOL
4061 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4062 || (RExC_flags & PMf_MULTILINE))))
4063 {
1182767e 4064 I32 t,ml;
cf93c79d 4065
1de06328 4066 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4067 && data.offset_fixed == data.offset_float_min
4068 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4069 goto remove_float; /* As in (a)+. */
4070
1de06328
YO
4071 /* copy the information about the longest float from the reg_scan_data
4072 over to the program. */
33b8afdf
JH
4073 if (SvUTF8(data.longest_float)) {
4074 r->float_utf8 = data.longest_float;
c445ea15 4075 r->float_substr = NULL;
33b8afdf
JH
4076 } else {
4077 r->float_substr = data.longest_float;
c445ea15 4078 r->float_utf8 = NULL;
33b8afdf 4079 }
1de06328
YO
4080 /* float_end_shift is how many chars that must be matched that
4081 follow this item. We calculate it ahead of time as once the
4082 lookbehind offset is added in we lose the ability to correctly
4083 calculate it.*/
4084 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4085 : (I32)longest_float_length;
1de06328
YO
4086 r->float_end_shift = ml - data.offset_float_min
4087 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4088 + data.lookbehind_float;
4089 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4090 r->float_max_offset = data.offset_float_max;
1182767e 4091 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4092 r->float_max_offset -= data.lookbehind_float;
4093
cf93c79d
IZ
4094 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4095 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4096 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4097 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4098 }
4099 else {
aca2d497 4100 remove_float:
c445ea15 4101 r->float_substr = r->float_utf8 = NULL;
c277df42 4102 SvREFCNT_dec(data.longest_float);
c5254dd6 4103 longest_float_length = 0;
a0d0e21e 4104 }
c277df42 4105
1de06328
YO
4106 /* Note that code very similar to this but for floating string
4107 is immediately above, changes may need to be made to both.
4108 Be careful.
4109 */
a0ed51b3 4110 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4111 if (longest_fixed_length
c277df42
IZ
4112 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4113 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4114 || (RExC_flags & PMf_MULTILINE))))
4115 {
1182767e 4116 I32 t,ml;
cf93c79d 4117
1de06328
YO
4118 /* copy the information about the longest fixed
4119 from the reg_scan_data over to the program. */
33b8afdf
JH
4120 if (SvUTF8(data.longest_fixed)) {
4121 r->anchored_utf8 = data.longest_fixed;
c445ea15 4122 r->anchored_substr = NULL;
33b8afdf
JH
4123 } else {
4124 r->anchored_substr = data.longest_fixed;
c445ea15 4125 r->anchored_utf8 = NULL;
33b8afdf 4126 }
1de06328
YO
4127 /* fixed_end_shift is how many chars that must be matched that
4128 follow this item. We calculate it ahead of time as once the
4129 lookbehind offset is added in we lose the ability to correctly
4130 calculate it.*/
4131 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4132 : (I32)longest_fixed_length;
1de06328
YO
4133 r->anchored_end_shift = ml - data.offset_fixed
4134 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4135 + data.lookbehind_fixed;
4136 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4137
cf93c79d
IZ
4138 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4139 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4140 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4141 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4142 }
4143 else {
c445ea15 4144 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4145 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4146 longest_fixed_length = 0;
a0d0e21e 4147 }
b81d288d 4148 if (r->regstclass
ffc61ed2 4149 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4150 r->regstclass = NULL;
33b8afdf
JH
4151 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4152 && stclass_flag
653099ff 4153 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4154 && !cl_is_anything(data.start_class))
4155 {
1df70142 4156 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4157
a02a5408 4158 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4159 struct regnode_charclass_class);
4160 StructCopy(data.start_class,
830247a4 4161 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4162 struct regnode_charclass_class);
830247a4 4163 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4164 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4165 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4166 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4167 PerlIO_printf(Perl_debug_log,
a0288114 4168 "synthetic stclass \"%s\".\n",
3f7c398e 4169 SvPVX_const(sv));});
653099ff 4170 }
c277df42
IZ
4171
4172 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4173 if (longest_fixed_length > longest_float_length) {
1de06328 4174 r->check_end_shift = r->anchored_end_shift;
c277df42 4175 r->check_substr = r->anchored_substr;
33b8afdf 4176 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4177 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4178 if (r->reganch & ROPT_ANCH_SINGLE)
4179 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4180 }
4181 else {
1de06328 4182 r->check_end_shift = r->float_end_shift;
c277df42 4183 r->check_substr = r->float_substr;
33b8afdf 4184 r->check_utf8 = r->float_utf8;
1de06328
YO
4185 r->check_offset_min = r->float_min_offset;
4186 r->check_offset_max = r->float_max_offset;
a0d0e21e 4187 }
30382c73
IZ
4188 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4189 This should be changed ASAP! */
33b8afdf 4190 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4191 r->reganch |= RE_USE_INTUIT;
33b8afdf 4192 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4193 r->reganch |= RE_INTUIT_TAIL;
4194 }
1de06328
YO
4195 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4196 if ( (STRLEN)minlen < longest_float_length )
4197 minlen= longest_float_length;
4198 if ( (STRLEN)minlen < longest_fixed_length )
4199 minlen= longest_fixed_length;
4200 */
a0ed51b3
LW
4201 }
4202 else {
c277df42
IZ
4203 /* Several toplevels. Best we can is to set minlen. */
4204 I32 fake;
653099ff 4205 struct regnode_charclass_class ch_class;
cb434fcc 4206 I32 last_close = 0;
c277df42 4207
a3621e74 4208 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4209
c277df42 4210 scan = r->program + 1;
830247a4 4211 cl_init(pRExC_state, &ch_class);
653099ff 4212 data.start_class = &ch_class;
cb434fcc 4213 data.last_closep = &last_close;
07be1b83 4214
1de06328 4215 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
07be1b83
YO
4216 &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
4217
786e8c11 4218 CHECK_RESTUDY_GOTO;
07be1b83 4219
33b8afdf 4220 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4221 = r->float_substr = r->float_utf8 = NULL;
653099ff 4222 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4223 && !cl_is_anything(data.start_class))
4224 {
1df70142 4225 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4226
a02a5408 4227 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4228 struct regnode_charclass_class);
4229 StructCopy(data.start_class,
830247a4 4230 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4231 struct regnode_charclass_class);
830247a4 4232 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4233 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4234 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4235 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4236 PerlIO_printf(Perl_debug_log,
a0288114 4237 "synthetic stclass \"%s\".\n",
3f7c398e 4238 SvPVX_const(sv));});
653099ff 4239 }
a0d0e21e
LW
4240 }
4241
1de06328
YO
4242 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4243 the "real" pattern. */
4244 if (r->minlen < minlen)
4245 r->minlen = minlen;
4246
b81d288d 4247 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4248 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4249 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4250 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4251 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4252 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4253 if (RExC_seen & REG_SEEN_CANY)
4254 r->reganch |= ROPT_CANY_SEEN;
a02a5408
JC
4255 Newxz(r->startp, RExC_npar, I32);
4256 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4257
f2278c82 4258 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4259 DEBUG_DUMP_r({
4260 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4261 regdump(r);
4262 });
8e9a8a48
YO
4263 DEBUG_OFFSETS_r(if (r->offsets) {
4264 const U32 len = r->offsets[0];
4265 U32 i;
4266 GET_RE_DEBUG_FLAGS_DECL;
4267 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4268 for (i = 1; i <= len; i++) {
4269 if (r->offsets[i*2-1] || r->offsets[i*2])
4270 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4271 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4272 }
4273 PerlIO_printf(Perl_debug_log, "\n");
4274 });
a0d0e21e 4275 return(r);
f9f4320a 4276 END_BLOCK
a687059c
LW
4277}
4278
f9f4320a
YO
4279#undef CORE_ONLY_BLOCK
4280#undef END_BLOCK
4281#undef RE_ENGINE_PTR
3dab1dad
YO
4282
4283#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4284 int rem=(int)(RExC_end - RExC_parse); \
4285 int cut; \
4286 int num; \
4287 int iscut=0; \
4288 if (rem>10) { \
4289 rem=10; \
4290 iscut=1; \
4291 } \
4292 cut=10-rem; \
4293 if (RExC_lastparse!=RExC_parse) \
4294 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4295 rem, RExC_parse, \
4296 cut + 4, \
4297 iscut ? "..." : "<" \
4298 ); \
4299 else \
4300 PerlIO_printf(Perl_debug_log,"%16s",""); \
4301 \
4302 if (SIZE_ONLY) \
4303 num=RExC_size; \
4304 else \
4305 num=REG_NODE_NUM(RExC_emit); \
4306 if (RExC_lastnum!=num) \
be8e71aa 4307 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4308 else \
be8e71aa
YO
4309 PerlIO_printf(Perl_debug_log,"|%4s",""); \
4310 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4311 (int)((depth*2)), "", \
3dab1dad
YO
4312 (funcname) \
4313 ); \
4314 RExC_lastnum=num; \
4315 RExC_lastparse=RExC_parse; \
4316})
4317
07be1b83
YO
4318
4319
3dab1dad
YO
4320#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4321 DEBUG_PARSE_MSG((funcname)); \
4322 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4323})
6bda09f9
YO
4324#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4325 DEBUG_PARSE_MSG((funcname)); \
4326 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4327})
a687059c
LW
4328/*
4329 - reg - regular expression, i.e. main body or parenthesized thing
4330 *
4331 * Caller must absorb opening parenthesis.
4332 *
4333 * Combining parenthesis handling with the base level of regular expression
4334 * is a trifle forced, but the need to tie the tails of the branches to what
4335 * follows makes it hard to avoid.
4336 */
07be1b83
YO
4337#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4338#ifdef DEBUGGING
4339#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4340#else
4341#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4342#endif
3dab1dad 4343
76e3520e 4344STATIC regnode *
3dab1dad 4345S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4346 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4347{
27da23d5 4348 dVAR;
c277df42
IZ
4349 register regnode *ret; /* Will be the head of the group. */
4350 register regnode *br;
4351 register regnode *lastbr;
cbbf8932 4352 register regnode *ender = NULL;
a0d0e21e 4353 register I32 parno = 0;
cbbf8932
AL
4354 I32 flags;
4355 const I32 oregflags = RExC_flags;
6136c704
AL
4356 bool have_branch = 0;
4357 bool is_open = 0;
9d1d55b5
JP
4358
4359 /* for (?g), (?gc), and (?o) warnings; warning
4360 about (?c) will warn about (?g) -- japhy */
4361
6136c704
AL
4362#define WASTED_O 0x01
4363#define WASTED_G 0x02
4364#define WASTED_C 0x04
4365#define WASTED_GC (0x02|0x04)
cbbf8932 4366 I32 wastedflags = 0x00;
9d1d55b5 4367
fac92740 4368 char * parse_start = RExC_parse; /* MJD */
a28509cc 4369 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4370
3dab1dad
YO
4371 GET_RE_DEBUG_FLAGS_DECL;
4372 DEBUG_PARSE("reg ");
4373
4374
821b33a5 4375 *flagp = 0; /* Tentatively. */
a0d0e21e 4376
9d1d55b5 4377
a0d0e21e
LW
4378 /* Make an OPEN node, if parenthesized. */
4379 if (paren) {
fac92740 4380 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4381 U32 posflags = 0, negflags = 0;
4382 U32 *flagsp = &posflags;
6136c704 4383 bool is_logical = 0;
a28509cc 4384 const char * const seqstart = RExC_parse;
ca9dfc88 4385
830247a4
IZ
4386 RExC_parse++;
4387 paren = *RExC_parse++;
c277df42 4388 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4389 switch (paren) {
fac92740 4390 case '<': /* (?<...) */
830247a4 4391 RExC_seen |= REG_SEEN_LOOKBEHIND;
b81d288d 4392 if (*RExC_parse == '!')
c277df42 4393 paren = ',';
b81d288d 4394 if (*RExC_parse != '=' && *RExC_parse != '!')
c277df42 4395 goto unknown;
830247a4 4396 RExC_parse++;
fac92740
MJD
4397 case '=': /* (?=...) */
4398 case '!': /* (?!...) */
830247a4 4399 RExC_seen_zerolen++;
fac92740
MJD
4400 case ':': /* (?:...) */
4401 case '>': /* (?>...) */
a0d0e21e 4402 break;
fac92740
MJD
4403 case '$': /* (?$...) */
4404 case '@': /* (?@...) */
8615cb43 4405 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4406 break;
fac92740 4407 case '#': /* (?#...) */
830247a4
IZ
4408 while (*RExC_parse && *RExC_parse != ')')
4409 RExC_parse++;
4410 if (*RExC_parse != ')')
c277df42 4411 FAIL("Sequence (?#... not terminated");
830247a4 4412 nextchar(pRExC_state);
a0d0e21e
LW
4413 *flagp = TRYAGAIN;
4414 return NULL;
6bda09f9
YO
4415 case 'R' :
4416 if (*RExC_parse != ')')
4417 FAIL("Sequence (?R) not terminated");
4418 reg_node(pRExC_state, SRECURSE);
4419 break;
4420 case '1': case '2': case '3': case '4': /* (?1) */
4421 case '5': case '6': case '7': case '8': case '9':
4422 RExC_parse--;
4423 {
4424 const I32 num = atoi(RExC_parse);
4425 char * const parse_start = RExC_parse - 1; /* MJD */
4426 while (isDIGIT(*RExC_parse))
4427 RExC_parse++;
4428 if (*RExC_parse!=')')
4429 vFAIL("Expecting close bracket");
4430 ret = reganode(pRExC_state, RECURSE, num);
4431 if (!SIZE_ONLY) {
4432 if (num > (I32)RExC_rx->nparens) {
4433 RExC_parse++;
4434 vFAIL("Reference to nonexistent group");
4435 }
4436 ARG2L_SET( ret, 0);
4437 RExC_emit++;
226de585
RGS
4438 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
4439 "Recurse #%"UVuf" to %"IVdf"\n", ARG(ret), ARG2L(ret)));
6bda09f9
YO
4440 } else{
4441 RExC_size++;
4442 RExC_seen|=REG_SEEN_RECURSE;
4443 }
4444 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4445 Set_Node_Offset(ret, parse_start); /* MJD */
4446
6bda09f9
YO
4447 nextchar(pRExC_state);
4448 return ret;
4449 }
fac92740 4450 case 'p': /* (?p...) */
9014280d 4451 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4452 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4453 /* FALL THROUGH*/
fac92740 4454 case '?': /* (??...) */
6136c704 4455 is_logical = 1;
438a3801
YST
4456 if (*RExC_parse != '{')
4457 goto unknown;
830247a4 4458 paren = *RExC_parse++;
0f5d15d6 4459 /* FALL THROUGH */
fac92740 4460 case '{': /* (?{...}) */
c277df42 4461 {
c277df42
IZ
4462 I32 count = 1, n = 0;
4463 char c;
830247a4 4464 char *s = RExC_parse;
c277df42 4465
830247a4
IZ
4466 RExC_seen_zerolen++;
4467 RExC_seen |= REG_SEEN_EVAL;
4468 while (count && (c = *RExC_parse)) {
6136c704
AL
4469 if (c == '\\') {
4470 if (RExC_parse[1])
4471 RExC_parse++;
4472 }
b81d288d 4473 else if (c == '{')
c277df42 4474 count++;
b81d288d 4475 else if (c == '}')
c277df42 4476 count--;
830247a4 4477 RExC_parse++;
c277df42 4478 }
6136c704 4479 if (*RExC_parse != ')') {
b81d288d 4480 RExC_parse = s;
b45f050a
JF
4481 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4482 }
c277df42 4483 if (!SIZE_ONLY) {
f3548bdc 4484 PAD *pad;
6136c704
AL
4485 OP_4tree *sop, *rop;
4486 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4487
569233ed
SB
4488 ENTER;
4489 Perl_save_re_context(aTHX);
f3548bdc 4490 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4491 sop->op_private |= OPpREFCOUNTED;
4492 /* re_dup will OpREFCNT_inc */
4493 OpREFCNT_set(sop, 1);
569233ed 4494 LEAVE;
c277df42 4495
830247a4
IZ
4496 n = add_data(pRExC_state, 3, "nop");
4497 RExC_rx->data->data[n] = (void*)rop;
4498 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4499 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4500 SvREFCNT_dec(sv);
a0ed51b3 4501 }
e24b16f9 4502 else { /* First pass */
830247a4 4503 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4504 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4505 /* No compiled RE interpolated, has runtime
4506 components ===> unsafe. */
4507 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4508 if (PL_tainting && PL_tainted)
cc6b7395 4509 FAIL("Eval-group in insecure regular expression");
54df2634 4510#if PERL_VERSION > 8
923e4eb5 4511 if (IN_PERL_COMPILETIME)
b5c19bd7 4512 PL_cv_has_eval = 1;
54df2634 4513#endif
c277df42 4514 }
b5c19bd7 4515
830247a4 4516 nextchar(pRExC_state);
6136c704 4517 if (is_logical) {
830247a4 4518 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4519 if (!SIZE_ONLY)
4520 ret->flags = 2;
3dab1dad 4521 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4522 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4523 return ret;
4524 }
ccb2c380
MP
4525 ret = reganode(pRExC_state, EVAL, n);
4526 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4527 Set_Node_Offset(ret, parse_start);
4528 return ret;
c277df42 4529 }
fac92740 4530 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4531 {
fac92740 4532 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4533 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4534 || RExC_parse[1] == '<'
830247a4 4535 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4536 I32 flag;
4537
830247a4 4538 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4539 if (!SIZE_ONLY)
4540 ret->flags = 1;
3dab1dad 4541 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4542 goto insert_if;
b81d288d 4543 }
a0ed51b3 4544 }
830247a4 4545 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4546 /* (?(1)...) */
6136c704 4547 char c;
830247a4 4548 parno = atoi(RExC_parse++);
c277df42 4549
830247a4
IZ
4550 while (isDIGIT(*RExC_parse))
4551 RExC_parse++;
fac92740 4552 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4553
830247a4 4554 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4555 vFAIL("Switch condition not recognized");
c277df42 4556 insert_if:
3dab1dad
YO
4557 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4558 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4559 if (br == NULL)
830247a4 4560 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4561 else
3dab1dad 4562 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4563 c = *nextchar(pRExC_state);
d1b80229
IZ
4564 if (flags&HASWIDTH)
4565 *flagp |= HASWIDTH;
c277df42 4566 if (c == '|') {
830247a4 4567 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4568 regbranch(pRExC_state, &flags, 1,depth+1);
4569 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4570 if (flags&HASWIDTH)
4571 *flagp |= HASWIDTH;
830247a4 4572 c = *nextchar(pRExC_state);
a0ed51b3
LW
4573 }
4574 else
c277df42
IZ
4575 lastbr = NULL;
4576 if (c != ')')
8615cb43 4577 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4578 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4579 REGTAIL(pRExC_state, br, ender);
c277df42 4580 if (lastbr) {
3dab1dad
YO
4581 REGTAIL(pRExC_state, lastbr, ender);
4582 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4583 }
4584 else
3dab1dad 4585 REGTAIL(pRExC_state, ret, ender);
c277df42 4586 return ret;
a0ed51b3
LW
4587 }
4588 else {
830247a4 4589 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
4590 }
4591 }
1b1626e4 4592 case 0:
830247a4 4593 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 4594 vFAIL("Sequence (? incomplete");
1b1626e4 4595 break;
a0d0e21e 4596 default:
830247a4 4597 --RExC_parse;
fac92740 4598 parse_flags: /* (?i) */
830247a4 4599 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
4600 /* (?g), (?gc) and (?o) are useless here
4601 and must be globally applied -- japhy */
4602
4603 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
4604 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 4605 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
4606 if (! (wastedflags & wflagbit) ) {
4607 wastedflags |= wflagbit;
4608 vWARN5(
4609 RExC_parse + 1,
4610 "Useless (%s%c) - %suse /%c modifier",
4611 flagsp == &negflags ? "?-" : "?",
4612 *RExC_parse,
4613 flagsp == &negflags ? "don't " : "",
4614 *RExC_parse
4615 );
4616 }
4617 }
4618 }
4619 else if (*RExC_parse == 'c') {
4620 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
4621 if (! (wastedflags & WASTED_C) ) {
4622 wastedflags |= WASTED_GC;
9d1d55b5
JP
4623 vWARN3(
4624 RExC_parse + 1,
4625 "Useless (%sc) - %suse /gc modifier",
4626 flagsp == &negflags ? "?-" : "?",
4627 flagsp == &negflags ? "don't " : ""
4628 );
4629 }
4630 }
4631 }
4632 else { pmflag(flagsp, *RExC_parse); }
4633
830247a4 4634 ++RExC_parse;
ca9dfc88 4635 }
830247a4 4636 if (*RExC_parse == '-') {
ca9dfc88 4637 flagsp = &negflags;
9d1d55b5 4638 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 4639 ++RExC_parse;
ca9dfc88 4640 goto parse_flags;
48c036b1 4641 }
e2509266
JH
4642 RExC_flags |= posflags;
4643 RExC_flags &= ~negflags;
830247a4
IZ
4644 if (*RExC_parse == ':') {
4645 RExC_parse++;
ca9dfc88
IZ
4646 paren = ':';
4647 break;
4648 }
c277df42 4649 unknown:
830247a4
IZ
4650 if (*RExC_parse != ')') {
4651 RExC_parse++;
4652 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 4653 }
830247a4 4654 nextchar(pRExC_state);
a0d0e21e
LW
4655 *flagp = TRYAGAIN;
4656 return NULL;
4657 }
4658 }
fac92740 4659 else { /* (...) */
830247a4
IZ
4660 parno = RExC_npar;
4661 RExC_npar++;
4662 ret = reganode(pRExC_state, OPEN, parno);
6bda09f9 4663 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
226de585
RGS
4664 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
4665 "Setting paren #%"IVdf" to %d\n", parno, REG_NODE_NUM(ret)));
6bda09f9 4666 RExC_parens[parno-1]= ret;
226de585 4667
6bda09f9 4668 }
fac92740
MJD
4669 Set_Node_Length(ret, 1); /* MJD */
4670 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 4671 is_open = 1;
a0d0e21e 4672 }
a0ed51b3 4673 }
fac92740 4674 else /* ! paren */
a0d0e21e
LW
4675 ret = NULL;
4676
4677 /* Pick up the branches, linking them together. */
fac92740 4678 parse_start = RExC_parse; /* MJD */
3dab1dad 4679 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 4680 /* branch_len = (paren != 0); */
2af232bd 4681
a0d0e21e
LW
4682 if (br == NULL)
4683 return(NULL);
830247a4
IZ
4684 if (*RExC_parse == '|') {
4685 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 4686 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 4687 }
fac92740 4688 else { /* MJD */
6bda09f9 4689 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
4690 Set_Node_Length(br, paren != 0);
4691 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
4692 }
c277df42
IZ
4693 have_branch = 1;
4694 if (SIZE_ONLY)
830247a4 4695 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
4696 }
4697 else if (paren == ':') {
c277df42
IZ
4698 *flagp |= flags&SIMPLE;
4699 }
6136c704 4700 if (is_open) { /* Starts with OPEN. */
3dab1dad 4701 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
4702 }
4703 else if (paren != '?') /* Not Conditional */
a0d0e21e 4704 ret = br;
32a0ca98 4705 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 4706 lastbr = br;
830247a4
IZ
4707 while (*RExC_parse == '|') {
4708 if (!SIZE_ONLY && RExC_extralen) {
4709 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 4710 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
4711 }
4712 if (SIZE_ONLY)
830247a4
IZ
4713 RExC_extralen += 2; /* Account for LONGJMP. */
4714 nextchar(pRExC_state);
3dab1dad 4715 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 4716
a687059c 4717 if (br == NULL)
a0d0e21e 4718 return(NULL);
3dab1dad 4719 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 4720 lastbr = br;
821b33a5
IZ
4721 if (flags&HASWIDTH)
4722 *flagp |= HASWIDTH;
a687059c 4723 *flagp |= flags&SPSTART;
a0d0e21e
LW
4724 }
4725
c277df42
IZ
4726 if (have_branch || paren != ':') {
4727 /* Make a closing node, and hook it on the end. */
4728 switch (paren) {
4729 case ':':
830247a4 4730 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
4731 break;
4732 case 1:
830247a4 4733 ender = reganode(pRExC_state, CLOSE, parno);
fac92740
MJD
4734 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
4735 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
4736 break;
4737 case '<':
c277df42
IZ
4738 case ',':
4739 case '=':
4740 case '!':
c277df42 4741 *flagp &= ~HASWIDTH;
821b33a5
IZ
4742 /* FALL THROUGH */
4743 case '>':
830247a4 4744 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
4745 break;
4746 case 0:
830247a4 4747 ender = reg_node(pRExC_state, END);
c277df42
IZ
4748 break;
4749 }
eaf3ca90 4750 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 4751
9674d46a 4752 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
4753 if (depth==1)
4754 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4755
c277df42 4756 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
4757 for (br = ret; br; br = regnext(br)) {
4758 const U8 op = PL_regkind[OP(br)];
4759 if (op == BRANCH) {
07be1b83 4760 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
4761 }
4762 else if (op == BRANCHJ) {
07be1b83 4763 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 4764 }
c277df42
IZ
4765 }
4766 }
a0d0e21e 4767 }
c277df42
IZ
4768
4769 {
e1ec3a88
AL
4770 const char *p;
4771 static const char parens[] = "=!<,>";
c277df42
IZ
4772
4773 if (paren && (p = strchr(parens, paren))) {
eb160463 4774 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
4775 int flag = (p - parens) > 1;
4776
4777 if (paren == '>')
4778 node = SUSPEND, flag = 0;
6bda09f9 4779 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
4780 Set_Node_Cur_Length(ret);
4781 Set_Node_Offset(ret, parse_start + 1);
c277df42 4782 ret->flags = flag;
07be1b83 4783 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 4784 }
a0d0e21e
LW
4785 }
4786
4787 /* Check for proper termination. */
ce3e6498 4788 if (paren) {
e2509266 4789 RExC_flags = oregflags;
830247a4
IZ
4790 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
4791 RExC_parse = oregcomp_parse;
380a0633 4792 vFAIL("Unmatched (");
ce3e6498 4793 }
a0ed51b3 4794 }
830247a4
IZ
4795 else if (!paren && RExC_parse < RExC_end) {
4796 if (*RExC_parse == ')') {
4797 RExC_parse++;
380a0633 4798 vFAIL("Unmatched )");
a0ed51b3
LW
4799 }
4800 else
b45f050a 4801 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
4802 /* NOTREACHED */
4803 }
a687059c 4804
a0d0e21e 4805 return(ret);
a687059c
LW
4806}
4807
4808/*
4809 - regbranch - one alternative of an | operator
4810 *
4811 * Implements the concatenation operator.
4812 */
76e3520e 4813STATIC regnode *
3dab1dad 4814S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 4815{
97aff369 4816 dVAR;
c277df42
IZ
4817 register regnode *ret;
4818 register regnode *chain = NULL;
4819 register regnode *latest;
4820 I32 flags = 0, c = 0;
3dab1dad
YO
4821 GET_RE_DEBUG_FLAGS_DECL;
4822 DEBUG_PARSE("brnc");
b81d288d 4823 if (first)
c277df42
IZ
4824 ret = NULL;
4825 else {
b81d288d 4826 if (!SIZE_ONLY && RExC_extralen)
830247a4 4827 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 4828 else {
830247a4 4829 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
4830 Set_Node_Length(ret, 1);
4831 }
c277df42
IZ
4832 }
4833
b81d288d 4834 if (!first && SIZE_ONLY)
830247a4 4835 RExC_extralen += 1; /* BRANCHJ */
b81d288d 4836
c277df42 4837 *flagp = WORST; /* Tentatively. */
a0d0e21e 4838
830247a4
IZ
4839 RExC_parse--;
4840 nextchar(pRExC_state);
4841 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 4842 flags &= ~TRYAGAIN;
3dab1dad 4843 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4844 if (latest == NULL) {
4845 if (flags & TRYAGAIN)
4846 continue;
4847 return(NULL);
a0ed51b3
LW
4848 }
4849 else if (ret == NULL)
c277df42 4850 ret = latest;
a0d0e21e 4851 *flagp |= flags&HASWIDTH;
c277df42 4852 if (chain == NULL) /* First piece. */
a0d0e21e
LW
4853 *flagp |= flags&SPSTART;
4854 else {
830247a4 4855 RExC_naughty++;
3dab1dad 4856 REGTAIL(pRExC_state, chain, latest);
a687059c 4857 }
a0d0e21e 4858 chain = latest;
c277df42
IZ
4859 c++;
4860 }
4861 if (chain == NULL) { /* Loop ran zero times. */
830247a4 4862 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
4863 if (ret == NULL)
4864 ret = chain;
4865 }
4866 if (c == 1) {
4867 *flagp |= flags&SIMPLE;
a0d0e21e 4868 }
a687059c 4869
d4c19fe8 4870 return ret;
a687059c
LW
4871}
4872
4873/*
4874 - regpiece - something followed by possible [*+?]
4875 *
4876 * Note that the branching code sequences used for ? and the general cases
4877 * of * and + are somewhat optimized: they use the same NOTHING node as
4878 * both the endmarker for their branch list and the body of the last branch.
4879 * It might seem that this node could be dispensed with entirely, but the
4880 * endmarker role is not redundant.
4881 */
76e3520e 4882STATIC regnode *
3dab1dad 4883S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 4884{
97aff369 4885 dVAR;
c277df42 4886 register regnode *ret;
a0d0e21e
LW
4887 register char op;
4888 register char *next;
4889 I32 flags;
1df70142 4890 const char * const origparse = RExC_parse;
a0d0e21e 4891 I32 min;
c277df42 4892 I32 max = REG_INFTY;
fac92740 4893 char *parse_start;
10edeb5d 4894 const char *maxpos = NULL;
3dab1dad
YO
4895 GET_RE_DEBUG_FLAGS_DECL;
4896 DEBUG_PARSE("piec");
a0d0e21e 4897
3dab1dad 4898 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
4899 if (ret == NULL) {
4900 if (flags & TRYAGAIN)
4901 *flagp |= TRYAGAIN;
4902 return(NULL);
4903 }
4904
830247a4 4905 op = *RExC_parse;
a0d0e21e 4906
830247a4 4907 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 4908 maxpos = NULL;
fac92740 4909 parse_start = RExC_parse; /* MJD */
830247a4 4910 next = RExC_parse + 1;
a0d0e21e
LW
4911 while (isDIGIT(*next) || *next == ',') {
4912 if (*next == ',') {
4913 if (maxpos)
4914 break;
4915 else
4916 maxpos = next;
a687059c 4917 }
a0d0e21e
LW
4918 next++;
4919 }
4920 if (*next == '}') { /* got one */
4921 if (!maxpos)
4922 maxpos = next;
830247a4
IZ
4923 RExC_parse++;
4924 min = atoi(RExC_parse);
a0d0e21e
LW
4925 if (*maxpos == ',')
4926 maxpos++;
4927 else
830247a4 4928 maxpos = RExC_parse;
a0d0e21e
LW
4929 max = atoi(maxpos);
4930 if (!max && *maxpos != '0')
c277df42
IZ
4931 max = REG_INFTY; /* meaning "infinity" */
4932 else if (max >= REG_INFTY)
8615cb43 4933 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
4934 RExC_parse = next;
4935 nextchar(pRExC_state);
a0d0e21e
LW
4936
4937 do_curly:
4938 if ((flags&SIMPLE)) {
830247a4 4939 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 4940 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
4941 Set_Node_Offset(ret, parse_start+1); /* MJD */
4942 Set_Node_Cur_Length(ret);
a0d0e21e
LW
4943 }
4944 else {
3dab1dad 4945 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
4946
4947 w->flags = 0;
3dab1dad 4948 REGTAIL(pRExC_state, ret, w);
830247a4 4949 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
4950 reginsert(pRExC_state, LONGJMP,ret, depth+1);
4951 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
4952 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
4953 }
6bda09f9 4954 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
4955 /* MJD hk */
4956 Set_Node_Offset(ret, parse_start+1);
2af232bd 4957 Set_Node_Length(ret,
fac92740 4958 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 4959
830247a4 4960 if (!SIZE_ONLY && RExC_extralen)
c277df42 4961 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 4962 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 4963 if (SIZE_ONLY)
830247a4
IZ
4964 RExC_whilem_seen++, RExC_extralen += 3;
4965 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 4966 }
c277df42 4967 ret->flags = 0;
a0d0e21e
LW
4968
4969 if (min > 0)
821b33a5
IZ
4970 *flagp = WORST;
4971 if (max > 0)
4972 *flagp |= HASWIDTH;
a0d0e21e 4973 if (max && max < min)
8615cb43 4974 vFAIL("Can't do {n,m} with n > m");
c277df42 4975 if (!SIZE_ONLY) {
eb160463
GS
4976 ARG1_SET(ret, (U16)min);
4977 ARG2_SET(ret, (U16)max);
a687059c 4978 }
a687059c 4979
a0d0e21e 4980 goto nest_check;
a687059c 4981 }
a0d0e21e 4982 }
a687059c 4983
a0d0e21e
LW
4984 if (!ISMULT1(op)) {
4985 *flagp = flags;
a687059c 4986 return(ret);
a0d0e21e 4987 }
6bda09f9
YO
4988 /* else if (OP(ret)==RECURSE) {
4989 RExC_parse++;
4990 vFAIL("Illegal quantifier on recursion group");
4991 } */
bb20fd44 4992
c277df42 4993#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
4994
4995 /* if this is reinstated, don't forget to put this back into perldiag:
4996
4997 =item Regexp *+ operand could be empty at {#} in regex m/%s/
4998
4999 (F) The part of the regexp subject to either the * or + quantifier
5000 could match an empty string. The {#} shows in the regular
5001 expression about where the problem was discovered.
5002
5003 */
5004
bb20fd44 5005 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5006 vFAIL("Regexp *+ operand could be empty");
b81d288d 5007#endif
bb20fd44 5008
fac92740 5009 parse_start = RExC_parse;
830247a4 5010 nextchar(pRExC_state);
a0d0e21e 5011
821b33a5 5012 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5013
5014 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5015 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5016 ret->flags = 0;
830247a4 5017 RExC_naughty += 4;
a0d0e21e
LW
5018 }
5019 else if (op == '*') {
5020 min = 0;
5021 goto do_curly;
a0ed51b3
LW
5022 }
5023 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5024 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5025 ret->flags = 0;
830247a4 5026 RExC_naughty += 3;
a0d0e21e
LW
5027 }
5028 else if (op == '+') {
5029 min = 1;
5030 goto do_curly;
a0ed51b3
LW
5031 }
5032 else if (op == '?') {
a0d0e21e
LW
5033 min = 0; max = 1;
5034 goto do_curly;
5035 }
5036 nest_check:
041457d9 5037 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5038 vWARN3(RExC_parse,
b45f050a 5039 "%.*s matches null string many times",
afd78fd5 5040 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5041 origparse);
a0d0e21e
LW
5042 }
5043
830247a4
IZ
5044 if (*RExC_parse == '?') {
5045 nextchar(pRExC_state);
6bda09f9 5046 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5047 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5048 }
830247a4
IZ
5049 if (ISMULT2(RExC_parse)) {
5050 RExC_parse++;
b45f050a
JF
5051 vFAIL("Nested quantifiers");
5052 }
a0d0e21e
LW
5053
5054 return(ret);
a687059c
LW
5055}
5056
fc8cd66c
YO
5057
5058/* reg_namedseq(pRExC_state,UVp)
5059
5060 This is expected to be called by a parser routine that has
5061 recognized'\N' and needs to handle the rest. RExC_parse is
5062 expected to point at the first char following the N at the time
5063 of the call.
5064
5065 If valuep is non-null then it is assumed that we are parsing inside
5066 of a charclass definition and the first codepoint in the resolved
5067 string is returned via *valuep and the routine will return NULL.
5068 In this mode if a multichar string is returned from the charnames
5069 handler a warning will be issued, and only the first char in the
5070 sequence will be examined. If the string returned is zero length
5071 then the value of *valuep is undefined and NON-NULL will
5072 be returned to indicate failure. (This will NOT be a valid pointer
5073 to a regnode.)
5074
5075 If value is null then it is assumed that we are parsing normal text
5076 and inserts a new EXACT node into the program containing the resolved
5077 string and returns a pointer to the new node. If the string is
5078 zerolength a NOTHING node is emitted.
5079
5080 On success RExC_parse is set to the char following the endbrace.
5081 Parsing failures will generate a fatal errorvia vFAIL(...)
5082
5083 NOTE: We cache all results from the charnames handler locally in
5084 the RExC_charnames hash (created on first use) to prevent a charnames
5085 handler from playing silly-buggers and returning a short string and
5086 then a long string for a given pattern. Since the regexp program
5087 size is calculated during an initial parse this would result
5088 in a buffer overrun so we cache to prevent the charname result from
5089 changing during the course of the parse.
5090
5091 */
5092STATIC regnode *
5093S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5094{
5095 char * name; /* start of the content of the name */
5096 char * endbrace; /* endbrace following the name */
5097 SV *sv_str = NULL;
5098 SV *sv_name = NULL;
5099 STRLEN len; /* this has various purposes throughout the code */
5100 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5101 regnode *ret = NULL;
5102
5103 if (*RExC_parse != '{') {
5104 vFAIL("Missing braces on \\N{}");
5105 }
5106 name = RExC_parse+1;
5107 endbrace = strchr(RExC_parse, '}');
5108 if ( ! endbrace ) {
5109 RExC_parse++;
5110 vFAIL("Missing right brace on \\N{}");
5111 }
5112 RExC_parse = endbrace + 1;
5113
5114
5115 /* RExC_parse points at the beginning brace,
5116 endbrace points at the last */
5117 if ( name[0]=='U' && name[1]=='+' ) {
5118 /* its a "unicode hex" notation {U+89AB} */
5119 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5120 | PERL_SCAN_DISALLOW_PREFIX
5121 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5122 UV cp;
196f1508 5123 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5124 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5125 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5126 cp = 0xFFFD;
5127 }
5128 if (cp > 0xff)
5129 RExC_utf8 = 1;
5130 if ( valuep ) {
5131 *valuep = cp;
5132 return NULL;
5133 }
5134 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5135 } else {
5136 /* fetch the charnames handler for this scope */
5137 HV * const table = GvHV(PL_hintgv);
5138 SV **cvp= table ?
5139 hv_fetchs(table, "charnames", FALSE) :
5140 NULL;
5141 SV *cv= cvp ? *cvp : NULL;
5142 HE *he_str;
5143 int count;
5144 /* create an SV with the name as argument */
5145 sv_name = newSVpvn(name, endbrace - name);
5146
5147 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5148 vFAIL2("Constant(\\N{%s}) unknown: "
5149 "(possibly a missing \"use charnames ...\")",
5150 SvPVX(sv_name));
5151 }
5152 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5153 vFAIL2("Constant(\\N{%s}): "
5154 "$^H{charnames} is not defined",SvPVX(sv_name));
5155 }
5156
5157
5158
5159 if (!RExC_charnames) {
5160 /* make sure our cache is allocated */
5161 RExC_charnames = newHV();
6bda09f9 5162 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5163 }
5164 /* see if we have looked this one up before */
5165 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5166 if ( he_str ) {
5167 sv_str = HeVAL(he_str);
5168 cached = 1;
5169 } else {
5170 dSP ;
5171
5172 ENTER ;
5173 SAVETMPS ;
5174 PUSHMARK(SP) ;
5175
5176 XPUSHs(sv_name);
5177
5178 PUTBACK ;
5179
5180 count= call_sv(cv, G_SCALAR);
5181
5182 if (count == 1) { /* XXXX is this right? dmq */
5183 sv_str = POPs;
5184 SvREFCNT_inc_simple_void(sv_str);
5185 }
5186
5187 SPAGAIN ;
5188 PUTBACK ;
5189 FREETMPS ;
5190 LEAVE ;
5191
5192 if ( !sv_str || !SvOK(sv_str) ) {
5193 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5194 "did not return a defined value",SvPVX(sv_name));
5195 }
5196 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5197 cached = 1;
5198 }
5199 }
5200 if (valuep) {
5201 char *p = SvPV(sv_str, len);
5202 if (len) {
5203 STRLEN numlen = 1;
5204 if ( SvUTF8(sv_str) ) {
196f1508 5205 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5206 if (*valuep > 0x7F)
5207 RExC_utf8 = 1;
5208 /* XXXX
5209 We have to turn on utf8 for high bit chars otherwise
5210 we get failures with
5211
5212 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5213 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5214
5215 This is different from what \x{} would do with the same
5216 codepoint, where the condition is > 0xFF.
5217 - dmq
5218 */
5219
5220
5221 } else {
5222 *valuep = (UV)*p;
5223 /* warn if we havent used the whole string? */
5224 }
5225 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5226 vWARN2(RExC_parse,
5227 "Ignoring excess chars from \\N{%s} in character class",
5228 SvPVX(sv_name)
5229 );
5230 }
5231 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5232 vWARN2(RExC_parse,
5233 "Ignoring zero length \\N{%s} in character class",
5234 SvPVX(sv_name)
5235 );
5236 }
5237 if (sv_name)
5238 SvREFCNT_dec(sv_name);
5239 if (!cached)
5240 SvREFCNT_dec(sv_str);
5241 return len ? NULL : (regnode *)&len;
5242 } else if(SvCUR(sv_str)) {
5243
5244 char *s;
5245 char *p, *pend;
5246 STRLEN charlen = 1;
5247 char * parse_start = name-3; /* needed for the offsets */
5248 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5249
5250 ret = reg_node(pRExC_state,
5251 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5252 s= STRING(ret);
5253
5254 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5255 sv_utf8_upgrade(sv_str);
5256 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5257 RExC_utf8= 1;
5258 }
5259
5260 p = SvPV(sv_str, len);
5261 pend = p + len;
5262 /* len is the length written, charlen is the size the char read */
5263 for ( len = 0; p < pend; p += charlen ) {
5264 if (UTF) {
196f1508 5265 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5266 if (FOLD) {
5267 STRLEN foldlen,numlen;
5268 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5269 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5270 /* Emit all the Unicode characters. */
5271
5272 for (foldbuf = tmpbuf;
5273 foldlen;
5274 foldlen -= numlen)
5275 {
5276 uvc = utf8_to_uvchr(foldbuf, &numlen);
5277 if (numlen > 0) {
5278 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5279 s += unilen;
5280 len += unilen;
5281 /* In EBCDIC the numlen
5282 * and unilen can differ. */
5283 foldbuf += numlen;
5284 if (numlen >= foldlen)
5285 break;
5286 }
5287 else
5288 break; /* "Can't happen." */
5289 }
5290 } else {
5291 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5292 if (unilen > 0) {
5293 s += unilen;
5294 len += unilen;
5295 }
5296 }
5297 } else {
5298 len++;
5299 REGC(*p, s++);
5300 }
5301 }
5302 if (SIZE_ONLY) {
5303 RExC_size += STR_SZ(len);
5304 } else {
5305 STR_LEN(ret) = len;
5306 RExC_emit += STR_SZ(len);
5307 }
5308 Set_Node_Cur_Length(ret); /* MJD */
5309 RExC_parse--;
5310 nextchar(pRExC_state);
5311 } else {
5312 ret = reg_node(pRExC_state,NOTHING);
5313 }
5314 if (!cached) {
5315 SvREFCNT_dec(sv_str);
5316 }
5317 if (sv_name) {
5318 SvREFCNT_dec(sv_name);
5319 }
5320 return ret;
5321
5322}
5323
5324
5325
a687059c
LW
5326/*
5327 - regatom - the lowest level
5328 *
5329 * Optimization: gobbles an entire sequence of ordinary characters so that
5330 * it can turn them into a single node, which is smaller to store and
5331 * faster to run. Backslashed characters are exceptions, each becoming a
5332 * separate node; the code is simpler that way and it's not worth fixing.
5333 *
7f6f358c
YO
5334 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5335 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5336 */
76e3520e 5337STATIC regnode *
3dab1dad 5338S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5339{
97aff369 5340 dVAR;
cbbf8932 5341 register regnode *ret = NULL;
a0d0e21e 5342 I32 flags;
45948336 5343 char *parse_start = RExC_parse;
3dab1dad
YO
5344 GET_RE_DEBUG_FLAGS_DECL;
5345 DEBUG_PARSE("atom");
a0d0e21e
LW
5346 *flagp = WORST; /* Tentatively. */
5347
5348tryagain:
830247a4 5349 switch (*RExC_parse) {
a0d0e21e 5350 case '^':
830247a4
IZ
5351 RExC_seen_zerolen++;
5352 nextchar(pRExC_state);
e2509266 5353 if (RExC_flags & PMf_MULTILINE)
830247a4 5354 ret = reg_node(pRExC_state, MBOL);
e2509266 5355 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5356 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5357 else
830247a4 5358 ret = reg_node(pRExC_state, BOL);
fac92740 5359 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5360 break;
5361 case '$':
830247a4 5362 nextchar(pRExC_state);
b81d288d 5363 if (*RExC_parse)
830247a4 5364 RExC_seen_zerolen++;
e2509266 5365 if (RExC_flags & PMf_MULTILINE)
830247a4 5366 ret = reg_node(pRExC_state, MEOL);
e2509266 5367 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5368 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5369 else
830247a4 5370 ret = reg_node(pRExC_state, EOL);
fac92740 5371 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5372 break;
5373 case '.':
830247a4 5374 nextchar(pRExC_state);
e2509266 5375 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5376 ret = reg_node(pRExC_state, SANY);
5377 else
5378 ret = reg_node(pRExC_state, REG_ANY);
5379 *flagp |= HASWIDTH|SIMPLE;
830247a4 5380 RExC_naughty++;
fac92740 5381 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5382 break;
5383 case '[':
b45f050a 5384 {
3dab1dad
YO
5385 char * const oregcomp_parse = ++RExC_parse;
5386 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5387 if (*RExC_parse != ']') {
5388 RExC_parse = oregcomp_parse;
b45f050a
JF
5389 vFAIL("Unmatched [");
5390 }
830247a4 5391 nextchar(pRExC_state);
a0d0e21e 5392 *flagp |= HASWIDTH|SIMPLE;
fac92740 5393 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5394 break;
b45f050a 5395 }
a0d0e21e 5396 case '(':
830247a4 5397 nextchar(pRExC_state);
3dab1dad 5398 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5399 if (ret == NULL) {
bf93d4cc 5400 if (flags & TRYAGAIN) {
830247a4 5401 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5402 /* Make parent create an empty node if needed. */
5403 *flagp |= TRYAGAIN;
5404 return(NULL);
5405 }
a0d0e21e 5406 goto tryagain;
bf93d4cc 5407 }
a0d0e21e
LW
5408 return(NULL);
5409 }
c277df42 5410 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5411 break;
5412 case '|':
5413 case ')':
5414 if (flags & TRYAGAIN) {
5415 *flagp |= TRYAGAIN;
5416 return NULL;
5417 }
b45f050a 5418 vFAIL("Internal urp");
a0d0e21e
LW
5419 /* Supposed to be caught earlier. */
5420 break;
85afd4ae 5421 case '{':
830247a4
IZ
5422 if (!regcurly(RExC_parse)) {
5423 RExC_parse++;
85afd4ae
CS
5424 goto defchar;
5425 }
5426 /* FALL THROUGH */
a0d0e21e
LW
5427 case '?':
5428 case '+':
5429 case '*':
830247a4 5430 RExC_parse++;
b45f050a 5431 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5432 break;
5433 case '\\':
830247a4 5434 switch (*++RExC_parse) {
a0d0e21e 5435 case 'A':
830247a4
IZ
5436 RExC_seen_zerolen++;
5437 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5438 *flagp |= SIMPLE;
830247a4 5439 nextchar(pRExC_state);
fac92740 5440 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5441 break;
5442 case 'G':
830247a4
IZ
5443 ret = reg_node(pRExC_state, GPOS);
5444 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5445 *flagp |= SIMPLE;
830247a4 5446 nextchar(pRExC_state);
fac92740 5447 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5448 break;
5449 case 'Z':
830247a4 5450 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5451 *flagp |= SIMPLE;
a1917ab9 5452 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5453 nextchar(pRExC_state);
a0d0e21e 5454 break;
b85d18e9 5455 case 'z':
830247a4 5456 ret = reg_node(pRExC_state, EOS);
b85d18e9 5457 *flagp |= SIMPLE;
830247a4
IZ
5458 RExC_seen_zerolen++; /* Do not optimize RE away */
5459 nextchar(pRExC_state);
fac92740 5460 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5461 break;
4a2d328f 5462 case 'C':
f33976b4
DB
5463 ret = reg_node(pRExC_state, CANY);
5464 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5465 *flagp |= HASWIDTH|SIMPLE;
830247a4 5466 nextchar(pRExC_state);
fac92740 5467 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5468 break;
5469 case 'X':
830247a4 5470 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5471 *flagp |= HASWIDTH;
830247a4 5472 nextchar(pRExC_state);
fac92740 5473 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5474 break;
a0d0e21e 5475 case 'w':
eb160463 5476 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5477 *flagp |= HASWIDTH|SIMPLE;
830247a4 5478 nextchar(pRExC_state);
fac92740 5479 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5480 break;
5481 case 'W':
eb160463 5482 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5483 *flagp |= HASWIDTH|SIMPLE;
830247a4 5484 nextchar(pRExC_state);
fac92740 5485 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5486 break;
5487 case 'b':
830247a4
IZ
5488 RExC_seen_zerolen++;
5489 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5490 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5491 *flagp |= SIMPLE;
830247a4 5492 nextchar(pRExC_state);
fac92740 5493 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5494 break;
5495 case 'B':
830247a4
IZ
5496 RExC_seen_zerolen++;
5497 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5498 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5499 *flagp |= SIMPLE;
830247a4 5500 nextchar(pRExC_state);
fac92740 5501 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5502 break;
5503 case 's':
eb160463 5504 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5505 *flagp |= HASWIDTH|SIMPLE;
830247a4 5506 nextchar(pRExC_state);
fac92740 5507 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5508 break;
5509 case 'S':
eb160463 5510 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5511 *flagp |= HASWIDTH|SIMPLE;
830247a4 5512 nextchar(pRExC_state);
fac92740 5513 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5514 break;
5515 case 'd':
ffc61ed2 5516 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5517 *flagp |= HASWIDTH|SIMPLE;
830247a4 5518 nextchar(pRExC_state);
fac92740 5519 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5520 break;
5521 case 'D':
ffc61ed2 5522 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5523 *flagp |= HASWIDTH|SIMPLE;
830247a4 5524 nextchar(pRExC_state);
fac92740 5525 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5526 break;
a14b48bc
LW
5527 case 'p':
5528 case 'P':
3568d838 5529 {
3dab1dad 5530 char* const oldregxend = RExC_end;
ccb2c380 5531 char* parse_start = RExC_parse - 2;
a14b48bc 5532
830247a4 5533 if (RExC_parse[1] == '{') {
3568d838 5534 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5535 RExC_end = strchr(RExC_parse, '}');
5536 if (!RExC_end) {
3dab1dad 5537 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5538 RExC_parse += 2;
5539 RExC_end = oldregxend;
0da60cf5 5540 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5541 }
830247a4 5542 RExC_end++;
a14b48bc 5543 }
af6f566e 5544 else {
830247a4 5545 RExC_end = RExC_parse + 2;
af6f566e
HS
5546 if (RExC_end > oldregxend)
5547 RExC_end = oldregxend;
5548 }
830247a4 5549 RExC_parse--;
a14b48bc 5550
3dab1dad 5551 ret = regclass(pRExC_state,depth+1);
a14b48bc 5552
830247a4
IZ
5553 RExC_end = oldregxend;
5554 RExC_parse--;
ccb2c380
MP
5555
5556 Set_Node_Offset(ret, parse_start + 2);
5557 Set_Node_Cur_Length(ret);
830247a4 5558 nextchar(pRExC_state);
a14b48bc
LW
5559 *flagp |= HASWIDTH|SIMPLE;
5560 }
5561 break;
fc8cd66c
YO
5562 case 'N':
5563 /* Handle \N{NAME} here and not below because it can be
5564 multicharacter. join_exact() will join them up later on.
5565 Also this makes sure that things like /\N{BLAH}+/ and
5566 \N{BLAH} being multi char Just Happen. dmq*/
5567 ++RExC_parse;
5568 ret= reg_namedseq(pRExC_state, NULL);
5569 break;
a0d0e21e
LW
5570 case 'n':
5571 case 'r':
5572 case 't':
5573 case 'f':
5574 case 'e':
5575 case 'a':
5576 case 'x':
5577 case 'c':
5578 case '0':
5579 goto defchar;
5580 case '1': case '2': case '3': case '4':
5581 case '5': case '6': case '7': case '8': case '9':
5582 {
1df70142 5583 const I32 num = atoi(RExC_parse);
a0d0e21e 5584
830247a4 5585 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
5586 goto defchar;
5587 else {
3dab1dad 5588 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
5589 while (isDIGIT(*RExC_parse))
5590 RExC_parse++;
b45f050a 5591
eb160463 5592 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 5593 vFAIL("Reference to nonexistent group");
830247a4 5594 RExC_sawback = 1;
eb160463
GS
5595 ret = reganode(pRExC_state,
5596 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
5597 num);
a0d0e21e 5598 *flagp |= HASWIDTH;
2af232bd 5599
fac92740 5600 /* override incorrect value set in reganode MJD */
2af232bd 5601 Set_Node_Offset(ret, parse_start+1);
fac92740 5602 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
5603 RExC_parse--;
5604 nextchar(pRExC_state);
a0d0e21e
LW
5605 }
5606 }
5607 break;
5608 case '\0':
830247a4 5609 if (RExC_parse >= RExC_end)
b45f050a 5610 FAIL("Trailing \\");
a0d0e21e
LW
5611 /* FALL THROUGH */
5612 default:
a0288114 5613 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 5614 back into the quick-grab loop below */
45948336 5615 parse_start--;
a0d0e21e
LW
5616 goto defchar;
5617 }
5618 break;
4633a7c4
LW
5619
5620 case '#':
e2509266 5621 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
5622 while (RExC_parse < RExC_end && *RExC_parse != '\n')
5623 RExC_parse++;
830247a4 5624 if (RExC_parse < RExC_end)
4633a7c4
LW
5625 goto tryagain;
5626 }
5627 /* FALL THROUGH */
5628
a0d0e21e 5629 default: {
ba210ebe 5630 register STRLEN len;
58ae7d3f 5631 register UV ender;
a0d0e21e 5632 register char *p;
3dab1dad 5633 char *s;
80aecb99 5634 STRLEN foldlen;
89ebb4a3 5635 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
5636
5637 parse_start = RExC_parse - 1;
a0d0e21e 5638
830247a4 5639 RExC_parse++;
a0d0e21e
LW
5640
5641 defchar:
58ae7d3f 5642 ender = 0;
eb160463
GS
5643 ret = reg_node(pRExC_state,
5644 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 5645 s = STRING(ret);
830247a4
IZ
5646 for (len = 0, p = RExC_parse - 1;
5647 len < 127 && p < RExC_end;
a0d0e21e
LW
5648 len++)
5649 {
3dab1dad 5650 char * const oldp = p;
5b5a24f7 5651
e2509266 5652 if (RExC_flags & PMf_EXTENDED)
830247a4 5653 p = regwhite(p, RExC_end);
a0d0e21e
LW
5654 switch (*p) {
5655 case '^':
5656 case '$':
5657 case '.':
5658 case '[':
5659 case '(':
5660 case ')':
5661 case '|':
5662 goto loopdone;
5663 case '\\':
5664 switch (*++p) {
5665 case 'A':
1ed8eac0
JF
5666 case 'C':
5667 case 'X':
a0d0e21e
LW
5668 case 'G':
5669 case 'Z':
b85d18e9 5670 case 'z':
a0d0e21e
LW
5671 case 'w':
5672 case 'W':
5673 case 'b':
5674 case 'B':
5675 case 's':
5676 case 'S':
5677 case 'd':
5678 case 'D':
a14b48bc
LW
5679 case 'p':
5680 case 'P':
fc8cd66c 5681 case 'N':
a0d0e21e
LW
5682 --p;
5683 goto loopdone;
5684 case 'n':
5685 ender = '\n';
5686 p++;
a687059c 5687 break;
a0d0e21e
LW
5688 case 'r':
5689 ender = '\r';
5690 p++;
a687059c 5691 break;
a0d0e21e
LW
5692 case 't':
5693 ender = '\t';
5694 p++;
a687059c 5695 break;
a0d0e21e
LW
5696 case 'f':
5697 ender = '\f';
5698 p++;
a687059c 5699 break;
a0d0e21e 5700 case 'e':
c7f1f016 5701 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 5702 p++;
a687059c 5703 break;
a0d0e21e 5704 case 'a':
c7f1f016 5705 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 5706 p++;
a687059c 5707 break;
a0d0e21e 5708 case 'x':
a0ed51b3 5709 if (*++p == '{') {
1df70142 5710 char* const e = strchr(p, '}');
b81d288d 5711
b45f050a 5712 if (!e) {
830247a4 5713 RExC_parse = p + 1;
b45f050a
JF
5714 vFAIL("Missing right brace on \\x{}");
5715 }
de5f0749 5716 else {
a4c04bdc
NC
5717 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
5718 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 5719 STRLEN numlen = e - p - 1;
53305cf1 5720 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
5721 if (ender > 0xff)
5722 RExC_utf8 = 1;
a0ed51b3
LW
5723 p = e + 1;
5724 }
a0ed51b3
LW
5725 }
5726 else {
a4c04bdc 5727 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 5728 STRLEN numlen = 2;
53305cf1 5729 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
5730 p += numlen;
5731 }
a687059c 5732 break;
a0d0e21e
LW
5733 case 'c':
5734 p++;
bbce6d69 5735 ender = UCHARAT(p++);
5736 ender = toCTRL(ender);
a687059c 5737 break;
a0d0e21e
LW
5738 case '0': case '1': case '2': case '3':case '4':
5739 case '5': case '6': case '7': case '8':case '9':
5740 if (*p == '0' ||
830247a4 5741 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 5742 I32 flags = 0;
1df70142 5743 STRLEN numlen = 3;
53305cf1 5744 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
5745 p += numlen;
5746 }
5747 else {
5748 --p;
5749 goto loopdone;
a687059c
LW
5750 }
5751 break;
a0d0e21e 5752 case '\0':
830247a4 5753 if (p >= RExC_end)
b45f050a 5754 FAIL("Trailing \\");
a687059c 5755 /* FALL THROUGH */
a0d0e21e 5756 default:
041457d9 5757 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 5758 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 5759 goto normal_default;
a0d0e21e
LW
5760 }
5761 break;
a687059c 5762 default:
a0ed51b3 5763 normal_default:
fd400ab9 5764 if (UTF8_IS_START(*p) && UTF) {
1df70142 5765 STRLEN numlen;
5e12f4fb 5766 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 5767 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
5768 p += numlen;
5769 }
5770 else
5771 ender = *p++;
a0d0e21e 5772 break;
a687059c 5773 }
e2509266 5774 if (RExC_flags & PMf_EXTENDED)
830247a4 5775 p = regwhite(p, RExC_end);
60a8b682
JH
5776 if (UTF && FOLD) {
5777 /* Prime the casefolded buffer. */
ac7e0132 5778 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 5779 }
a0d0e21e
LW
5780 if (ISMULT2(p)) { /* Back off on ?+*. */
5781 if (len)
5782 p = oldp;
16ea2a2e 5783 else if (UTF) {
80aecb99 5784 if (FOLD) {
60a8b682 5785 /* Emit all the Unicode characters. */
1df70142 5786 STRLEN numlen;
80aecb99
JH
5787 for (foldbuf = tmpbuf;
5788 foldlen;
5789 foldlen -= numlen) {
5790 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5791 if (numlen > 0) {
71207a34 5792 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5793 s += unilen;
5794 len += unilen;
5795 /* In EBCDIC the numlen
5796 * and unilen can differ. */
9dc45d57 5797 foldbuf += numlen;
47654450
JH
5798 if (numlen >= foldlen)
5799 break;
9dc45d57
JH
5800 }
5801 else
5802 break; /* "Can't happen." */
80aecb99
JH
5803 }
5804 }
5805 else {
71207a34 5806 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5807 if (unilen > 0) {
0ebc6274
JH
5808 s += unilen;
5809 len += unilen;
9dc45d57 5810 }
80aecb99 5811 }
a0ed51b3 5812 }
a0d0e21e
LW
5813 else {
5814 len++;
eb160463 5815 REGC((char)ender, s++);
a0d0e21e
LW
5816 }
5817 break;
a687059c 5818 }
16ea2a2e 5819 if (UTF) {
80aecb99 5820 if (FOLD) {
60a8b682 5821 /* Emit all the Unicode characters. */
1df70142 5822 STRLEN numlen;
80aecb99
JH
5823 for (foldbuf = tmpbuf;
5824 foldlen;
5825 foldlen -= numlen) {
5826 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 5827 if (numlen > 0) {
71207a34 5828 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
5829 len += unilen;
5830 s += unilen;
5831 /* In EBCDIC the numlen
5832 * and unilen can differ. */
9dc45d57 5833 foldbuf += numlen;
47654450
JH
5834 if (numlen >= foldlen)
5835 break;
9dc45d57
JH
5836 }
5837 else
5838 break;
80aecb99
JH
5839 }
5840 }
5841 else {
71207a34 5842 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 5843 if (unilen > 0) {
0ebc6274
JH
5844 s += unilen;
5845 len += unilen;
9dc45d57 5846 }
80aecb99
JH
5847 }
5848 len--;
a0ed51b3
LW
5849 }
5850 else
eb160463 5851 REGC((char)ender, s++);
a0d0e21e
LW
5852 }
5853 loopdone:
830247a4 5854 RExC_parse = p - 1;
fac92740 5855 Set_Node_Cur_Length(ret); /* MJD */
830247a4 5856 nextchar(pRExC_state);
793db0cb
JH
5857 {
5858 /* len is STRLEN which is unsigned, need to copy to signed */
5859 IV iv = len;
5860 if (iv < 0)
5861 vFAIL("Internal disaster");
5862 }
a0d0e21e
LW
5863 if (len > 0)
5864 *flagp |= HASWIDTH;
090f7165 5865 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 5866 *flagp |= SIMPLE;
3dab1dad 5867
cd439c50 5868 if (SIZE_ONLY)
830247a4 5869 RExC_size += STR_SZ(len);
3dab1dad
YO
5870 else {
5871 STR_LEN(ret) = len;
830247a4 5872 RExC_emit += STR_SZ(len);
07be1b83 5873 }
3dab1dad 5874 }
a0d0e21e
LW
5875 break;
5876 }
a687059c 5877
60a8b682
JH
5878 /* If the encoding pragma is in effect recode the text of
5879 * any EXACT-kind nodes. */
fc8cd66c 5880 if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
3dab1dad
YO
5881 const STRLEN oldlen = STR_LEN(ret);
5882 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
5883
5884 if (RExC_utf8)
5885 SvUTF8_on(sv);
5886 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
5887 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
5888 const STRLEN newlen = SvCUR(sv);
d0063567
DK
5889
5890 if (SvUTF8(sv))
5891 RExC_utf8 = 1;
5892 if (!SIZE_ONLY) {
a3621e74
YO
5893 GET_RE_DEBUG_FLAGS_DECL;
5894 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
5895 (int)oldlen, STRING(ret),
5896 (int)newlen, s));
5897 Copy(s, STRING(ret), newlen, char);
5898 STR_LEN(ret) += newlen - oldlen;
5899 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
5900 } else
5901 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
5902 }
a72c7584
JH
5903 }
5904
a0d0e21e 5905 return(ret);
a687059c
LW
5906}
5907
873ef191 5908STATIC char *
5f66b61c 5909S_regwhite(char *p, const char *e)
5b5a24f7
CS
5910{
5911 while (p < e) {
5912 if (isSPACE(*p))
5913 ++p;
5914 else if (*p == '#') {
5915 do {
5916 p++;
5917 } while (p < e && *p != '\n');
5918 }
5919 else
5920 break;
5921 }
5922 return p;
5923}
5924
b8c5462f
JH
5925/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
5926 Character classes ([:foo:]) can also be negated ([:^foo:]).
5927 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
5928 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 5929 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
5930
5931#define POSIXCC_DONE(c) ((c) == ':')
5932#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
5933#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
5934
b8c5462f 5935STATIC I32
830247a4 5936S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 5937{
97aff369 5938 dVAR;
936ed897 5939 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 5940
830247a4 5941 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 5942 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 5943 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 5944 const char c = UCHARAT(RExC_parse);
097eb12c 5945 char* const s = RExC_parse++;
b81d288d 5946
9a86a77b 5947 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
5948 RExC_parse++;
5949 if (RExC_parse == RExC_end)
620e46c5 5950 /* Grandfather lone [:, [=, [. */
830247a4 5951 RExC_parse = s;
620e46c5 5952 else {
3dab1dad 5953 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
5954 assert(*t == c);
5955
9a86a77b 5956 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 5957 const char *posixcc = s + 1;
830247a4 5958 RExC_parse++; /* skip over the ending ] */
3dab1dad 5959
b8c5462f 5960 if (*s == ':') {
1df70142
AL
5961 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
5962 const I32 skip = t - posixcc;
80916619
NC
5963
5964 /* Initially switch on the length of the name. */
5965 switch (skip) {
5966 case 4:
3dab1dad
YO
5967 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
5968 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 5969 break;
80916619
NC
5970 case 5:
5971 /* Names all of length 5. */
5972 /* alnum alpha ascii blank cntrl digit graph lower
5973 print punct space upper */
5974 /* Offset 4 gives the best switch position. */
5975 switch (posixcc[4]) {
5976 case 'a':
3dab1dad
YO
5977 if (memEQ(posixcc, "alph", 4)) /* alpha */
5978 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
5979 break;
5980 case 'e':
3dab1dad
YO
5981 if (memEQ(posixcc, "spac", 4)) /* space */
5982 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
5983 break;
5984 case 'h':
3dab1dad
YO
5985 if (memEQ(posixcc, "grap", 4)) /* graph */
5986 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
5987 break;
5988 case 'i':
3dab1dad
YO
5989 if (memEQ(posixcc, "asci", 4)) /* ascii */
5990 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
5991 break;
5992 case 'k':
3dab1dad
YO
5993 if (memEQ(posixcc, "blan", 4)) /* blank */
5994 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
5995 break;
5996 case 'l':
3dab1dad
YO
5997 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
5998 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
5999 break;
6000 case 'm':
3dab1dad
YO
6001 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6002 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6003 break;
6004 case 'r':
3dab1dad
YO
6005 if (memEQ(posixcc, "lowe", 4)) /* lower */
6006 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6007 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6008 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6009 break;
6010 case 't':
3dab1dad
YO
6011 if (memEQ(posixcc, "digi", 4)) /* digit */
6012 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6013 else if (memEQ(posixcc, "prin", 4)) /* print */
6014 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6015 else if (memEQ(posixcc, "punc", 4)) /* punct */
6016 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6017 break;
b8c5462f
JH
6018 }
6019 break;
80916619 6020 case 6:
3dab1dad
YO
6021 if (memEQ(posixcc, "xdigit", 6))
6022 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6023 break;
6024 }
80916619
NC
6025
6026 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6027 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6028 t - s - 1, s + 1);
80916619
NC
6029 assert (posixcc[skip] == ':');
6030 assert (posixcc[skip+1] == ']');
b45f050a 6031 } else if (!SIZE_ONLY) {
b8c5462f 6032 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6033
830247a4 6034 /* adjust RExC_parse so the warning shows after
b45f050a 6035 the class closes */
9a86a77b 6036 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6037 RExC_parse++;
b45f050a
JF
6038 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6039 }
b8c5462f
JH
6040 } else {
6041 /* Maternal grandfather:
6042 * "[:" ending in ":" but not in ":]" */
830247a4 6043 RExC_parse = s;
767d463e 6044 }
620e46c5
JH
6045 }
6046 }
6047
b8c5462f
JH
6048 return namedclass;
6049}
6050
6051STATIC void
830247a4 6052S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6053{
97aff369 6054 dVAR;
3dab1dad 6055 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6056 const char *s = RExC_parse;
6057 const char c = *s++;
b8c5462f 6058
3dab1dad 6059 while (isALNUM(*s))
b8c5462f
JH
6060 s++;
6061 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6062 if (ckWARN(WARN_REGEXP))
6063 vWARN3(s+2,
6064 "POSIX syntax [%c %c] belongs inside character classes",
6065 c, c);
b45f050a
JF
6066
6067 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6068 if (POSIXCC_NOTYET(c)) {
830247a4 6069 /* adjust RExC_parse so the error shows after
b45f050a 6070 the class closes */
9a86a77b 6071 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6072 NOOP;
b45f050a
JF
6073 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6074 }
b8c5462f
JH
6075 }
6076 }
620e46c5
JH
6077}
6078
7f6f358c
YO
6079
6080/*
6081 parse a class specification and produce either an ANYOF node that
6082 matches the pattern. If the pattern matches a single char only and
6083 that char is < 256 then we produce an EXACT node instead.
6084*/
76e3520e 6085STATIC regnode *
3dab1dad 6086S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6087{
97aff369 6088 dVAR;
9ef43ace 6089 register UV value = 0;
9a86a77b 6090 register UV nextvalue;
3568d838 6091 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6092 register IV range = 0;
c277df42 6093 register regnode *ret;
ba210ebe 6094 STRLEN numlen;
ffc61ed2 6095 IV namedclass;
cbbf8932 6096 char *rangebegin = NULL;
936ed897 6097 bool need_class = 0;
c445ea15 6098 SV *listsv = NULL;
ffc61ed2 6099 UV n;
9e55ce06 6100 bool optimize_invert = TRUE;
cbbf8932 6101 AV* unicode_alternate = NULL;
1b2d223b
JH
6102#ifdef EBCDIC
6103 UV literal_endpoint = 0;
6104#endif
7f6f358c 6105 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6106
3dab1dad 6107 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6108 case we need to change the emitted regop to an EXACT. */
07be1b83 6109 const char * orig_parse = RExC_parse;
72f13be8 6110 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6111#ifndef DEBUGGING
6112 PERL_UNUSED_ARG(depth);
6113#endif
72f13be8 6114
3dab1dad 6115 DEBUG_PARSE("clas");
7f6f358c
YO
6116
6117 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6118 ret = reganode(pRExC_state, ANYOF, 0);
6119
6120 if (!SIZE_ONLY)
6121 ANYOF_FLAGS(ret) = 0;
6122
9a86a77b 6123 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6124 RExC_naughty++;
6125 RExC_parse++;
6126 if (!SIZE_ONLY)
6127 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6128 }
a0d0e21e 6129
73060fc4 6130 if (SIZE_ONLY) {
830247a4 6131 RExC_size += ANYOF_SKIP;
73060fc4
JH
6132 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6133 }
936ed897 6134 else {
830247a4 6135 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6136 if (FOLD)
6137 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6138 if (LOC)
6139 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6140 ANYOF_BITMAP_ZERO(ret);
396482e1 6141 listsv = newSVpvs("# comment\n");
a0d0e21e 6142 }
b8c5462f 6143
9a86a77b
JH
6144 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6145
b938889d 6146 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6147 checkposixcc(pRExC_state);
b8c5462f 6148
f064b6ad
HS
6149 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6150 if (UCHARAT(RExC_parse) == ']')
6151 goto charclassloop;
ffc61ed2 6152
fc8cd66c 6153parseit:
9a86a77b 6154 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6155
6156 charclassloop:
6157
6158 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6159
73b437c8 6160 if (!range)
830247a4 6161 rangebegin = RExC_parse;
ffc61ed2 6162 if (UTF) {
5e12f4fb 6163 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6164 RExC_end - RExC_parse,
9f7f3913 6165 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6166 RExC_parse += numlen;
6167 }
6168 else
6169 value = UCHARAT(RExC_parse++);
7f6f358c 6170
9a86a77b
JH
6171 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6172 if (value == '[' && POSIXCC(nextvalue))
830247a4 6173 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6174 else if (value == '\\') {
ffc61ed2 6175 if (UTF) {
5e12f4fb 6176 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6177 RExC_end - RExC_parse,
9f7f3913 6178 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6179 RExC_parse += numlen;
6180 }
6181 else
6182 value = UCHARAT(RExC_parse++);
470c3474 6183 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6184 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6185 * be a problem later if we want switch on Unicode.
6186 * A similar issue a little bit later when switching on
6187 * namedclass. --jhi */
ffc61ed2 6188 switch ((I32)value) {
b8c5462f
JH
6189 case 'w': namedclass = ANYOF_ALNUM; break;
6190 case 'W': namedclass = ANYOF_NALNUM; break;
6191 case 's': namedclass = ANYOF_SPACE; break;
6192 case 'S': namedclass = ANYOF_NSPACE; break;
6193 case 'd': namedclass = ANYOF_DIGIT; break;
6194 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6195 case 'N': /* Handle \N{NAME} in class */
6196 {
6197 /* We only pay attention to the first char of
6198 multichar strings being returned. I kinda wonder
6199 if this makes sense as it does change the behaviour
6200 from earlier versions, OTOH that behaviour was broken
6201 as well. */
6202 UV v; /* value is register so we cant & it /grrr */
6203 if (reg_namedseq(pRExC_state, &v)) {
6204 goto parseit;
6205 }
6206 value= v;
6207 }
6208 break;
ffc61ed2
JH
6209 case 'p':
6210 case 'P':
3dab1dad
YO
6211 {
6212 char *e;
af6f566e 6213 if (RExC_parse >= RExC_end)
2a4859cd 6214 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6215 if (*RExC_parse == '{') {
1df70142 6216 const U8 c = (U8)value;
ffc61ed2
JH
6217 e = strchr(RExC_parse++, '}');
6218 if (!e)
0da60cf5 6219 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6220 while (isSPACE(UCHARAT(RExC_parse)))
6221 RExC_parse++;
6222 if (e == RExC_parse)
0da60cf5 6223 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6224 n = e - RExC_parse;
ab13f0c7
JH
6225 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6226 n--;
ffc61ed2
JH
6227 }
6228 else {
6229 e = RExC_parse;
6230 n = 1;
6231 }
6232 if (!SIZE_ONLY) {
ab13f0c7
JH
6233 if (UCHARAT(RExC_parse) == '^') {
6234 RExC_parse++;
6235 n--;
6236 value = value == 'p' ? 'P' : 'p'; /* toggle */
6237 while (isSPACE(UCHARAT(RExC_parse))) {
6238 RExC_parse++;
6239 n--;
6240 }
6241 }
097eb12c
AL
6242 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6243 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6244 }
6245 RExC_parse = e + 1;
6246 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6247 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6248 }
f81125e2 6249 break;
b8c5462f
JH
6250 case 'n': value = '\n'; break;
6251 case 'r': value = '\r'; break;
6252 case 't': value = '\t'; break;
6253 case 'f': value = '\f'; break;
6254 case 'b': value = '\b'; break;
c7f1f016
NIS
6255 case 'e': value = ASCII_TO_NATIVE('\033');break;
6256 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6257 case 'x':
ffc61ed2 6258 if (*RExC_parse == '{') {
a4c04bdc
NC
6259 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6260 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6261 char * const e = strchr(RExC_parse++, '}');
b81d288d 6262 if (!e)
ffc61ed2 6263 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6264
6265 numlen = e - RExC_parse;
6266 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6267 RExC_parse = e + 1;
6268 }
6269 else {
a4c04bdc 6270 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6271 numlen = 2;
6272 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6273 RExC_parse += numlen;
6274 }
b8c5462f
JH
6275 break;
6276 case 'c':
830247a4 6277 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6278 value = toCTRL(value);
6279 break;
6280 case '0': case '1': case '2': case '3': case '4':
6281 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
6282 {
6283 I32 flags = 0;
6284 numlen = 3;
6285 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 6286 RExC_parse += numlen;
b8c5462f 6287 break;
53305cf1 6288 }
1028017a 6289 default:
041457d9 6290 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6291 vWARN2(RExC_parse,
6292 "Unrecognized escape \\%c in character class passed through",
6293 (int)value);
1028017a 6294 break;
b8c5462f 6295 }
ffc61ed2 6296 } /* end of \blah */
1b2d223b
JH
6297#ifdef EBCDIC
6298 else
6299 literal_endpoint++;
6300#endif
ffc61ed2
JH
6301
6302 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6303
6304 if (!SIZE_ONLY && !need_class)
936ed897 6305 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6306
936ed897 6307 need_class = 1;
ffc61ed2
JH
6308
6309 /* a bad range like a-\d, a-[:digit:] ? */
6310 if (range) {
73b437c8 6311 if (!SIZE_ONLY) {
afd78fd5 6312 if (ckWARN(WARN_REGEXP)) {
097eb12c 6313 const int w =
afd78fd5
JH
6314 RExC_parse >= rangebegin ?
6315 RExC_parse - rangebegin : 0;
830247a4 6316 vWARN4(RExC_parse,
b45f050a 6317 "False [] range \"%*.*s\"",
097eb12c 6318 w, w, rangebegin);
afd78fd5 6319 }
3568d838
JH
6320 if (prevvalue < 256) {
6321 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6322 ANYOF_BITMAP_SET(ret, '-');
6323 }
6324 else {
6325 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6326 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6327 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6328 }
b8c5462f 6329 }
ffc61ed2
JH
6330
6331 range = 0; /* this was not a true range */
73b437c8 6332 }
ffc61ed2 6333
73b437c8 6334 if (!SIZE_ONLY) {
c49a72a9
NC
6335 const char *what = NULL;
6336 char yesno = 0;
6337
3568d838
JH
6338 if (namedclass > OOB_NAMEDCLASS)
6339 optimize_invert = FALSE;
e2962f66
JH
6340 /* Possible truncation here but in some 64-bit environments
6341 * the compiler gets heartburn about switch on 64-bit values.
6342 * A similar issue a little earlier when switching on value.
98f323fa 6343 * --jhi */
e2962f66 6344 switch ((I32)namedclass) {
73b437c8
JH
6345 case ANYOF_ALNUM:
6346 if (LOC)
936ed897 6347 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6348 else {
6349 for (value = 0; value < 256; value++)
6350 if (isALNUM(value))
936ed897 6351 ANYOF_BITMAP_SET(ret, value);
73b437c8 6352 }
c49a72a9
NC
6353 yesno = '+';
6354 what = "Word";
73b437c8
JH
6355 break;
6356 case ANYOF_NALNUM:
6357 if (LOC)
936ed897 6358 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6359 else {
6360 for (value = 0; value < 256; value++)
6361 if (!isALNUM(value))
936ed897 6362 ANYOF_BITMAP_SET(ret, value);
73b437c8 6363 }
c49a72a9
NC
6364 yesno = '!';
6365 what = "Word";
73b437c8 6366 break;
ffc61ed2 6367 case ANYOF_ALNUMC:
73b437c8 6368 if (LOC)
ffc61ed2 6369 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6370 else {
6371 for (value = 0; value < 256; value++)
ffc61ed2 6372 if (isALNUMC(value))
936ed897 6373 ANYOF_BITMAP_SET(ret, value);
73b437c8 6374 }
c49a72a9
NC
6375 yesno = '+';
6376 what = "Alnum";
73b437c8
JH
6377 break;
6378 case ANYOF_NALNUMC:
6379 if (LOC)
936ed897 6380 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6381 else {
6382 for (value = 0; value < 256; value++)
6383 if (!isALNUMC(value))
936ed897 6384 ANYOF_BITMAP_SET(ret, value);
73b437c8 6385 }
c49a72a9
NC
6386 yesno = '!';
6387 what = "Alnum";
73b437c8
JH
6388 break;
6389 case ANYOF_ALPHA:
6390 if (LOC)
936ed897 6391 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6392 else {
6393 for (value = 0; value < 256; value++)
6394 if (isALPHA(value))
936ed897 6395 ANYOF_BITMAP_SET(ret, value);
73b437c8 6396 }
c49a72a9
NC
6397 yesno = '+';
6398 what = "Alpha";
73b437c8
JH
6399 break;
6400 case ANYOF_NALPHA:
6401 if (LOC)
936ed897 6402 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6403 else {
6404 for (value = 0; value < 256; value++)
6405 if (!isALPHA(value))
936ed897 6406 ANYOF_BITMAP_SET(ret, value);
73b437c8 6407 }
c49a72a9
NC
6408 yesno = '!';
6409 what = "Alpha";
73b437c8
JH
6410 break;
6411 case ANYOF_ASCII:
6412 if (LOC)
936ed897 6413 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6414 else {
c7f1f016 6415#ifndef EBCDIC
1ba5c669
JH
6416 for (value = 0; value < 128; value++)
6417 ANYOF_BITMAP_SET(ret, value);
6418#else /* EBCDIC */
ffbc6a93 6419 for (value = 0; value < 256; value++) {
3a3c4447
JH
6420 if (isASCII(value))
6421 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6422 }
1ba5c669 6423#endif /* EBCDIC */
73b437c8 6424 }
c49a72a9
NC
6425 yesno = '+';
6426 what = "ASCII";
73b437c8
JH
6427 break;
6428 case ANYOF_NASCII:
6429 if (LOC)
936ed897 6430 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6431 else {
c7f1f016 6432#ifndef EBCDIC
1ba5c669
JH
6433 for (value = 128; value < 256; value++)
6434 ANYOF_BITMAP_SET(ret, value);
6435#else /* EBCDIC */
ffbc6a93 6436 for (value = 0; value < 256; value++) {
3a3c4447
JH
6437 if (!isASCII(value))
6438 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6439 }
1ba5c669 6440#endif /* EBCDIC */
73b437c8 6441 }
c49a72a9
NC
6442 yesno = '!';
6443 what = "ASCII";
73b437c8 6444 break;
aaa51d5e
JF
6445 case ANYOF_BLANK:
6446 if (LOC)
6447 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6448 else {
6449 for (value = 0; value < 256; value++)
6450 if (isBLANK(value))
6451 ANYOF_BITMAP_SET(ret, value);
6452 }
c49a72a9
NC
6453 yesno = '+';
6454 what = "Blank";
aaa51d5e
JF
6455 break;
6456 case ANYOF_NBLANK:
6457 if (LOC)
6458 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6459 else {
6460 for (value = 0; value < 256; value++)
6461 if (!isBLANK(value))
6462 ANYOF_BITMAP_SET(ret, value);
6463 }
c49a72a9
NC
6464 yesno = '!';
6465 what = "Blank";
aaa51d5e 6466 break;
73b437c8
JH
6467 case ANYOF_CNTRL:
6468 if (LOC)
936ed897 6469 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6470 else {
6471 for (value = 0; value < 256; value++)
6472 if (isCNTRL(value))
936ed897 6473 ANYOF_BITMAP_SET(ret, value);
73b437c8 6474 }
c49a72a9
NC
6475 yesno = '+';
6476 what = "Cntrl";
73b437c8
JH
6477 break;
6478 case ANYOF_NCNTRL:
6479 if (LOC)
936ed897 6480 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6481 else {
6482 for (value = 0; value < 256; value++)
6483 if (!isCNTRL(value))
936ed897 6484 ANYOF_BITMAP_SET(ret, value);
73b437c8 6485 }
c49a72a9
NC
6486 yesno = '!';
6487 what = "Cntrl";
ffc61ed2
JH
6488 break;
6489 case ANYOF_DIGIT:
6490 if (LOC)
6491 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6492 else {
6493 /* consecutive digits assumed */
6494 for (value = '0'; value <= '9'; value++)
6495 ANYOF_BITMAP_SET(ret, value);
6496 }
c49a72a9
NC
6497 yesno = '+';
6498 what = "Digit";
ffc61ed2
JH
6499 break;
6500 case ANYOF_NDIGIT:
6501 if (LOC)
6502 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6503 else {
6504 /* consecutive digits assumed */
6505 for (value = 0; value < '0'; value++)
6506 ANYOF_BITMAP_SET(ret, value);
6507 for (value = '9' + 1; value < 256; value++)
6508 ANYOF_BITMAP_SET(ret, value);
6509 }
c49a72a9
NC
6510 yesno = '!';
6511 what = "Digit";
73b437c8
JH
6512 break;
6513 case ANYOF_GRAPH:
6514 if (LOC)
936ed897 6515 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6516 else {
6517 for (value = 0; value < 256; value++)
6518 if (isGRAPH(value))
936ed897 6519 ANYOF_BITMAP_SET(ret, value);
73b437c8 6520 }
c49a72a9
NC
6521 yesno = '+';
6522 what = "Graph";
73b437c8
JH
6523 break;
6524 case ANYOF_NGRAPH:
6525 if (LOC)
936ed897 6526 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
6527 else {
6528 for (value = 0; value < 256; value++)
6529 if (!isGRAPH(value))
936ed897 6530 ANYOF_BITMAP_SET(ret, value);
73b437c8 6531 }
c49a72a9
NC
6532 yesno = '!';
6533 what = "Graph";
73b437c8
JH
6534 break;
6535 case ANYOF_LOWER:
6536 if (LOC)
936ed897 6537 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
6538 else {
6539 for (value = 0; value < 256; value++)
6540 if (isLOWER(value))
936ed897 6541 ANYOF_BITMAP_SET(ret, value);
73b437c8 6542 }
c49a72a9
NC
6543 yesno = '+';
6544 what = "Lower";
73b437c8
JH
6545 break;
6546 case ANYOF_NLOWER:
6547 if (LOC)
936ed897 6548 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
6549 else {
6550 for (value = 0; value < 256; value++)
6551 if (!isLOWER(value))
936ed897 6552 ANYOF_BITMAP_SET(ret, value);
73b437c8 6553 }
c49a72a9
NC
6554 yesno = '!';
6555 what = "Lower";
73b437c8
JH
6556 break;
6557 case ANYOF_PRINT:
6558 if (LOC)
936ed897 6559 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
6560 else {
6561 for (value = 0; value < 256; value++)
6562 if (isPRINT(value))
936ed897 6563 ANYOF_BITMAP_SET(ret, value);
73b437c8 6564 }
c49a72a9
NC
6565 yesno = '+';
6566 what = "Print";
73b437c8
JH
6567 break;
6568 case ANYOF_NPRINT:
6569 if (LOC)
936ed897 6570 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
6571 else {
6572 for (value = 0; value < 256; value++)
6573 if (!isPRINT(value))
936ed897 6574 ANYOF_BITMAP_SET(ret, value);
73b437c8 6575 }
c49a72a9
NC
6576 yesno = '!';
6577 what = "Print";
73b437c8 6578 break;
aaa51d5e
JF
6579 case ANYOF_PSXSPC:
6580 if (LOC)
6581 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
6582 else {
6583 for (value = 0; value < 256; value++)
6584 if (isPSXSPC(value))
6585 ANYOF_BITMAP_SET(ret, value);
6586 }
c49a72a9
NC
6587 yesno = '+';
6588 what = "Space";
aaa51d5e
JF
6589 break;
6590 case ANYOF_NPSXSPC:
6591 if (LOC)
6592 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
6593 else {
6594 for (value = 0; value < 256; value++)
6595 if (!isPSXSPC(value))
6596 ANYOF_BITMAP_SET(ret, value);
6597 }
c49a72a9
NC
6598 yesno = '!';
6599 what = "Space";
aaa51d5e 6600 break;
73b437c8
JH
6601 case ANYOF_PUNCT:
6602 if (LOC)
936ed897 6603 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
6604 else {
6605 for (value = 0; value < 256; value++)
6606 if (isPUNCT(value))
936ed897 6607 ANYOF_BITMAP_SET(ret, value);
73b437c8 6608 }
c49a72a9
NC
6609 yesno = '+';
6610 what = "Punct";
73b437c8
JH
6611 break;
6612 case ANYOF_NPUNCT:
6613 if (LOC)
936ed897 6614 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
6615 else {
6616 for (value = 0; value < 256; value++)
6617 if (!isPUNCT(value))
936ed897 6618 ANYOF_BITMAP_SET(ret, value);
73b437c8 6619 }
c49a72a9
NC
6620 yesno = '!';
6621 what = "Punct";
ffc61ed2
JH
6622 break;
6623 case ANYOF_SPACE:
6624 if (LOC)
6625 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
6626 else {
6627 for (value = 0; value < 256; value++)
6628 if (isSPACE(value))
6629 ANYOF_BITMAP_SET(ret, value);
6630 }
c49a72a9
NC
6631 yesno = '+';
6632 what = "SpacePerl";
ffc61ed2
JH
6633 break;
6634 case ANYOF_NSPACE:
6635 if (LOC)
6636 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
6637 else {
6638 for (value = 0; value < 256; value++)
6639 if (!isSPACE(value))
6640 ANYOF_BITMAP_SET(ret, value);
6641 }
c49a72a9
NC
6642 yesno = '!';
6643 what = "SpacePerl";
73b437c8
JH
6644 break;
6645 case ANYOF_UPPER:
6646 if (LOC)
936ed897 6647 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
6648 else {
6649 for (value = 0; value < 256; value++)
6650 if (isUPPER(value))
936ed897 6651 ANYOF_BITMAP_SET(ret, value);
73b437c8 6652 }
c49a72a9
NC
6653 yesno = '+';
6654 what = "Upper";
73b437c8
JH
6655 break;
6656 case ANYOF_NUPPER:
6657 if (LOC)
936ed897 6658 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
6659 else {
6660 for (value = 0; value < 256; value++)
6661 if (!isUPPER(value))
936ed897 6662 ANYOF_BITMAP_SET(ret, value);
73b437c8 6663 }
c49a72a9
NC
6664 yesno = '!';
6665 what = "Upper";
73b437c8
JH
6666 break;
6667 case ANYOF_XDIGIT:
6668 if (LOC)
936ed897 6669 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
6670 else {
6671 for (value = 0; value < 256; value++)
6672 if (isXDIGIT(value))
936ed897 6673 ANYOF_BITMAP_SET(ret, value);
73b437c8 6674 }
c49a72a9
NC
6675 yesno = '+';
6676 what = "XDigit";
73b437c8
JH
6677 break;
6678 case ANYOF_NXDIGIT:
6679 if (LOC)
936ed897 6680 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
6681 else {
6682 for (value = 0; value < 256; value++)
6683 if (!isXDIGIT(value))
936ed897 6684 ANYOF_BITMAP_SET(ret, value);
73b437c8 6685 }
c49a72a9
NC
6686 yesno = '!';
6687 what = "XDigit";
73b437c8 6688 break;
f81125e2
JP
6689 case ANYOF_MAX:
6690 /* this is to handle \p and \P */
6691 break;
73b437c8 6692 default:
b45f050a 6693 vFAIL("Invalid [::] class");
73b437c8 6694 break;
b8c5462f 6695 }
c49a72a9
NC
6696 if (what) {
6697 /* Strings such as "+utf8::isWord\n" */
6698 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
6699 }
b8c5462f 6700 if (LOC)
936ed897 6701 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 6702 continue;
a0d0e21e 6703 }
ffc61ed2
JH
6704 } /* end of namedclass \blah */
6705
a0d0e21e 6706 if (range) {
eb160463 6707 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
6708 const int w = RExC_parse - rangebegin;
6709 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 6710 range = 0; /* not a valid range */
73b437c8 6711 }
a0d0e21e
LW
6712 }
6713 else {
3568d838 6714 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
6715 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
6716 RExC_parse[1] != ']') {
6717 RExC_parse++;
ffc61ed2
JH
6718
6719 /* a bad range like \w-, [:word:]- ? */
6720 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 6721 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 6722 const int w =
afd78fd5
JH
6723 RExC_parse >= rangebegin ?
6724 RExC_parse - rangebegin : 0;
830247a4 6725 vWARN4(RExC_parse,
b45f050a 6726 "False [] range \"%*.*s\"",
097eb12c 6727 w, w, rangebegin);
afd78fd5 6728 }
73b437c8 6729 if (!SIZE_ONLY)
936ed897 6730 ANYOF_BITMAP_SET(ret, '-');
73b437c8 6731 } else
ffc61ed2
JH
6732 range = 1; /* yeah, it's a range! */
6733 continue; /* but do it the next time */
a0d0e21e 6734 }
a687059c 6735 }
ffc61ed2 6736
93733859 6737 /* now is the next time */
07be1b83 6738 /*stored += (value - prevvalue + 1);*/
ae5c130c 6739 if (!SIZE_ONLY) {
3568d838 6740 if (prevvalue < 256) {
1df70142 6741 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 6742 IV i;
3568d838 6743#ifdef EBCDIC
1b2d223b
JH
6744 /* In EBCDIC [\x89-\x91] should include
6745 * the \x8e but [i-j] should not. */
6746 if (literal_endpoint == 2 &&
6747 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
6748 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 6749 {
3568d838
JH
6750 if (isLOWER(prevvalue)) {
6751 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6752 if (isLOWER(i))
6753 ANYOF_BITMAP_SET(ret, i);
6754 } else {
3568d838 6755 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
6756 if (isUPPER(i))
6757 ANYOF_BITMAP_SET(ret, i);
6758 }
8ada0baa 6759 }
ffc61ed2 6760 else
8ada0baa 6761#endif
07be1b83
YO
6762 for (i = prevvalue; i <= ceilvalue; i++) {
6763 if (!ANYOF_BITMAP_TEST(ret,i)) {
6764 stored++;
6765 ANYOF_BITMAP_SET(ret, i);
6766 }
6767 }
3568d838 6768 }
a5961de5 6769 if (value > 255 || UTF) {
1df70142
AL
6770 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
6771 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 6772 stored+=2; /* can't optimize this class */
ffc61ed2 6773 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 6774 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 6775 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
6776 prevnatvalue, natvalue);
6777 }
6778 else if (prevnatvalue == natvalue) {
6779 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 6780 if (FOLD) {
89ebb4a3 6781 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 6782 STRLEN foldlen;
1df70142 6783 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 6784
e294cc5d
JH
6785#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
6786 if (RExC_precomp[0] == ':' &&
6787 RExC_precomp[1] == '[' &&
6788 (f == 0xDF || f == 0x92)) {
6789 f = NATIVE_TO_UNI(f);
6790 }
6791#endif
c840d2a2
JH
6792 /* If folding and foldable and a single
6793 * character, insert also the folded version
6794 * to the charclass. */
9e55ce06 6795 if (f != value) {
e294cc5d
JH
6796#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
6797 if ((RExC_precomp[0] == ':' &&
6798 RExC_precomp[1] == '[' &&
6799 (f == 0xA2 &&
6800 (value == 0xFB05 || value == 0xFB06))) ?
6801 foldlen == ((STRLEN)UNISKIP(f) - 1) :
6802 foldlen == (STRLEN)UNISKIP(f) )
6803#else
eb160463 6804 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 6805#endif
9e55ce06
JH
6806 Perl_sv_catpvf(aTHX_ listsv,
6807 "%04"UVxf"\n", f);
6808 else {
6809 /* Any multicharacter foldings
6810 * require the following transform:
6811 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
6812 * where E folds into "pq" and F folds
6813 * into "rst", all other characters
6814 * fold to single characters. We save
6815 * away these multicharacter foldings,
6816 * to be later saved as part of the
6817 * additional "s" data. */
6818 SV *sv;
6819
6820 if (!unicode_alternate)
6821 unicode_alternate = newAV();
6822 sv = newSVpvn((char*)foldbuf, foldlen);
6823 SvUTF8_on(sv);
6824 av_push(unicode_alternate, sv);
6825 }
6826 }
254ba52a 6827
60a8b682
JH
6828 /* If folding and the value is one of the Greek
6829 * sigmas insert a few more sigmas to make the
6830 * folding rules of the sigmas to work right.
6831 * Note that not all the possible combinations
6832 * are handled here: some of them are handled
9e55ce06
JH
6833 * by the standard folding rules, and some of
6834 * them (literal or EXACTF cases) are handled
6835 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
6836 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
6837 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6838 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 6839 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6840 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6841 }
6842 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
6843 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 6844 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
6845 }
6846 }
ffc61ed2 6847 }
1b2d223b
JH
6848#ifdef EBCDIC
6849 literal_endpoint = 0;
6850#endif
8ada0baa 6851 }
ffc61ed2
JH
6852
6853 range = 0; /* this range (if it was one) is done now */
a0d0e21e 6854 }
ffc61ed2 6855
936ed897 6856 if (need_class) {
4f66b38d 6857 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 6858 if (SIZE_ONLY)
830247a4 6859 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 6860 else
830247a4 6861 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 6862 }
ffc61ed2 6863
7f6f358c
YO
6864
6865 if (SIZE_ONLY)
6866 return ret;
6867 /****** !SIZE_ONLY AFTER HERE *********/
6868
6869 if( stored == 1 && value < 256
6870 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
6871 ) {
6872 /* optimize single char class to an EXACT node
6873 but *only* when its not a UTF/high char */
07be1b83
YO
6874 const char * cur_parse= RExC_parse;
6875 RExC_emit = (regnode *)orig_emit;
6876 RExC_parse = (char *)orig_parse;
7f6f358c
YO
6877 ret = reg_node(pRExC_state,
6878 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 6879 RExC_parse = (char *)cur_parse;
7f6f358c
YO
6880 *STRING(ret)= (char)value;
6881 STR_LEN(ret)= 1;
6882 RExC_emit += STR_SZ(1);
6883 return ret;
6884 }
ae5c130c 6885 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 6886 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
6887 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
6888 ) {
a0ed51b3 6889 for (value = 0; value < 256; ++value) {
936ed897 6890 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 6891 UV fold = PL_fold[value];
ffc61ed2
JH
6892
6893 if (fold != value)
6894 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
6895 }
6896 }
936ed897 6897 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 6898 }
ffc61ed2 6899
ae5c130c 6900 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 6901 if (optimize_invert &&
ffc61ed2
JH
6902 /* If the only flag is inversion. */
6903 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 6904 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 6905 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 6906 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 6907 }
7f6f358c 6908 {
097eb12c 6909 AV * const av = newAV();
ffc61ed2 6910 SV *rv;
9e55ce06 6911 /* The 0th element stores the character class description
6a0407ee 6912 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
6913 * to initialize the appropriate swash (which gets stored in
6914 * the 1st element), and also useful for dumping the regnode.
6915 * The 2nd element stores the multicharacter foldings,
6a0407ee 6916 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
6917 av_store(av, 0, listsv);
6918 av_store(av, 1, NULL);
9e55ce06 6919 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 6920 rv = newRV_noinc((SV*)av);
19860706 6921 n = add_data(pRExC_state, 1, "s");
830247a4 6922 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 6923 ARG_SET(ret, n);
a0ed51b3 6924 }
a0ed51b3
LW
6925 return ret;
6926}
6927
76e3520e 6928STATIC char*
830247a4 6929S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 6930{
097eb12c 6931 char* const retval = RExC_parse++;
a0d0e21e 6932
4633a7c4 6933 for (;;) {
830247a4
IZ
6934 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
6935 RExC_parse[2] == '#') {
e994fd66
AE
6936 while (*RExC_parse != ')') {
6937 if (RExC_parse == RExC_end)
6938 FAIL("Sequence (?#... not terminated");
830247a4 6939 RExC_parse++;
e994fd66 6940 }
830247a4 6941 RExC_parse++;
4633a7c4
LW
6942 continue;
6943 }
e2509266 6944 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
6945 if (isSPACE(*RExC_parse)) {
6946 RExC_parse++;
748a9306
LW
6947 continue;
6948 }
830247a4 6949 else if (*RExC_parse == '#') {
e994fd66
AE
6950 while (RExC_parse < RExC_end)
6951 if (*RExC_parse++ == '\n') break;
748a9306
LW
6952 continue;
6953 }
748a9306 6954 }
4633a7c4 6955 return retval;
a0d0e21e 6956 }
a687059c
LW
6957}
6958
6959/*
c277df42 6960- reg_node - emit a node
a0d0e21e 6961*/
76e3520e 6962STATIC regnode * /* Location. */
830247a4 6963S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 6964{
97aff369 6965 dVAR;
c277df42 6966 register regnode *ptr;
504618e9 6967 regnode * const ret = RExC_emit;
07be1b83 6968 GET_RE_DEBUG_FLAGS_DECL;
a687059c 6969
c277df42 6970 if (SIZE_ONLY) {
830247a4
IZ
6971 SIZE_ALIGN(RExC_size);
6972 RExC_size += 1;
a0d0e21e
LW
6973 return(ret);
6974 }
c277df42 6975 NODE_ALIGN_FILL(ret);
a0d0e21e 6976 ptr = ret;
c277df42 6977 FILL_ADVANCE_NODE(ptr, op);
fac92740 6978 if (RExC_offsets) { /* MJD */
07be1b83 6979 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
6980 "reg_node", __LINE__,
6981 reg_name[op],
07be1b83
YO
6982 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
6983 ? "Overwriting end of array!\n" : "OK",
6984 (UV)(RExC_emit - RExC_emit_start),
6985 (UV)(RExC_parse - RExC_start),
6986 (UV)RExC_offsets[0]));
ccb2c380 6987 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 6988 }
07be1b83 6989
830247a4 6990 RExC_emit = ptr;
a687059c 6991
a0d0e21e 6992 return(ret);
a687059c
LW
6993}
6994
6995/*
a0d0e21e
LW
6996- reganode - emit a node with an argument
6997*/
76e3520e 6998STATIC regnode * /* Location. */
830247a4 6999S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7000{
97aff369 7001 dVAR;
c277df42 7002 register regnode *ptr;
504618e9 7003 regnode * const ret = RExC_emit;
07be1b83 7004 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7005
c277df42 7006 if (SIZE_ONLY) {
830247a4
IZ
7007 SIZE_ALIGN(RExC_size);
7008 RExC_size += 2;
6bda09f9
YO
7009 /*
7010 We can't do this:
7011
7012 assert(2==regarglen[op]+1);
7013
7014 Anything larger than this has to allocate the extra amount.
7015 If we changed this to be:
7016
7017 RExC_size += (1 + regarglen[op]);
7018
7019 then it wouldn't matter. Its not clear what side effect
7020 might come from that so its not done so far.
7021 -- dmq
7022 */
a0d0e21e
LW
7023 return(ret);
7024 }
fe14fcc3 7025
c277df42 7026 NODE_ALIGN_FILL(ret);
a0d0e21e 7027 ptr = ret;
c277df42 7028 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7029 if (RExC_offsets) { /* MJD */
07be1b83 7030 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7031 "reganode",
ccb2c380
MP
7032 __LINE__,
7033 reg_name[op],
07be1b83 7034 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7035 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7036 (UV)(RExC_emit - RExC_emit_start),
7037 (UV)(RExC_parse - RExC_start),
7038 (UV)RExC_offsets[0]));
ccb2c380 7039 Set_Cur_Node_Offset;
fac92740
MJD
7040 }
7041
830247a4 7042 RExC_emit = ptr;
fe14fcc3 7043
a0d0e21e 7044 return(ret);
fe14fcc3
LW
7045}
7046
7047/*
cd439c50 7048- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7049*/
71207a34
AL
7050STATIC STRLEN
7051S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7052{
97aff369 7053 dVAR;
71207a34 7054 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7055}
7056
7057/*
a0d0e21e
LW
7058- reginsert - insert an operator in front of already-emitted operand
7059*
7060* Means relocating the operand.
7061*/
76e3520e 7062STATIC void
6bda09f9 7063S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7064{
97aff369 7065 dVAR;
c277df42
IZ
7066 register regnode *src;
7067 register regnode *dst;
7068 register regnode *place;
504618e9 7069 const int offset = regarglen[(U8)op];
6bda09f9 7070 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7071 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7072/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7073 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7074 if (SIZE_ONLY) {
6bda09f9 7075 RExC_size += size;
a0d0e21e
LW
7076 return;
7077 }
a687059c 7078
830247a4 7079 src = RExC_emit;
6bda09f9 7080 RExC_emit += size;
830247a4 7081 dst = RExC_emit;
6bda09f9
YO
7082 if (RExC_parens) {
7083 int paren;
7084 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
7085 if ( RExC_parens[paren] >= src )
7086 RExC_parens[paren] += size;
7087 }
7088 }
7089
fac92740 7090 while (src > opnd) {
c277df42 7091 StructCopy(--src, --dst, regnode);
fac92740 7092 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7093 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7094 "reg_insert",
ccb2c380
MP
7095 __LINE__,
7096 reg_name[op],
07be1b83
YO
7097 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7098 ? "Overwriting end of array!\n" : "OK",
7099 (UV)(src - RExC_emit_start),
7100 (UV)(dst - RExC_emit_start),
7101 (UV)RExC_offsets[0]));
ccb2c380
MP
7102 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7103 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7104 }
7105 }
7106
a0d0e21e
LW
7107
7108 place = opnd; /* Op node, where operand used to be. */
fac92740 7109 if (RExC_offsets) { /* MJD */
07be1b83 7110 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7111 "reginsert",
ccb2c380
MP
7112 __LINE__,
7113 reg_name[op],
07be1b83 7114 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7115 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7116 (UV)(place - RExC_emit_start),
7117 (UV)(RExC_parse - RExC_start),
786e8c11 7118 (UV)RExC_offsets[0]));
ccb2c380 7119 Set_Node_Offset(place, RExC_parse);
45948336 7120 Set_Node_Length(place, 1);
fac92740 7121 }
c277df42
IZ
7122 src = NEXTOPER(place);
7123 FILL_ADVANCE_NODE(place, op);
7124 Zero(src, offset, regnode);
a687059c
LW
7125}
7126
7127/*
c277df42 7128- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7129- SEE ALSO: regtail_study
a0d0e21e 7130*/
097eb12c 7131/* TODO: All three parms should be const */
76e3520e 7132STATIC void
3dab1dad 7133S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7134{
97aff369 7135 dVAR;
c277df42 7136 register regnode *scan;
72f13be8 7137 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7138#ifndef DEBUGGING
7139 PERL_UNUSED_ARG(depth);
7140#endif
a0d0e21e 7141
c277df42 7142 if (SIZE_ONLY)
a0d0e21e
LW
7143 return;
7144
7145 /* Find last node. */
7146 scan = p;
7147 for (;;) {
504618e9 7148 regnode * const temp = regnext(scan);
3dab1dad
YO
7149 DEBUG_PARSE_r({
7150 SV * const mysv=sv_newmortal();
7151 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7152 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7153 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7154 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7155 (temp == NULL ? "->" : ""),
7156 (temp == NULL ? reg_name[OP(val)] : "")
7157 );
3dab1dad
YO
7158 });
7159 if (temp == NULL)
7160 break;
7161 scan = temp;
7162 }
7163
7164 if (reg_off_by_arg[OP(scan)]) {
7165 ARG_SET(scan, val - scan);
7166 }
7167 else {
7168 NEXT_OFF(scan) = val - scan;
7169 }
7170}
7171
07be1b83 7172#ifdef DEBUGGING
3dab1dad
YO
7173/*
7174- regtail_study - set the next-pointer at the end of a node chain of p to val.
7175- Look for optimizable sequences at the same time.
7176- currently only looks for EXACT chains.
07be1b83
YO
7177
7178This is expermental code. The idea is to use this routine to perform
7179in place optimizations on branches and groups as they are constructed,
7180with the long term intention of removing optimization from study_chunk so
7181that it is purely analytical.
7182
7183Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7184to control which is which.
7185
3dab1dad
YO
7186*/
7187/* TODO: All four parms should be const */
07be1b83 7188
3dab1dad
YO
7189STATIC U8
7190S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7191{
7192 dVAR;
7193 register regnode *scan;
07be1b83
YO
7194 U8 exact = PSEUDO;
7195#ifdef EXPERIMENTAL_INPLACESCAN
7196 I32 min = 0;
7197#endif
7198
3dab1dad
YO
7199 GET_RE_DEBUG_FLAGS_DECL;
7200
07be1b83 7201
3dab1dad
YO
7202 if (SIZE_ONLY)
7203 return exact;
7204
7205 /* Find last node. */
7206
7207 scan = p;
7208 for (;;) {
7209 regnode * const temp = regnext(scan);
07be1b83
YO
7210#ifdef EXPERIMENTAL_INPLACESCAN
7211 if (PL_regkind[OP(scan)] == EXACT)
7212 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7213 return EXACT;
7214#endif
3dab1dad
YO
7215 if ( exact ) {
7216 switch (OP(scan)) {
7217 case EXACT:
7218 case EXACTF:
7219 case EXACTFL:
7220 if( exact == PSEUDO )
7221 exact= OP(scan);
07be1b83
YO
7222 else if ( exact != OP(scan) )
7223 exact= 0;
3dab1dad
YO
7224 case NOTHING:
7225 break;
7226 default:
7227 exact= 0;
7228 }
7229 }
7230 DEBUG_PARSE_r({
7231 SV * const mysv=sv_newmortal();
7232 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7233 regprop(RExC_rx, mysv, scan);
eaf3ca90 7234 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7235 SvPV_nolen_const(mysv),
eaf3ca90
YO
7236 REG_NODE_NUM(scan),
7237 reg_name[exact]);
3dab1dad 7238 });
a0d0e21e
LW
7239 if (temp == NULL)
7240 break;
7241 scan = temp;
7242 }
07be1b83
YO
7243 DEBUG_PARSE_r({
7244 SV * const mysv_val=sv_newmortal();
7245 DEBUG_PARSE_MSG("");
7246 regprop(RExC_rx, mysv_val, val);
7247 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7248 SvPV_nolen_const(mysv_val),
7249 REG_NODE_NUM(val),
7250 val - scan
7251 );
7252 });
c277df42
IZ
7253 if (reg_off_by_arg[OP(scan)]) {
7254 ARG_SET(scan, val - scan);
a0ed51b3
LW
7255 }
7256 else {
c277df42
IZ
7257 NEXT_OFF(scan) = val - scan;
7258 }
3dab1dad
YO
7259
7260 return exact;
a687059c 7261}
07be1b83 7262#endif
a687059c
LW
7263
7264/*
a687059c
LW
7265 - regcurly - a little FSA that accepts {\d+,?\d*}
7266 */
79072805 7267STATIC I32
5f66b61c 7268S_regcurly(register const char *s)
a687059c
LW
7269{
7270 if (*s++ != '{')
7271 return FALSE;
f0fcb552 7272 if (!isDIGIT(*s))
a687059c 7273 return FALSE;
f0fcb552 7274 while (isDIGIT(*s))
a687059c
LW
7275 s++;
7276 if (*s == ',')
7277 s++;
f0fcb552 7278 while (isDIGIT(*s))
a687059c
LW
7279 s++;
7280 if (*s != '}')
7281 return FALSE;
7282 return TRUE;
7283}
7284
a687059c
LW
7285
7286/*
fd181c75 7287 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7288 */
7289void
097eb12c 7290Perl_regdump(pTHX_ const regexp *r)
a687059c 7291{
35ff7856 7292#ifdef DEBUGGING
97aff369 7293 dVAR;
c445ea15 7294 SV * const sv = sv_newmortal();
ab3bbdeb 7295 SV *dsv= sv_newmortal();
a687059c 7296
786e8c11 7297 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7298
7299 /* Header fields of interest. */
ab3bbdeb
YO
7300 if (r->anchored_substr) {
7301 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7302 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7303 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7304 "anchored %s%s at %"IVdf" ",
7305 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7306 (IV)r->anchored_offset);
ab3bbdeb
YO
7307 } else if (r->anchored_utf8) {
7308 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7309 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7310 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7311 "anchored utf8 %s%s at %"IVdf" ",
7312 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7313 (IV)r->anchored_offset);
ab3bbdeb
YO
7314 }
7315 if (r->float_substr) {
7316 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7317 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7318 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7319 "floating %s%s at %"IVdf"..%"UVuf" ",
7320 s, RE_SV_TAIL(r->float_substr),
7b0972df 7321 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7322 } else if (r->float_utf8) {
7323 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7324 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7325 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7326 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7327 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7328 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7329 }
33b8afdf 7330 if (r->check_substr || r->check_utf8)
b81d288d 7331 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7332 (const char *)
7333 (r->check_substr == r->float_substr
7334 && r->check_utf8 == r->float_utf8
7335 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7336 if (r->reganch & ROPT_NOSCAN)
7337 PerlIO_printf(Perl_debug_log, " noscan");
7338 if (r->reganch & ROPT_CHECK_ALL)
7339 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7340 if (r->check_substr || r->check_utf8)
c277df42
IZ
7341 PerlIO_printf(Perl_debug_log, ") ");
7342
46fc3d4c 7343 if (r->regstclass) {
32fc9b6a 7344 regprop(r, sv, r->regstclass);
1de06328 7345 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7346 }
774d564b 7347 if (r->reganch & ROPT_ANCH) {
7348 PerlIO_printf(Perl_debug_log, "anchored");
7349 if (r->reganch & ROPT_ANCH_BOL)
7350 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7351 if (r->reganch & ROPT_ANCH_MBOL)
7352 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7353 if (r->reganch & ROPT_ANCH_SBOL)
7354 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7355 if (r->reganch & ROPT_ANCH_GPOS)
7356 PerlIO_printf(Perl_debug_log, "(GPOS)");
7357 PerlIO_putc(Perl_debug_log, ' ');
7358 }
c277df42
IZ
7359 if (r->reganch & ROPT_GPOS_SEEN)
7360 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7361 if (r->reganch & ROPT_SKIP)
760ac839 7362 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7363 if (r->reganch & ROPT_IMPLICIT)
760ac839 7364 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7365 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7366 if (r->reganch & ROPT_EVAL_SEEN)
7367 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7368 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7369#else
96a5add6 7370 PERL_UNUSED_CONTEXT;
65e66c80 7371 PERL_UNUSED_ARG(r);
17c3b450 7372#endif /* DEBUGGING */
a687059c
LW
7373}
7374
7375/*
a0d0e21e
LW
7376- regprop - printable representation of opcode
7377*/
46fc3d4c 7378void
32fc9b6a 7379Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7380{
35ff7856 7381#ifdef DEBUGGING
97aff369 7382 dVAR;
9b155405 7383 register int k;
1de06328 7384 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7385
54dc92de 7386 sv_setpvn(sv, "", 0);
03363afd 7387 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
7388 /* It would be nice to FAIL() here, but this may be called from
7389 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 7390 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 7391 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7392
3dab1dad 7393 k = PL_regkind[OP(o)];
9b155405 7394
2a782b5b 7395 if (k == EXACT) {
396482e1 7396 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7397 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7398 * is a crude hack but it may be the best for now since
7399 * we have no flag "this EXACTish node was UTF-8"
7400 * --jhi */
7401 const char * const s =
ddc5bc0f 7402 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7403 PL_colors[0], PL_colors[1],
7404 PERL_PV_ESCAPE_UNI_DETECT |
7405 PERL_PV_PRETTY_ELIPSES |
7406 PERL_PV_PRETTY_LTGT
7407 );
7408 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7409 } else if (k == TRIE) {
3dab1dad 7410 /* print the details of the trie in dumpuntil instead, as
4f639d21 7411 * prog->data isn't available here */
1de06328
YO
7412 const char op = OP(o);
7413 const I32 n = ARG(o);
7414 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7415 (reg_ac_data *)prog->data->data[n] :
7416 NULL;
7417 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7418 (reg_trie_data*)prog->data->data[n] :
7419 ac->trie;
7420
7421 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7422 DEBUG_TRIE_COMPILE_r(
7423 Perl_sv_catpvf(aTHX_ sv,
7424 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7425 (UV)trie->startstate,
7426 (IV)trie->laststate-1,
7427 (UV)trie->wordcount,
7428 (UV)trie->minlen,
7429 (UV)trie->maxlen,
7430 (UV)TRIE_CHARCOUNT(trie),
7431 (UV)trie->uniquecharcount
7432 )
7433 );
7434 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7435 int i;
7436 int rangestart = -1;
f46cb337 7437 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7438 Perl_sv_catpvf(aTHX_ sv, "[");
7439 for (i = 0; i <= 256; i++) {
7440 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7441 if (rangestart == -1)
7442 rangestart = i;
7443 } else if (rangestart != -1) {
7444 if (i <= rangestart + 3)
7445 for (; rangestart < i; rangestart++)
7446 put_byte(sv, rangestart);
7447 else {
7448 put_byte(sv, rangestart);
7449 sv_catpvs(sv, "-");
7450 put_byte(sv, i - 1);
7451 }
7452 rangestart = -1;
7453 }
7454 }
7455 Perl_sv_catpvf(aTHX_ sv, "]");
7456 }
7457
a3621e74 7458 } else if (k == CURLY) {
cb434fcc 7459 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7460 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7461 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7462 }
2c2d71f5
JH
7463 else if (k == WHILEM && o->flags) /* Ordinal/of */
7464 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6bda09f9 7465 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
894356b3 7466 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
6bda09f9
YO
7467 else if (k == RECURSE)
7468 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9b155405 7469 else if (k == LOGICAL)
04ebc1ab 7470 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7471 else if (k == ANYOF) {
7472 int i, rangestart = -1;
2d03de9c 7473 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7474
7475 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7476 static const char * const anyofs[] = {
653099ff
GS
7477 "\\w",
7478 "\\W",
7479 "\\s",
7480 "\\S",
7481 "\\d",
7482 "\\D",
7483 "[:alnum:]",
7484 "[:^alnum:]",
7485 "[:alpha:]",
7486 "[:^alpha:]",
7487 "[:ascii:]",
7488 "[:^ascii:]",
7489 "[:ctrl:]",
7490 "[:^ctrl:]",
7491 "[:graph:]",
7492 "[:^graph:]",
7493 "[:lower:]",
7494 "[:^lower:]",
7495 "[:print:]",
7496 "[:^print:]",
7497 "[:punct:]",
7498 "[:^punct:]",
7499 "[:upper:]",
aaa51d5e 7500 "[:^upper:]",
653099ff 7501 "[:xdigit:]",
aaa51d5e
JF
7502 "[:^xdigit:]",
7503 "[:space:]",
7504 "[:^space:]",
7505 "[:blank:]",
7506 "[:^blank:]"
653099ff
GS
7507 };
7508
19860706 7509 if (flags & ANYOF_LOCALE)
396482e1 7510 sv_catpvs(sv, "{loc}");
19860706 7511 if (flags & ANYOF_FOLD)
396482e1 7512 sv_catpvs(sv, "{i}");
653099ff 7513 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 7514 if (flags & ANYOF_INVERT)
396482e1 7515 sv_catpvs(sv, "^");
ffc61ed2
JH
7516 for (i = 0; i <= 256; i++) {
7517 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
7518 if (rangestart == -1)
7519 rangestart = i;
7520 } else if (rangestart != -1) {
7521 if (i <= rangestart + 3)
7522 for (; rangestart < i; rangestart++)
653099ff 7523 put_byte(sv, rangestart);
ffc61ed2
JH
7524 else {
7525 put_byte(sv, rangestart);
396482e1 7526 sv_catpvs(sv, "-");
ffc61ed2 7527 put_byte(sv, i - 1);
653099ff 7528 }
ffc61ed2 7529 rangestart = -1;
653099ff 7530 }
847a199f 7531 }
ffc61ed2
JH
7532
7533 if (o->flags & ANYOF_CLASS)
bb7a0f54 7534 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
7535 if (ANYOF_CLASS_TEST(o,i))
7536 sv_catpv(sv, anyofs[i]);
7537
7538 if (flags & ANYOF_UNICODE)
396482e1 7539 sv_catpvs(sv, "{unicode}");
1aa99e6b 7540 else if (flags & ANYOF_UNICODE_ALL)
396482e1 7541 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
7542
7543 {
7544 SV *lv;
32fc9b6a 7545 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 7546
ffc61ed2
JH
7547 if (lv) {
7548 if (sw) {
89ebb4a3 7549 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 7550
ffc61ed2 7551 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 7552 uvchr_to_utf8(s, i);
ffc61ed2 7553
3568d838 7554 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
7555 if (rangestart == -1)
7556 rangestart = i;
7557 } else if (rangestart != -1) {
ffc61ed2
JH
7558 if (i <= rangestart + 3)
7559 for (; rangestart < i; rangestart++) {
2d03de9c
AL
7560 const U8 * const e = uvchr_to_utf8(s,rangestart);
7561 U8 *p;
7562 for(p = s; p < e; p++)
ffc61ed2
JH
7563 put_byte(sv, *p);
7564 }
7565 else {
2d03de9c
AL
7566 const U8 *e = uvchr_to_utf8(s,rangestart);
7567 U8 *p;
7568 for (p = s; p < e; p++)
ffc61ed2 7569 put_byte(sv, *p);
396482e1 7570 sv_catpvs(sv, "-");
2d03de9c
AL
7571 e = uvchr_to_utf8(s, i-1);
7572 for (p = s; p < e; p++)
1df70142 7573 put_byte(sv, *p);
ffc61ed2
JH
7574 }
7575 rangestart = -1;
7576 }
19860706 7577 }
ffc61ed2 7578
396482e1 7579 sv_catpvs(sv, "..."); /* et cetera */
19860706 7580 }
fde631ed 7581
ffc61ed2 7582 {
2e0de35c 7583 char *s = savesvpv(lv);
c445ea15 7584 char * const origs = s;
b81d288d 7585
3dab1dad
YO
7586 while (*s && *s != '\n')
7587 s++;
b81d288d 7588
ffc61ed2 7589 if (*s == '\n') {
2d03de9c 7590 const char * const t = ++s;
ffc61ed2
JH
7591
7592 while (*s) {
7593 if (*s == '\n')
7594 *s = ' ';
7595 s++;
7596 }
7597 if (s[-1] == ' ')
7598 s[-1] = 0;
7599
7600 sv_catpv(sv, t);
fde631ed 7601 }
b81d288d 7602
ffc61ed2 7603 Safefree(origs);
fde631ed
JH
7604 }
7605 }
653099ff 7606 }
ffc61ed2 7607
653099ff
GS
7608 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
7609 }
9b155405 7610 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 7611 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 7612#else
96a5add6 7613 PERL_UNUSED_CONTEXT;
65e66c80
SP
7614 PERL_UNUSED_ARG(sv);
7615 PERL_UNUSED_ARG(o);
f9049ba1 7616 PERL_UNUSED_ARG(prog);
17c3b450 7617#endif /* DEBUGGING */
35ff7856 7618}
a687059c 7619
cad2e5aa
JH
7620SV *
7621Perl_re_intuit_string(pTHX_ regexp *prog)
7622{ /* Assume that RE_INTUIT is set */
97aff369 7623 dVAR;
a3621e74 7624 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
7625 PERL_UNUSED_CONTEXT;
7626
a3621e74 7627 DEBUG_COMPILE_r(
cfd0369c 7628 {
2d03de9c 7629 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 7630 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
7631
7632 if (!PL_colorset) reginitcolors();
7633 PerlIO_printf(Perl_debug_log,
a0288114 7634 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
7635 PL_colors[4],
7636 prog->check_substr ? "" : "utf8 ",
7637 PL_colors[5],PL_colors[0],
cad2e5aa
JH
7638 s,
7639 PL_colors[1],
7640 (strlen(s) > 60 ? "..." : ""));
7641 } );
7642
33b8afdf 7643 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
7644}
7645
84da74a7
YO
7646/*
7647 pregfree - free a regexp
7648
7649 See regdupe below if you change anything here.
7650*/
7651
2b69d0c2 7652void
864dbfa3 7653Perl_pregfree(pTHX_ struct regexp *r)
a687059c 7654{
27da23d5 7655 dVAR;
0df25f3d 7656
fc32ee4a 7657 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 7658
7821416a
IZ
7659 if (!r || (--r->refcnt > 0))
7660 return;
ab3bbdeb 7661 DEBUG_COMPILE_r({
0df25f3d
YO
7662 if (!PL_colorset)
7663 reginitcolors();
ab3bbdeb
YO
7664 if (RX_DEBUG(r)){
7665 SV *dsv= sv_newmortal();
7666 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
7667 dsv, r->precomp, r->prelen, 60);
7668 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
7669 PL_colors[4],PL_colors[5],s);
7670 }
9e55ce06 7671 });
cad2e5aa 7672
43c5f42d
NC
7673 /* gcov results gave these as non-null 100% of the time, so there's no
7674 optimisation in checking them before calling Safefree */
7675 Safefree(r->precomp);
7676 Safefree(r->offsets); /* 20010421 MJD */
ed252734 7677 RX_MATCH_COPY_FREE(r);
f8c7b90f 7678#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
7679 if (r->saved_copy)
7680 SvREFCNT_dec(r->saved_copy);
7681#endif
a193d654
GS
7682 if (r->substrs) {
7683 if (r->anchored_substr)
7684 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
7685 if (r->anchored_utf8)
7686 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
7687 if (r->float_substr)
7688 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
7689 if (r->float_utf8)
7690 SvREFCNT_dec(r->float_utf8);
2779dcf1 7691 Safefree(r->substrs);
a193d654 7692 }
c277df42
IZ
7693 if (r->data) {
7694 int n = r->data->count;
f3548bdc
DM
7695 PAD* new_comppad = NULL;
7696 PAD* old_comppad;
4026c95a 7697 PADOFFSET refcnt;
dfad63ad 7698
c277df42 7699 while (--n >= 0) {
261faec3 7700 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
7701 switch (r->data->what[n]) {
7702 case 's':
7703 SvREFCNT_dec((SV*)r->data->data[n]);
7704 break;
653099ff
GS
7705 case 'f':
7706 Safefree(r->data->data[n]);
7707 break;
dfad63ad
HS
7708 case 'p':
7709 new_comppad = (AV*)r->data->data[n];
7710 break;
c277df42 7711 case 'o':
dfad63ad 7712 if (new_comppad == NULL)
cea2e8a9 7713 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
7714 PAD_SAVE_LOCAL(old_comppad,
7715 /* Watch out for global destruction's random ordering. */
c445ea15 7716 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 7717 );
b34c0dd4 7718 OP_REFCNT_LOCK;
4026c95a
SH
7719 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
7720 OP_REFCNT_UNLOCK;
7721 if (!refcnt)
9b978d73 7722 op_free((OP_4tree*)r->data->data[n]);
9b978d73 7723
f3548bdc 7724 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
7725 SvREFCNT_dec((SV*)new_comppad);
7726 new_comppad = NULL;
c277df42
IZ
7727 break;
7728 case 'n':
9e55ce06 7729 break;
07be1b83 7730 case 'T':
be8e71aa
YO
7731 { /* Aho Corasick add-on structure for a trie node.
7732 Used in stclass optimization only */
07be1b83
YO
7733 U32 refcount;
7734 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
7735 OP_REFCNT_LOCK;
7736 refcount = --aho->refcount;
7737 OP_REFCNT_UNLOCK;
7738 if ( !refcount ) {
7739 Safefree(aho->states);
7740 Safefree(aho->fail);
7741 aho->trie=NULL; /* not necessary to free this as it is
7742 handled by the 't' case */
7743 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 7744 Safefree(r->regstclass);
07be1b83
YO
7745 }
7746 }
7747 break;
a3621e74 7748 case 't':
07be1b83 7749 {
be8e71aa 7750 /* trie structure. */
07be1b83
YO
7751 U32 refcount;
7752 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
7753 OP_REFCNT_LOCK;
7754 refcount = --trie->refcount;
7755 OP_REFCNT_UNLOCK;
7756 if ( !refcount ) {
7757 Safefree(trie->charmap);
7758 if (trie->widecharmap)
7759 SvREFCNT_dec((SV*)trie->widecharmap);
7760 Safefree(trie->states);
7761 Safefree(trie->trans);
7762 if (trie->bitmap)
7763 Safefree(trie->bitmap);
7764 if (trie->wordlen)
7765 Safefree(trie->wordlen);
786e8c11
YO
7766 if (trie->jump)
7767 Safefree(trie->jump);
7768 if (trie->nextword)
7769 Safefree(trie->nextword);
a3621e74 7770#ifdef DEBUGGING
be8e71aa
YO
7771 if (RX_DEBUG(r)) {
7772 if (trie->words)
7773 SvREFCNT_dec((SV*)trie->words);
7774 if (trie->revcharmap)
7775 SvREFCNT_dec((SV*)trie->revcharmap);
7776 }
a3621e74 7777#endif
07be1b83 7778 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 7779 }
07be1b83
YO
7780 }
7781 break;
c277df42 7782 default:
830247a4 7783 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
7784 }
7785 }
7786 Safefree(r->data->what);
7787 Safefree(r->data);
a0d0e21e
LW
7788 }
7789 Safefree(r->startp);
7790 Safefree(r->endp);
7791 Safefree(r);
a687059c 7792}
c277df42 7793
84da74a7
YO
7794#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
7795#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
7796#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
7797
7798/*
7799 regdupe - duplicate a regexp.
7800
7801 This routine is called by sv.c's re_dup and is expected to clone a
7802 given regexp structure. It is a no-op when not under USE_ITHREADS.
7803 (Originally this *was* re_dup() for change history see sv.c)
7804
7805 See pregfree() above if you change anything here.
7806*/
a3c0e9ca 7807#if defined(USE_ITHREADS)
84da74a7
YO
7808regexp *
7809Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
7810{
84da74a7
YO
7811 dVAR;
7812 REGEXP *ret;
7813 int i, len, npar;
7814 struct reg_substr_datum *s;
7815
7816 if (!r)
7817 return (REGEXP *)NULL;
7818
7819 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
7820 return ret;
7821
7822 len = r->offsets[0];
7823 npar = r->nparens+1;
7824
7825 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
7826 Copy(r->program, ret->program, len+1, regnode);
7827
7828 Newx(ret->startp, npar, I32);
7829 Copy(r->startp, ret->startp, npar, I32);
7830 Newx(ret->endp, npar, I32);
7831 Copy(r->startp, ret->startp, npar, I32);
7832
7833 Newx(ret->substrs, 1, struct reg_substr_data);
7834 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
7835 s->min_offset = r->substrs->data[i].min_offset;
7836 s->max_offset = r->substrs->data[i].max_offset;
7837 s->end_shift = r->substrs->data[i].end_shift;
7838 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
7839 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
7840 }
7841
7842 ret->regstclass = NULL;
7843 if (r->data) {
7844 struct reg_data *d;
7845 const int count = r->data->count;
7846 int i;
7847
7848 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
7849 char, struct reg_data);
7850 Newx(d->what, count, U8);
7851
7852 d->count = count;
7853 for (i = 0; i < count; i++) {
7854 d->what[i] = r->data->what[i];
7855 switch (d->what[i]) {
7856 /* legal options are one of: sfpont
7857 see also regcomp.h and pregfree() */
7858 case 's':
7859 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
7860 break;
7861 case 'p':
7862 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
7863 break;
7864 case 'f':
7865 /* This is cheating. */
7866 Newx(d->data[i], 1, struct regnode_charclass_class);
7867 StructCopy(r->data->data[i], d->data[i],
7868 struct regnode_charclass_class);
7869 ret->regstclass = (regnode*)d->data[i];
7870 break;
7871 case 'o':
7872 /* Compiled op trees are readonly, and can thus be
7873 shared without duplication. */
7874 OP_REFCNT_LOCK;
7875 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
7876 OP_REFCNT_UNLOCK;
7877 break;
7878 case 'n':
7879 d->data[i] = r->data->data[i];
7880 break;
7881 case 't':
7882 d->data[i] = r->data->data[i];
7883 OP_REFCNT_LOCK;
7884 ((reg_trie_data*)d->data[i])->refcount++;
7885 OP_REFCNT_UNLOCK;
7886 break;
7887 case 'T':
7888 d->data[i] = r->data->data[i];
7889 OP_REFCNT_LOCK;
7890 ((reg_ac_data*)d->data[i])->refcount++;
7891 OP_REFCNT_UNLOCK;
7892 /* Trie stclasses are readonly and can thus be shared
7893 * without duplication. We free the stclass in pregfree
7894 * when the corresponding reg_ac_data struct is freed.
7895 */
7896 ret->regstclass= r->regstclass;
7897 break;
7898 default:
7899 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
7900 }
7901 }
7902
7903 ret->data = d;
7904 }
7905 else
7906 ret->data = NULL;
7907
7908 Newx(ret->offsets, 2*len+1, U32);
7909 Copy(r->offsets, ret->offsets, 2*len+1, U32);
7910
7911 ret->precomp = SAVEPVN(r->precomp, r->prelen);
7912 ret->refcnt = r->refcnt;
7913 ret->minlen = r->minlen;
7914 ret->prelen = r->prelen;
7915 ret->nparens = r->nparens;
7916 ret->lastparen = r->lastparen;
7917 ret->lastcloseparen = r->lastcloseparen;
7918 ret->reganch = r->reganch;
7919
7920 ret->sublen = r->sublen;
7921
f9f4320a
YO
7922 ret->engine = r->engine;
7923
84da74a7
YO
7924 if (RX_MATCH_COPIED(ret))
7925 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
7926 else
7927 ret->subbeg = NULL;
7928#ifdef PERL_OLD_COPY_ON_WRITE
7929 ret->saved_copy = NULL;
7930#endif
7931
7932 ptr_table_store(PL_ptr_table, r, ret);
7933 return ret;
84da74a7 7934}
a3c0e9ca 7935#endif
84da74a7 7936
76234dfb 7937#ifndef PERL_IN_XSUB_RE
c277df42
IZ
7938/*
7939 - regnext - dig the "next" pointer out of a node
c277df42
IZ
7940 */
7941regnode *
864dbfa3 7942Perl_regnext(pTHX_ register regnode *p)
c277df42 7943{
97aff369 7944 dVAR;
c277df42
IZ
7945 register I32 offset;
7946
3280af22 7947 if (p == &PL_regdummy)
c277df42
IZ
7948 return(NULL);
7949
7950 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
7951 if (offset == 0)
7952 return(NULL);
7953
c277df42 7954 return(p+offset);
c277df42 7955}
76234dfb 7956#endif
c277df42 7957
01f988be 7958STATIC void
cea2e8a9 7959S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
7960{
7961 va_list args;
7962 STRLEN l1 = strlen(pat1);
7963 STRLEN l2 = strlen(pat2);
7964 char buf[512];
06bf62c7 7965 SV *msv;
73d840c0 7966 const char *message;
c277df42
IZ
7967
7968 if (l1 > 510)
7969 l1 = 510;
7970 if (l1 + l2 > 510)
7971 l2 = 510 - l1;
7972 Copy(pat1, buf, l1 , char);
7973 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
7974 buf[l1 + l2] = '\n';
7975 buf[l1 + l2 + 1] = '\0';
8736538c
AS
7976#ifdef I_STDARG
7977 /* ANSI variant takes additional second argument */
c277df42 7978 va_start(args, pat2);
8736538c
AS
7979#else
7980 va_start(args);
7981#endif
5a844595 7982 msv = vmess(buf, &args);
c277df42 7983 va_end(args);
cfd0369c 7984 message = SvPV_const(msv,l1);
c277df42
IZ
7985 if (l1 > 512)
7986 l1 = 512;
7987 Copy(message, buf, l1 , char);
197cf9b9 7988 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 7989 Perl_croak(aTHX_ "%s", buf);
c277df42 7990}
a0ed51b3
LW
7991
7992/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
7993
76234dfb 7994#ifndef PERL_IN_XSUB_RE
a0ed51b3 7995void
864dbfa3 7996Perl_save_re_context(pTHX)
b81d288d 7997{
97aff369 7998 dVAR;
1ade1aa1
NC
7999
8000 struct re_save_state *state;
8001
8002 SAVEVPTR(PL_curcop);
8003 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8004
8005 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8006 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8007 SSPUSHINT(SAVEt_RE_STATE);
8008
46ab3289 8009 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8010
a0ed51b3 8011 PL_reg_start_tmp = 0;
a0ed51b3 8012 PL_reg_start_tmpl = 0;
c445ea15 8013 PL_reg_oldsaved = NULL;
a5db57d6 8014 PL_reg_oldsavedlen = 0;
a5db57d6 8015 PL_reg_maxiter = 0;
a5db57d6 8016 PL_reg_leftiter = 0;
c445ea15 8017 PL_reg_poscache = NULL;
a5db57d6 8018 PL_reg_poscache_size = 0;
1ade1aa1
NC
8019#ifdef PERL_OLD_COPY_ON_WRITE
8020 PL_nrs = NULL;
8021#endif
ada6e8a9 8022
c445ea15
AL
8023 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8024 if (PL_curpm) {
8025 const REGEXP * const rx = PM_GETRE(PL_curpm);
8026 if (rx) {
1df70142 8027 U32 i;
ada6e8a9 8028 for (i = 1; i <= rx->nparens; i++) {
1df70142 8029 char digits[TYPE_CHARS(long)];
d9fad198 8030 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8031 GV *const *const gvp
8032 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8033
b37c2d43
AL
8034 if (gvp) {
8035 GV * const gv = *gvp;
8036 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8037 save_scalar(gv);
49f27e4b 8038 }
ada6e8a9
AMS
8039 }
8040 }
8041 }
a0ed51b3 8042}
76234dfb 8043#endif
51371543 8044
51371543 8045static void
acfe0abc 8046clear_re(pTHX_ void *r)
51371543 8047{
97aff369 8048 dVAR;
51371543
GS
8049 ReREFCNT_dec((regexp *)r);
8050}
ffbc6a93 8051
a28509cc
AL
8052#ifdef DEBUGGING
8053
8054STATIC void
8055S_put_byte(pTHX_ SV *sv, int c)
8056{
8057 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8058 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8059 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8060 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8061 else
8062 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8063}
8064
786e8c11 8065
3dab1dad
YO
8066#define CLEAR_OPTSTART \
8067 if (optstart) STMT_START { \
07be1b83 8068 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8069 optstart=NULL; \
8070 } STMT_END
8071
786e8c11 8072#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8073
b5a2f8d8
NC
8074STATIC const regnode *
8075S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8076 const regnode *last, const regnode *plast,
8077 SV* sv, I32 indent, U32 depth)
a28509cc 8078{
97aff369 8079 dVAR;
786e8c11 8080 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8081 register const regnode *next;
3dab1dad
YO
8082 const regnode *optstart= NULL;
8083 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8084
786e8c11
YO
8085#ifdef DEBUG_DUMPUNTIL
8086 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8087 last ? last-start : 0,plast ? plast-start : 0);
8088#endif
8089
8090 if (plast && plast < last)
8091 last= plast;
8092
8093 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8094 /* While that wasn't END last time... */
8095
8096 NODE_ALIGN(node);
8097 op = OP(node);
8098 if (op == CLOSE)
786e8c11 8099 indent--;
b5a2f8d8 8100 next = regnext((regnode *)node);
07be1b83 8101
a28509cc 8102 /* Where, what. */
8e11feef 8103 if (OP(node) == OPTIMIZED) {
e68ec53f 8104 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8105 optstart = node;
3dab1dad 8106 else
8e11feef 8107 goto after_print;
3dab1dad
YO
8108 } else
8109 CLEAR_OPTSTART;
07be1b83 8110
32fc9b6a 8111 regprop(r, sv, node);
a28509cc 8112 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8113 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8114
8115 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8116 if (next == NULL) /* Next ptr. */
8117 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8118 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8119 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8120 else
8121 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8122
1de06328 8123 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8124 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8125 }
8126
a28509cc
AL
8127 after_print:
8128 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8129 assert(next);
8130 {
8131 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8132 ? regnext((regnode *)next)
8133 : next);
be8e71aa
YO
8134 if (last && nnode > last)
8135 nnode = last;
786e8c11 8136 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8137 }
a28509cc
AL
8138 }
8139 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8140 assert(next);
786e8c11 8141 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8142 }
8143 else if ( PL_regkind[(U8)op] == TRIE ) {
1de06328 8144 const char op = OP(node);
a28509cc 8145 const I32 n = ARG(node);
1de06328
YO
8146 const reg_ac_data * const ac = op>=AHOCORASICK ?
8147 (reg_ac_data *)r->data->data[n] :
8148 NULL;
8149 const reg_trie_data * const trie = op<AHOCORASICK ?
8150 (reg_trie_data*)r->data->data[n] :
8151 ac->trie;
786e8c11 8152 const regnode *nextbranch= NULL;
a28509cc 8153 I32 word_idx;
1de06328 8154 sv_setpvn(sv, "", 0);
786e8c11 8155 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8156 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8157
8158 PerlIO_printf(Perl_debug_log, "%*s%s ",
8159 (int)(2*(indent+3)), "",
8160 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8161 PL_colors[0], PL_colors[1],
8162 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8163 PERL_PV_PRETTY_ELIPSES |
8164 PERL_PV_PRETTY_LTGT
786e8c11
YO
8165 )
8166 : "???"
8167 );
8168 if (trie->jump) {
8169 U16 dist= trie->jump[word_idx+1];
8170 PerlIO_printf(Perl_debug_log, "(%u)\n",(next - dist) - start);
8171 if (dist) {
8172 if (!nextbranch)
8173 nextbranch= next - trie->jump[0];
8174 DUMPUNTIL(next - dist, nextbranch);
8175 }
8176 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8177 nextbranch= regnext((regnode *)nextbranch);
8178 } else {
8179 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8180 }
786e8c11
YO
8181 }
8182 if (last && next > last)
8183 node= last;
8184 else
8185 node= next;
a28509cc 8186 }
786e8c11
YO
8187 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8188 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8189 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8190 }
8191 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8192 assert(next);
786e8c11 8193 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8194 }
8195 else if ( op == PLUS || op == STAR) {
786e8c11 8196 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8197 }
8198 else if (op == ANYOF) {
8199 /* arglen 1 + class block */
8200 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8201 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8202 node = NEXTOPER(node);
8203 }
8204 else if (PL_regkind[(U8)op] == EXACT) {
8205 /* Literal string, where present. */
8206 node += NODE_SZ_STR(node) - 1;
8207 node = NEXTOPER(node);
8208 }
8209 else {
8210 node = NEXTOPER(node);
8211 node += regarglen[(U8)op];
8212 }
8213 if (op == CURLYX || op == OPEN)
786e8c11 8214 indent++;
a28509cc 8215 else if (op == WHILEM)
786e8c11 8216 indent--;
a28509cc 8217 }
3dab1dad 8218 CLEAR_OPTSTART;
786e8c11
YO
8219#ifdef DEBUG_DUMPUNTIL
8220 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8221#endif
1de06328 8222 return node;
a28509cc
AL
8223}
8224
8225#endif /* DEBUGGING */
8226
241d1a3b
NC
8227/*
8228 * Local variables:
8229 * c-indentation-style: bsd
8230 * c-basic-offset: 4
8231 * indent-tabs-mode: t
8232 * End:
8233 *
37442d52
RGS
8234 * ex: set ts=8 sts=4 sw=4 noet:
8235 */