This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[perl #40718] perl parser bug leading to memory corruption
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
e2e6a0f1 117 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
118 I32 extralen;
119 I32 seen_zerolen;
120 I32 seen_evals;
40d049e4
YO
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
1aa99e6b 124 I32 utf8;
6bda09f9 125 HV *charnames; /* cache of named sequences */
81714fb9 126 HV *paren_names; /* Paren names */
40d049e4
YO
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
3dab1dad 133#ifdef DEBUGGING
be8e71aa 134 const char *lastparse;
3dab1dad
YO
135 I32 lastnum;
136#define RExC_lastparse (pRExC_state->lastparse)
137#define RExC_lastnum (pRExC_state->lastnum)
138#endif
830247a4
IZ
139} RExC_state_t;
140
e2509266 141#define RExC_flags (pRExC_state->flags)
830247a4
IZ
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
fac92740 144#define RExC_start (pRExC_state->start)
830247a4
IZ
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 149#define RExC_emit (pRExC_state->emit)
fac92740 150#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
e2e6a0f1 156#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
157#define RExC_extralen (pRExC_state->extralen)
158#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 160#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 161#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
162#define RExC_open_parens (pRExC_state->open_parens)
163#define RExC_close_parens (pRExC_state->close_parens)
164#define RExC_opend (pRExC_state->opend)
81714fb9 165#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
166#define RExC_recurse (pRExC_state->recurse)
167#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 168
a687059c
LW
169#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
a687059c 172
35c8bce7
LW
173#ifdef SPSTART
174#undef SPSTART /* dratted cpp namespace... */
175#endif
a687059c
LW
176/*
177 * Flags to be passed up and down.
178 */
a687059c 179#define WORST 0 /* Worst case. */
821b33a5 180#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
181#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182#define SPSTART 0x4 /* Starts with * or +. */
183#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 184
3dab1dad
YO
185#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
186
07be1b83
YO
187/* whether trie related optimizations are enabled */
188#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189#define TRIE_STUDY_OPT
786e8c11 190#define FULL_TRIE_STUDY
07be1b83
YO
191#define TRIE_STCLASS
192#endif
1de06328
YO
193
194
40d049e4
YO
195
196#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197#define PBITVAL(paren) (1 << ((paren) & 7))
198#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
201
202
1de06328
YO
203/* About scan_data_t.
204
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
211 in the pattern:
212
213 /FOO[xX]A.*B[xX]BAR/
214
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
218
219 The strings can be composites, for instance
220
221 /(f)(o)(o)/
222
223 will result in a composite fixed substring 'foo'.
224
225 For each string some basic information is maintained:
226
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
233 found.
234
235 - max_offset
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
239
240 - minlenp
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
244 involved. Consider
245
246 /(?=FOO).*F/
247
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
258
259 - lookbehind
260
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
271 associated string.
272
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
275
276*/
2c2d71f5
JH
277
278typedef struct scan_data_t {
1de06328
YO
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
2c2d71f5
JH
281 I32 pos_min;
282 I32 pos_delta;
283 SV *last_found;
1de06328 284 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
285 I32 last_start_min;
286 I32 last_start_max;
1de06328
YO
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
297 I32 flags;
298 I32 whilem_c;
cb434fcc 299 I32 *last_closep;
653099ff 300 struct regnode_charclass_class *start_class;
2c2d71f5
JH
301} scan_data_t;
302
a687059c 303/*
e50aee73 304 * Forward declarations for pregcomp()'s friends.
a687059c 305 */
a0d0e21e 306
27da23d5 307static const scan_data_t zero_scan_data =
1de06328 308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
309
310#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
311#define SF_BEFORE_SEOL 0x0001
312#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
313#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
315
09b7f37c
CB
316#ifdef NO_UNARY_PLUS
317# define SF_FIX_SHIFT_EOL (0+2)
318# define SF_FL_SHIFT_EOL (0+4)
319#else
320# define SF_FIX_SHIFT_EOL (+2)
321# define SF_FL_SHIFT_EOL (+4)
322#endif
c277df42
IZ
323
324#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
326
327#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
329#define SF_IS_INF 0x0040
330#define SF_HAS_PAR 0x0080
331#define SF_IN_PAR 0x0100
332#define SF_HAS_EVAL 0x0200
333#define SCF_DO_SUBSTR 0x0400
653099ff
GS
334#define SCF_DO_STCLASS_AND 0x0800
335#define SCF_DO_STCLASS_OR 0x1000
336#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 337#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 338
786e8c11 339#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 340#define SCF_SEEN_ACCEPT 0x8000
07be1b83 341
eb160463 342#define UTF (RExC_utf8 != 0)
e2509266
JH
343#define LOC ((RExC_flags & PMf_LOCALE) != 0)
344#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 345
ffc61ed2 346#define OOB_UNICODE 12345678
93733859 347#define OOB_NAMEDCLASS -1
b8c5462f 348
a0ed51b3
LW
349#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
351
8615cb43 352
b45f050a
JF
353/* length of regex to show in messages that don't mark a position within */
354#define RegexLengthToShowInErrorMessages 127
355
356/*
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
360 */
7253e4e3
RK
361#define MARKER1 "<-- HERE" /* marker as it appears in the description */
362#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 363
7253e4e3 364#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
365
366/*
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
369 * "...".
370 */
ccb2c380 371#define FAIL(msg) STMT_START { \
bfed75c6 372 const char *ellipses = ""; \
ccb2c380
MP
373 IV len = RExC_end - RExC_precomp; \
374 \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
380 ellipses = "..."; \
381 } \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
384} STMT_END
8615cb43 385
b45f050a 386/*
b45f050a
JF
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
388 */
ccb2c380 389#define Simple_vFAIL(m) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
397 */
ccb2c380
MP
398#define vFAIL(m) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL(m); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Like Simple_vFAIL(), but accepts two arguments.
406 */
ccb2c380 407#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 408 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
415 */
ccb2c380
MP
416#define vFAIL2(m,a1) STMT_START { \
417 if (!SIZE_ONLY) \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
420} STMT_END
b45f050a
JF
421
422
423/*
424 * Like Simple_vFAIL(), but accepts three arguments.
425 */
ccb2c380 426#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
434 */
ccb2c380
MP
435#define vFAIL3(m,a1,a2) STMT_START { \
436 if (!SIZE_ONLY) \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts four arguments.
443 */
ccb2c380 444#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a 449
ccb2c380 450#define vWARN(loc,m) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARNdep(loc,m) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
462
463
464#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
9d1d55b5 487
8615cb43 488
cd439c50 489/* Allow for side effects in s */
ccb2c380
MP
490#define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
492} STMT_END
cd439c50 493
fac92740
MJD
494/* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
07be1b83 498 * Position is 1 indexed.
fac92740
MJD
499 */
500
ccb2c380
MP
501#define Set_Node_Offset_To_R(node,byte) STMT_START { \
502 if (! SIZE_ONLY) { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 504 __LINE__, (node), (int)(byte))); \
ccb2c380 505 if((node) < 0) { \
551405c4 506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
507 } else { \
508 RExC_offsets[2*(node)-1] = (byte); \
509 } \
510 } \
511} STMT_END
512
513#define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
516
517#define Set_Node_Length_To_R(node,len) STMT_START { \
518 if (! SIZE_ONLY) { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 520 __LINE__, (int)(node), (int)(len))); \
ccb2c380 521 if((node) < 0) { \
551405c4 522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
523 } else { \
524 RExC_offsets[2*(node)] = (len); \
525 } \
526 } \
527} STMT_END
528
529#define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532#define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
534
535/* Get offsets and lengths */
536#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
538
07be1b83
YO
539#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
542} STMT_END
543
544
545#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546#define EXPERIMENTAL_INPLACESCAN
547#endif
548
1de06328 549#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 550DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
560 ); \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
578 ); \
579 PerlIO_printf(Perl_debug_log,"\n"); \
580});
581
acfe0abc 582static void clear_re(pTHX_ void *r);
4327152a 583
653099ff 584/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 585 Update the longest found anchored substring and the longest found
653099ff
GS
586 floating substrings if needed. */
587
4327152a 588STATIC void
1de06328 589S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 590{
e1ec3a88
AL
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 593 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 594
c277df42 595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 596 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
b81d288d 600 data->flags
c277df42
IZ
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
602 else
603 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
a0ed51b3
LW
606 }
607 else {
c277df42 608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
609 data->offset_float_max = (l
610 ? data->last_start_max
c277df42 611 : data->pos_min + data->pos_delta);
9051bda5
HS
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
c277df42 614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
c277df42
IZ
621 }
622 }
623 SvCUR_set(data->last_found, 0);
0eda9292 624 {
a28509cc 625 SV * const sv = data->last_found;
097eb12c
AL
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
628 if (mg)
629 mg->mg_len = 0;
630 }
0eda9292 631 }
c277df42
IZ
632 data->last_end = -1;
633 data->flags &= ~SF_BEFORE_EOL;
1de06328 634 DEBUG_STUDYDATA(data,0);
c277df42
IZ
635}
636
653099ff
GS
637/* Can match anything (initialization) */
638STATIC void
097eb12c 639S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 640{
653099ff 641 ANYOF_CLASS_ZERO(cl);
f8bef550 642 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
644 if (LOC)
645 cl->flags |= ANYOF_LOCALE;
646}
647
648/* Can match anything (initialization) */
649STATIC int
5f66b61c 650S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
651{
652 int value;
653
aaa51d5e 654 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
656 return 1;
1aa99e6b
IH
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
658 return 0;
10edeb5d 659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 660 return 0;
653099ff
GS
661 return 1;
662}
663
664/* Can match anything (initialization) */
665STATIC void
097eb12c 666S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 667{
8ecf7187 668 Zero(cl, 1, struct regnode_charclass_class);
653099ff 669 cl->type = ANYOF;
830247a4 670 cl_anything(pRExC_state, cl);
653099ff
GS
671}
672
673STATIC void
097eb12c 674S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 675{
8ecf7187 676 Zero(cl, 1, struct regnode_charclass_class);
653099ff 677 cl->type = ANYOF;
830247a4 678 cl_anything(pRExC_state, cl);
653099ff
GS
679 if (LOC)
680 cl->flags |= ANYOF_LOCALE;
681}
682
683/* 'And' a given class with another one. Can create false positives */
684/* We assume that cl is not inverted */
685STATIC void
5f66b61c 686S_cl_and(struct regnode_charclass_class *cl,
a28509cc 687 const struct regnode_charclass_class *and_with)
653099ff 688{
40d049e4
YO
689
690 assert(and_with->type == ANYOF);
653099ff
GS
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
696 int i;
697
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
701 else
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
1aa99e6b 707
14ebb1a2
JH
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
713 }
14ebb1a2
JH
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 716 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 719 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
720}
721
722/* 'OR' a given class with another one. Can create false positives */
723/* We assume that cl is not inverted */
724STATIC void
097eb12c 725S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 726{
653099ff
GS
727 if (or_with->flags & ANYOF_INVERT) {
728 /* We do not use
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
736 */
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
740 int i;
741
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
745 else {
830247a4 746 cl_anything(pRExC_state, cl);
653099ff
GS
747 }
748 } else {
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 751 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
752 || (cl->flags & ANYOF_FOLD)) ) {
753 int i;
754
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
762 }
763 }
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 765 cl_anything(pRExC_state, cl);
653099ff
GS
766 }
767 }
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
770
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
775 }
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
779 }
653099ff
GS
780}
781
a3621e74
YO
782#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
786
3dab1dad
YO
787
788#ifdef DEBUGGING
07be1b83 789/*
3dab1dad
YO
790 dump_trie(trie)
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
793
794 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
798
3dab1dad
YO
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
07be1b83
YO
801
802*/
3dab1dad
YO
803
804/*
805 dump_trie(trie)
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
808*/
809
810STATIC void
811S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
812{
813 U32 state;
ab3bbdeb
YO
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
816 GET_RE_DEBUG_FLAGS_DECL;
817
ab3bbdeb 818
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
822
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 825 if ( tmp ) {
ab3bbdeb
YO
826 PerlIO_printf( Perl_debug_log, "%*s",
827 colwidth,
ddc5bc0f 828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
832 )
833 );
3dab1dad
YO
834 }
835 }
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
838
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
841 PerlIO_printf( Perl_debug_log, "\n");
842
1e2e3d02 843 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 844 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
845
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
847
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
850 } else {
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
852 }
853
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
855
856 if ( base ) {
857 U32 ofs = 0;
858
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
862 ofs++;
863
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
865
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
870 {
ab3bbdeb
YO
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
872 colwidth,
3dab1dad
YO
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
874 } else {
ab3bbdeb 875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
876 }
877 }
878
879 PerlIO_printf( Perl_debug_log, "]");
880
881 }
882 PerlIO_printf( Perl_debug_log, "\n" );
883 }
884}
885/*
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
891*/
892STATIC void
893S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
894{
895 U32 state;
ab3bbdeb
YO
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
ab3bbdeb
YO
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
3dab1dad
YO
903
904 for( state=1 ; state < next_alloc ; state ++ ) {
905 U16 charid;
906
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
911 } else {
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
914 );
915 }
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
918 if ( tmp ) {
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
920 colwidth,
ddc5bc0f 921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
925 ) ,
1e2e3d02
YO
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
928 );
929 if (!(charid % 10))
664e119d
RGS
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
1e2e3d02 932 }
ab3bbdeb
YO
933 }
934 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
935 }
936}
937
938/*
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
944*/
945STATIC void
946S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
947{
948 U32 state;
949 U16 charid;
ab3bbdeb
YO
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
952 GET_RE_DEBUG_FLAGS_DECL;
953
954 /*
955 print out the table precompression so that we can do a visual check
956 that they are identical.
957 */
958
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 963 if ( tmp ) {
ab3bbdeb
YO
964 PerlIO_printf( Perl_debug_log, "%*s",
965 colwidth,
ddc5bc0f 966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
970 )
971 );
3dab1dad
YO
972 }
973 }
974
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
976
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
979 }
980
981 PerlIO_printf( Perl_debug_log, "\n" );
982
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
984
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
988
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
991 if (v)
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
993 else
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
995 }
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
998 } else {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 }
1002 }
07be1b83 1003}
3dab1dad
YO
1004
1005#endif
1006
786e8c11
YO
1007/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
3dab1dad 1017
786e8c11 1018Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1019
786e8c11
YO
1020A trie is an N'ary tree where the branches are determined by digital
1021decomposition of the key. IE, at the root node you look up the 1st character and
1022follow that branch repeat until you find the end of the branches. Nodes can be
1023marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1024
786e8c11 1025 /he|she|his|hers/
72f13be8 1026
786e8c11
YO
1027would convert into the following structure. Numbers represent states, letters
1028following numbers represent valid transitions on the letter from that state, if
1029the number is in square brackets it represents an accepting state, otherwise it
1030will be in parenthesis.
07be1b83 1031
786e8c11
YO
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1033 | |
1034 | (2)
1035 | |
1036 (1) +-i->(6)-+-s->[7]
1037 |
1038 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1039
786e8c11
YO
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1041
1042This shows that when matching against the string 'hers' we will begin at state 1
1043read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046single traverse. We store a mapping from accepting to state to which word was
1047matched, and then when we have multiple possibilities we try to complete the
1048rest of the regex in the order in which they occured in the alternation.
1049
1050The only prior NFA like behaviour that would be changed by the TRIE support is
1051the silent ignoring of duplicate alternations which are of the form:
1052
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1054
1055Thus EVAL blocks follwing a trie may be called a different number of times with
1056and without the optimisation. With the optimisations dupes will be silently
1057ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058the following demonstrates:
1059
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1061
1062which prints out 'word' three times, but
1063
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1065
1066which doesnt print it out at all. This is due to other optimisations kicking in.
1067
1068Example of what happens on a structural level:
1069
1070The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1071
1072 1: CURLYM[1] {1,32767}(18)
1073 5: BRANCH(8)
1074 6: EXACT <ac>(16)
1075 8: BRANCH(11)
1076 9: EXACT <ad>(16)
1077 11: BRANCH(14)
1078 12: EXACT <ab>(16)
1079 16: SUCCEED(0)
1080 17: NOTHING(18)
1081 18: END(0)
1082
1083This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084and should turn into:
1085
1086 1: CURLYM[1] {1,32767}(18)
1087 5: TRIE(16)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1089 <ac>
1090 <ad>
1091 <ab>
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096Cases where tail != last would be like /(?foo|bar)baz/:
1097
1098 1: BRANCH(4)
1099 2: EXACT <foo>(8)
1100 4: BRANCH(7)
1101 5: EXACT <bar>(8)
1102 7: TAIL(8)
1103 8: EXACT <baz>(10)
1104 10: END(0)
1105
1106which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107and would end up looking like:
1108
1109 1: TRIE(8)
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1111 <foo>
1112 <bar>
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1118
1119is the recommended Unicode-aware way of saying
1120
1121 *(d++) = uv;
1122*/
1123
1e2e3d02 1124#define TRIE_STORE_REVCHAR \
786e8c11 1125 STMT_START { \
1e2e3d02 1126 SV *tmp = newSVpvs(""); \
786e8c11 1127 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1130 } STMT_END
1131
1132#define TRIE_READ_CHAR STMT_START { \
1133 wordlen++; \
1134 if ( UTF ) { \
1135 if ( folder ) { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1138 foldlen -= len; \
1139 scan += len; \
1140 len = 0; \
1141 } else { \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1146 } \
1147 } else { \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1149 } \
1150 } else { \
1151 uvc = (U32)*uc; \
1152 len = 1; \
1153 } \
1154} STMT_END
1155
1156
1157
1158#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1162 } \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1166} STMT_END
07be1b83 1167
786e8c11
YO
1168#define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1173} STMT_END
07be1b83 1174
786e8c11
YO
1175#define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1178 \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1181 DEBUG_r({ \
1182 /* store the word for dumping */ \
1183 SV* tmp; \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1186 else \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1190 }); \
1191 \
1192 curword++; \
1193 \
1194 if ( noper_next < tail ) { \
1195 if (!trie->jump) \
1196 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1197 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1198 if (!jumper) \
1199 jumper = noper_next; \
1200 if (!nextbranch) \
1201 nextbranch= regnext(cur); \
1202 } \
1203 \
1204 if ( dupe ) { \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1214 } else { \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1217 } \
1218} STMT_END
07be1b83 1219
3dab1dad 1220
786e8c11
YO
1221#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1228 )
3dab1dad 1229
786e8c11
YO
1230#define MADE_TRIE 1
1231#define MADE_JUMP_TRIE 2
1232#define MADE_EXACT_TRIE 4
3dab1dad 1233
a3621e74 1234STATIC I32
786e8c11 1235S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1236{
27da23d5 1237 dVAR;
a3621e74
YO
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1240 regnode *cur;
9f7f3913 1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1242 STRLEN len = 0;
1243 UV uvc = 0;
1244 U16 curword = 0;
1245 U32 next_alloc = 0;
786e8c11
YO
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
7f69552c 1248 regnode *convert = NULL;
a3621e74 1249 /* we just use folder as a flag in utf8 */
e1ec3a88 1250 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1251 ? PL_fold
1252 : ( flags == EXACTFL
1253 ? PL_fold_locale
1254 : NULL
1255 )
1256 );
1257
e1ec3a88 1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1259 SV *re_trie_maxbuff;
3dab1dad
YO
1260#ifndef DEBUGGING
1261 /* these are only used during construction but are useful during
8e11feef 1262 * debugging so we store them in the struct when debugging.
8e11feef 1263 */
3dab1dad 1264 STRLEN trie_charcount=0;
3dab1dad
YO
1265 AV *trie_revcharmap;
1266#endif
a3621e74 1267 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1268#ifndef DEBUGGING
1269 PERL_UNUSED_ARG(depth);
1270#endif
a3621e74 1271
a02a5408 1272 Newxz( trie, 1, reg_trie_data );
a3621e74 1273 trie->refcount = 1;
3dab1dad 1274 trie->startstate = 1;
786e8c11 1275 trie->wordcount = word_count;
a3621e74 1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1277 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1280 DEBUG_r({
1281 trie->words = newAV();
a3621e74 1282 });
3dab1dad 1283 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1284
0111c4fd 1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1286 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1288 }
3dab1dad
YO
1289 DEBUG_OPTIMISE_r({
1290 PerlIO_printf( Perl_debug_log,
786e8c11 1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1295 (int)depth);
3dab1dad 1296 });
7f69552c
YO
1297
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1301 convert = first;
1302 } else {
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1305 }
1306
a3621e74
YO
1307 /* -- First loop and Setup --
1308
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1312 have unique chars.
1313
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1317 coded index.
1318
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1327
1328 */
1329
a3621e74 1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1331 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1332 const U8 *uc = (U8*)STRING( noper );
a28509cc 1333 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1334 STRLEN foldlen = 0;
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1336 const U8 *scan = (U8*)NULL;
07be1b83 1337 U32 wordlen = 0; /* required init */
3dab1dad 1338 STRLEN chars=0;
a3621e74 1339
3dab1dad
YO
1340 if (OP(noper) == NOTHING) {
1341 trie->minlen= 0;
1342 continue;
1343 }
1344 if (trie->bitmap) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1347 }
a3621e74 1348 for ( ; uc < e ; uc += len ) {
3dab1dad 1349 TRIE_CHARCOUNT(trie)++;
a3621e74 1350 TRIE_READ_CHAR;
3dab1dad 1351 chars++;
a3621e74
YO
1352 if ( uvc < 256 ) {
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1355 if ( folder )
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1357 TRIE_STORE_REVCHAR;
a3621e74
YO
1358 }
1359 } else {
1360 SV** svpp;
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1363
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1365
1366 if ( !svpp )
e4584336 1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1368
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1371 TRIE_STORE_REVCHAR;
a3621e74
YO
1372 }
1373 }
1374 }
3dab1dad
YO
1375 if( cur == first ) {
1376 trie->minlen=chars;
1377 trie->maxlen=chars;
1378 } else if (chars < trie->minlen) {
1379 trie->minlen=chars;
1380 } else if (chars > trie->maxlen) {
1381 trie->maxlen=chars;
1382 }
1383
a3621e74
YO
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
85c3142d 1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1391 );
786e8c11 1392 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1393
1394 /*
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1399 conservative but potentially much slower representation using an array
1400 of lists.
1401
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1408
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1412
1413 */
1414
1415
3dab1dad 1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1417 /*
1418 Second Pass -- Array Of Lists Representation
1419
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1423
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
a3621e74
YO
1427 */
1428
a3621e74
YO
1429 STRLEN transcount = 1;
1430
1e2e3d02
YO
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1434
3dab1dad 1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1436 TRIE_LIST_NEW(1);
1437 next_alloc = 2;
1438
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1440
c445ea15
AL
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
07be1b83 1448 U32 wordlen = 0; /* required init */
c445ea15
AL
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1450
3dab1dad 1451 if (OP(noper) != NOTHING) {
786e8c11 1452 for ( ; uc < e ; uc += len ) {
c445ea15 1453
786e8c11 1454 TRIE_READ_CHAR;
c445ea15 1455
786e8c11
YO
1456 if ( uvc < 256 ) {
1457 charid = trie->charmap[ uvc ];
c445ea15 1458 } else {
786e8c11
YO
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1460 if ( !svpp ) {
1461 charid = 0;
1462 } else {
1463 charid=(U16)SvIV( *svpp );
1464 }
c445ea15 1465 }
786e8c11
YO
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1467 if ( charid ) {
a3621e74 1468
786e8c11
YO
1469 U16 check;
1470 U32 newstate = 0;
a3621e74 1471
786e8c11
YO
1472 charid--;
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
c445ea15 1475 }
786e8c11
YO
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1479 break;
1480 }
1481 }
1482 if ( ! newstate ) {
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1485 transcount++;
1486 }
1487 state = newstate;
1488 } else {
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1490 }
a28509cc 1491 }
c445ea15 1492 }
3dab1dad 1493 TRIE_HANDLE_WORD(state);
a3621e74
YO
1494
1495 } /* end second pass */
1496
1e2e3d02
YO
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
a3621e74
YO
1499 Renew( trie->states, next_alloc, reg_trie_state );
1500
3dab1dad
YO
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1504 );
a3621e74 1505
a02a5408 1506 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1507 {
1508 U32 state;
a3621e74
YO
1509 U32 tp = 0;
1510 U32 zp = 0;
1511
1512
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1514 U32 base=0;
1515
1516 /*
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1519 );
1520 */
1521
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1524 U16 maxid=minid;
a28509cc 1525 U16 idx;
a3621e74
YO
1526
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1530 minid=forid;
1531 } else if ( forid > maxid ) {
1532 maxid=forid;
1533 }
a3621e74
YO
1534 }
1535 if ( transcount < tp + maxid - minid + 1) {
1536 transcount *= 2;
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1539 }
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1542 U32 set = 0;
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1548 set = 1;
1549 break;
1550 }
1551 }
1552 if ( !set ) {
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1555 tp++;
1556 zp = tp;
1557 }
1558 } else {
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1563 }
1564 tp += ( maxid - minid + 1 );
1565 }
1566 Safefree(trie->states[ state ].trans.list);
1567 }
1568 /*
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1571 );
1572 */
1573 trie->states[ state ].trans.base=base;
1574 }
cc601c31 1575 trie->lasttrans = tp + 1;
a3621e74
YO
1576 }
1577 } else {
1578 /*
1579 Second Pass -- Flat Table Representation.
1580
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1585
1586 We then construct the trie using only the .next slots of the entry
1587 structs.
1588
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1592
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1594 transition.
1595
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1601
1602 A B A B
1603 1. 2 4 1. 3 7
1604 2. 0 3 3. 0 5
1605 3. 0 0 5. 0 0
1606 4. 0 0 7. 0 0
1607
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1611
1612 */
1e2e3d02
YO
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
3dab1dad
YO
1616
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1618 reg_trie_trans );
3dab1dad 1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1620 next_alloc = trie->uniquecharcount + 1;
1621
3dab1dad 1622
a3621e74
YO
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1624
c445ea15 1625 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1628
1629 U32 state = 1; /* required init */
1630
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1634
1635 STRLEN foldlen = 0; /* required init */
07be1b83 1636 U32 wordlen = 0; /* required init */
a3621e74
YO
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1638
3dab1dad 1639 if ( OP(noper) != NOTHING ) {
786e8c11 1640 for ( ; uc < e ; uc += len ) {
a3621e74 1641
786e8c11 1642 TRIE_READ_CHAR;
a3621e74 1643
786e8c11
YO
1644 if ( uvc < 256 ) {
1645 charid = trie->charmap[ uvc ];
1646 } else {
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1649 }
786e8c11
YO
1650 if ( charid ) {
1651 charid--;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1656 }
1657 state = trie->trans[ state + charid ].next;
1658 } else {
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1660 }
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1662 }
a3621e74 1663 }
3dab1dad
YO
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1666
1667 } /* end second pass */
1668
3dab1dad
YO
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1672 );
a3621e74 1673
a3621e74
YO
1674 {
1675 /*
1676 * Inplace compress the table.*
1677
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1681
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1684
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1687
1688 - If .base is 0 there are no valid transitions from that node.
1689
1690 - If .base is nonzero then charid is added to it to find an entry in
1691 the trans array.
1692
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1698 valid.
1699
786e8c11 1700 XXX - wrong maybe?
a3621e74
YO
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1706 0.
1707
1708 - We set 'pos' to indicate the first entry of the second node.
1709
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1716 compression.
1717
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1720
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
786e8c11 1726 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1727
1728 - Once compressed we can Renew/realloc the structures to release the
1729 excess space.
1730
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1733
1734 demq
1735 */
a3b680e6 1736 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1737 U32 state, charid;
a3621e74 1738 U32 pos = 0, zp=0;
1e2e3d02 1739 trie->statecount = laststate;
a3621e74
YO
1740
1741 for ( state = 1 ; state < laststate ; state++ ) {
1742 U8 flag = 0;
a28509cc
AL
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1746 trie->trans[ stateidx ].check = 0;
1747
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1751 if (o_used == 1) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1754 break;
1755 }
1756 }
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1761 break;
1762 }
1763 used--;
1764 }
1765 if ( !flag ) {
1766 flag = 1;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1768 }
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1771 pos++;
1772 }
1773 }
1774 }
cc601c31 1775 trie->lasttrans = pos + 1;
1e2e3d02 1776 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1777 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1778 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1782 (IV)next_alloc,
1783 (IV)pos,
a3621e74
YO
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1785 );
1786
1787 } /* end table compress */
1788 }
1e2e3d02
YO
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1794 );
cc601c31
YO
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1797
3dab1dad
YO
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1801 );
07be1b83 1802
3dab1dad 1803 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1804 U8 nodetype =(U8)(flags & 0xFF);
1805 char *str=NULL;
786e8c11 1806
07be1b83 1807#ifdef DEBUGGING
e62cc96a 1808 regnode *optimize = NULL;
b57a0404
JH
1809 U32 mjd_offset = 0;
1810 U32 mjd_nodelen = 0;
07be1b83 1811#endif
a3621e74 1812 /*
3dab1dad
YO
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
a3621e74 1819 */
3dab1dad 1820 /* Find the node we are going to overwrite */
7f69552c 1821 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1822 /* branch sub-chain */
3dab1dad 1823 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1824 DEBUG_r({
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1827 });
7f69552c
YO
1828 /* whole branch chain */
1829 } else {
1830 DEBUG_r({
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1834 });
07be1b83 1835 }
7f69552c 1836
07be1b83
YO
1837 DEBUG_OPTIMISE_r(
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
786e8c11 1840 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1841 );
a3621e74 1842
3dab1dad
YO
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
786e8c11 1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1847 U32 state;
1e2e3d02 1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1849 U32 ofs = 0;
8e11feef
RGS
1850 I32 idx = -1;
1851 U32 count = 0;
1852 const U32 base = trie->states[ state ].trans.base;
a3621e74 1853
3dab1dad 1854 if ( trie->states[state].wordnum )
8e11feef 1855 count = 1;
a3621e74 1856
8e11feef 1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1861 {
3dab1dad 1862 if ( ++count > 1 ) {
8e11feef 1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1865 if ( state == 1 ) break;
3dab1dad
YO
1866 if ( count == 2 ) {
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1868 DEBUG_OPTIMISE_r(
8e11feef
RGS
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
786e8c11 1872 (UV)state));
be8e71aa
YO
1873 if (idx >= 0) {
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1876
3dab1dad 1877 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1878 if ( folder )
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1880 DEBUG_OPTIMISE_r(
07be1b83 1881 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1882 );
8e11feef
RGS
1883 }
1884 }
1885 TRIE_BITMAP_SET(trie,*ch);
1886 if ( folder )
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1889 }
1890 idx = ofs;
1891 }
3dab1dad
YO
1892 }
1893 if ( count == 1 ) {
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1895 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1896 DEBUG_OPTIMISE_r(
8e11feef
RGS
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)state, (UV)idx, ch)
3dab1dad
YO
1901 );
1902 if ( state==1 ) {
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1905 STR_LEN(convert)=0;
1906 }
1907 *str++=*ch;
1908 STR_LEN(convert)++;
a3621e74 1909
8e11feef 1910 } else {
f9049ba1 1911#ifdef DEBUGGING
8e11feef
RGS
1912 if (state>1)
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1914#endif
8e11feef
RGS
1915 break;
1916 }
1917 }
3dab1dad 1918 if (str) {
8e11feef 1919 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1921 trie->startstate = state;
07be1b83
YO
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1924 DEBUG_r({
1925 regnode *fix = convert;
1926 mjd_nodelen++;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1930 }
1931 });
8e11feef
RGS
1932 if (trie->maxlen) {
1933 convert = n;
1934 } else {
3dab1dad 1935 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1936 DEBUG_r(optimize= n);
3dab1dad
YO
1937 }
1938 }
1939 }
a5ca303d
YO
1940 if (!jumper)
1941 jumper = last;
3dab1dad 1942 if ( trie->maxlen ) {
8e11feef
RGS
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
786e8c11
YO
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1948 if (trie->jump)
7f69552c 1949 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1950
786e8c11
YO
1951 /* XXXX */
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1954 {
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1958 trie->bitmap= NULL;
1959 } else
1960 OP( convert ) = TRIE;
a3621e74 1961
3dab1dad
YO
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
a5ca303d
YO
1964 DEBUG_r({
1965 optimize = convert
1966 + NODE_STEP_REGNODE
1967 + regarglen[ OP( convert ) ];
1968 });
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
3dab1dad 1971 }
a3621e74 1972 /* needed for dumping*/
e62cc96a 1973 DEBUG_r(if (optimize) {
07be1b83 1974 regnode *opt = convert;
e62cc96a 1975 while ( ++opt < optimize) {
07be1b83
YO
1976 Set_Node_Offset_Length(opt,0,0);
1977 }
786e8c11
YO
1978 /*
1979 Try to clean up some of the debris left after the
1980 optimisation.
a3621e74 1981 */
786e8c11 1982 while( optimize < jumper ) {
07be1b83 1983 mjd_nodelen += Node_Length((optimize));
a3621e74 1984 OP( optimize ) = OPTIMIZED;
07be1b83 1985 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1986 optimize++;
1987 }
07be1b83 1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1989 });
1990 } /* end node insert */
07be1b83 1991#ifndef DEBUGGING
6e8b4190 1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1993#endif
786e8c11
YO
1994 return trie->jump
1995 ? MADE_JUMP_TRIE
1996 : trie->startstate>1
1997 ? MADE_EXACT_TRIE
1998 : MADE_TRIE;
1999}
2000
2001STATIC void
2002S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2003{
2004/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2005
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2008 ISBN 0-201-10088-6
2009
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2015 Consider
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2020 */
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2023 U32 *q;
2024 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2025 const U32 numstates = trie->statecount;
786e8c11
YO
2026 const U32 ubound = trie->lasttrans + ucharcount;
2027 U32 q_read = 0;
2028 U32 q_write = 0;
2029 U32 charid;
2030 U32 base = trie->states[ 1 ].trans.base;
2031 U32 *fail;
2032 reg_ac_data *aho;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2035#ifndef DEBUGGING
2036 PERL_UNUSED_ARG(depth);
2037#endif
2038
2039
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2043 aho->trie=trie;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2045 numstates * sizeof(reg_trie_state));
786e8c11
YO
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2048 aho->refcount = 1;
2049 fail = aho->fail;
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2053
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2056 if ( newstate ) {
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2060 }
2061 }
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2065
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2068 if (ch_state) {
2069 U32 fail_state = cur;
2070 U32 fail_base;
2071 do {
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2075
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2079 {
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2081 }
2082 q[ q_write++ % numstates] = ch_state;
2083 }
2084 }
2085 }
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2091 */
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2094 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2095 (int)(depth * 2), "", numstates
2096 );
786e8c11
YO
2097 for( q_read=1; q_read<numstates; q_read++ ) {
2098 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2099 }
2100 PerlIO_printf(Perl_debug_log, "\n");
2101 });
2102 Safefree(q);
2103 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2104}
2105
786e8c11 2106
a3621e74 2107/*
5d1c421c
JH
2108 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2109 * These need to be revisited when a newer toolchain becomes available.
2110 */
2111#if defined(__sparc64__) && defined(__GNUC__)
2112# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2113# undef SPARC64_GCC_WORKAROUND
2114# define SPARC64_GCC_WORKAROUND 1
2115# endif
2116#endif
2117
07be1b83
YO
2118#define DEBUG_PEEP(str,scan,depth) \
2119 DEBUG_OPTIMISE_r({ \
2120 SV * const mysv=sv_newmortal(); \
2121 regnode *Next = regnext(scan); \
2122 regprop(RExC_rx, mysv, scan); \
7f69552c 2123 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2124 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2125 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2126 });
2127
1de06328
YO
2128
2129
2130
2131
07be1b83
YO
2132#define JOIN_EXACT(scan,min,flags) \
2133 if (PL_regkind[OP(scan)] == EXACT) \
2134 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2135
be8e71aa 2136STATIC U32
07be1b83
YO
2137S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2138 /* Merge several consecutive EXACTish nodes into one. */
2139 regnode *n = regnext(scan);
2140 U32 stringok = 1;
2141 regnode *next = scan + NODE_SZ_STR(scan);
2142 U32 merged = 0;
2143 U32 stopnow = 0;
2144#ifdef DEBUGGING
2145 regnode *stop = scan;
72f13be8 2146 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2147#else
d47053eb
RGS
2148 PERL_UNUSED_ARG(depth);
2149#endif
2150#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2151 PERL_UNUSED_ARG(flags);
2152 PERL_UNUSED_ARG(val);
07be1b83 2153#endif
07be1b83
YO
2154 DEBUG_PEEP("join",scan,depth);
2155
2156 /* Skip NOTHING, merge EXACT*. */
2157 while (n &&
2158 ( PL_regkind[OP(n)] == NOTHING ||
2159 (stringok && (OP(n) == OP(scan))))
2160 && NEXT_OFF(n)
2161 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2162
2163 if (OP(n) == TAIL || n > next)
2164 stringok = 0;
2165 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2166 DEBUG_PEEP("skip:",n,depth);
2167 NEXT_OFF(scan) += NEXT_OFF(n);
2168 next = n + NODE_STEP_REGNODE;
2169#ifdef DEBUGGING
2170 if (stringok)
2171 stop = n;
2172#endif
2173 n = regnext(n);
2174 }
2175 else if (stringok) {
786e8c11 2176 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2177 regnode * const nnext = regnext(n);
2178
2179 DEBUG_PEEP("merg",n,depth);
2180
2181 merged++;
2182 if (oldl + STR_LEN(n) > U8_MAX)
2183 break;
2184 NEXT_OFF(scan) += NEXT_OFF(n);
2185 STR_LEN(scan) += STR_LEN(n);
2186 next = n + NODE_SZ_STR(n);
2187 /* Now we can overwrite *n : */
2188 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2189#ifdef DEBUGGING
2190 stop = next - 1;
2191#endif
2192 n = nnext;
2193 if (stopnow) break;
2194 }
2195
d47053eb
RGS
2196#ifdef EXPERIMENTAL_INPLACESCAN
2197 if (flags && !NEXT_OFF(n)) {
2198 DEBUG_PEEP("atch", val, depth);
2199 if (reg_off_by_arg[OP(n)]) {
2200 ARG_SET(n, val - n);
2201 }
2202 else {
2203 NEXT_OFF(n) = val - n;
2204 }
2205 stopnow = 1;
2206 }
07be1b83
YO
2207#endif
2208 }
2209
2210 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2211 /*
2212 Two problematic code points in Unicode casefolding of EXACT nodes:
2213
2214 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2215 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2216
2217 which casefold to
2218
2219 Unicode UTF-8
2220
2221 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2222 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2223
2224 This means that in case-insensitive matching (or "loose matching",
2225 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2226 length of the above casefolded versions) can match a target string
2227 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2228 This would rather mess up the minimum length computation.
2229
2230 What we'll do is to look for the tail four bytes, and then peek
2231 at the preceding two bytes to see whether we need to decrease
2232 the minimum length by four (six minus two).
2233
2234 Thanks to the design of UTF-8, there cannot be false matches:
2235 A sequence of valid UTF-8 bytes cannot be a subsequence of
2236 another valid sequence of UTF-8 bytes.
2237
2238 */
2239 char * const s0 = STRING(scan), *s, *t;
2240 char * const s1 = s0 + STR_LEN(scan) - 1;
2241 char * const s2 = s1 - 4;
e294cc5d
JH
2242#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2243 const char t0[] = "\xaf\x49\xaf\x42";
2244#else
07be1b83 2245 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2246#endif
07be1b83
YO
2247 const char * const t1 = t0 + 3;
2248
2249 for (s = s0 + 2;
2250 s < s2 && (t = ninstr(s, s1, t0, t1));
2251 s = t + 4) {
e294cc5d
JH
2252#ifdef EBCDIC
2253 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2254 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2255#else
07be1b83
YO
2256 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2257 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2258#endif
07be1b83
YO
2259 *min -= 4;
2260 }
2261 }
2262
2263#ifdef DEBUGGING
2264 /* Allow dumping */
2265 n = scan + NODE_SZ_STR(scan);
2266 while (n <= stop) {
2267 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2268 OP(n) = OPTIMIZED;
2269 NEXT_OFF(n) = 0;
2270 }
2271 n++;
2272 }
2273#endif
2274 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2275 return stopnow;
2276}
2277
653099ff
GS
2278/* REx optimizer. Converts nodes into quickier variants "in place".
2279 Finds fixed substrings. */
2280
a0288114 2281/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2282 to the position after last scanned or to NULL. */
2283
40d049e4
YO
2284#define INIT_AND_WITHP \
2285 assert(!and_withp); \
2286 Newx(and_withp,1,struct regnode_charclass_class); \
2287 SAVEFREEPV(and_withp)
07be1b83 2288
76e3520e 2289STATIC I32
40d049e4 2290S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2291 I32 *minlenp, I32 *deltap,
40d049e4
YO
2292 regnode *last,
2293 scan_data_t *data,
2294 I32 stopparen,
2295 U8* recursed,
2296 struct regnode_charclass_class *and_withp,
2297 U32 flags, U32 depth)
c277df42
IZ
2298 /* scanp: Start here (read-write). */
2299 /* deltap: Write maxlen-minlen here. */
2300 /* last: Stop before this one. */
40d049e4
YO
2301 /* data: string data about the pattern */
2302 /* stopparen: treat close N as END */
2303 /* recursed: which subroutines have we recursed into */
2304 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2305{
97aff369 2306 dVAR;
c277df42
IZ
2307 I32 min = 0, pars = 0, code;
2308 regnode *scan = *scanp, *next;
2309 I32 delta = 0;
2310 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2311 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2312 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2313 scan_data_t data_fake;
a3621e74 2314 SV *re_trie_maxbuff = NULL;
786e8c11 2315 regnode *first_non_open = scan;
e2e6a0f1 2316 I32 stopmin = I32_MAX;
a3621e74 2317 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2318#ifdef DEBUGGING
40d049e4 2319 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2320#endif
40d049e4 2321
786e8c11 2322 if ( depth == 0 ) {
40d049e4 2323 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2324 first_non_open=regnext(first_non_open);
2325 }
2326
b81d288d 2327
c277df42
IZ
2328 while (scan && OP(scan) != END && scan < last) {
2329 /* Peephole optimizer: */
1de06328 2330 DEBUG_STUDYDATA(data,depth);
07be1b83 2331 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2332 JOIN_EXACT(scan,&min,0);
a3621e74 2333
653099ff
GS
2334 /* Follow the next-chain of the current node and optimize
2335 away all the NOTHINGs from it. */
c277df42 2336 if (OP(scan) != CURLYX) {
a3b680e6 2337 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2338 ? I32_MAX
2339 /* I32 may be smaller than U16 on CRAYs! */
2340 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2341 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2342 int noff;
2343 regnode *n = scan;
b81d288d 2344
c277df42
IZ
2345 /* Skip NOTHING and LONGJMP. */
2346 while ((n = regnext(n))
3dab1dad 2347 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2348 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2349 && off + noff < max)
2350 off += noff;
2351 if (reg_off_by_arg[OP(scan)])
2352 ARG(scan) = off;
b81d288d 2353 else
c277df42
IZ
2354 NEXT_OFF(scan) = off;
2355 }
a3621e74 2356
07be1b83 2357
3dab1dad 2358
653099ff
GS
2359 /* The principal pseudo-switch. Cannot be a switch, since we
2360 look into several different things. */
b81d288d 2361 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2362 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2363 next = regnext(scan);
2364 code = OP(scan);
a3621e74 2365 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2366
2367 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2368 /* NOTE - There is similar code to this block below for handling
2369 TRIE nodes on a re-study. If you change stuff here check there
2370 too. */
c277df42 2371 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2372 struct regnode_charclass_class accum;
d4c19fe8 2373 regnode * const startbranch=scan;
c277df42 2374
653099ff 2375 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2376 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2377 if (flags & SCF_DO_STCLASS)
830247a4 2378 cl_init_zero(pRExC_state, &accum);
a3621e74 2379
c277df42 2380 while (OP(scan) == code) {
830247a4 2381 I32 deltanext, minnext, f = 0, fake;
653099ff 2382 struct regnode_charclass_class this_class;
c277df42
IZ
2383
2384 num++;
2385 data_fake.flags = 0;
40d049e4 2386 if (data) {
2c2d71f5 2387 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2388 data_fake.last_closep = data->last_closep;
2389 }
2390 else
2391 data_fake.last_closep = &fake;
c277df42
IZ
2392 next = regnext(scan);
2393 scan = NEXTOPER(scan);
2394 if (code != BRANCH)
2395 scan = NEXTOPER(scan);
653099ff 2396 if (flags & SCF_DO_STCLASS) {
830247a4 2397 cl_init(pRExC_state, &this_class);
653099ff
GS
2398 data_fake.start_class = &this_class;
2399 f = SCF_DO_STCLASS_AND;
b81d288d 2400 }
e1901655
IZ
2401 if (flags & SCF_WHILEM_VISITED_POS)
2402 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2403
653099ff 2404 /* we suppose the run is continuous, last=next...*/
1de06328 2405 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2406 next, &data_fake,
2407 stopparen, recursed, NULL, f,depth+1);
b81d288d 2408 if (min1 > minnext)
c277df42
IZ
2409 min1 = minnext;
2410 if (max1 < minnext + deltanext)
2411 max1 = minnext + deltanext;
2412 if (deltanext == I32_MAX)
aca2d497 2413 is_inf = is_inf_internal = 1;
c277df42
IZ
2414 scan = next;
2415 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2416 pars++;
e2e6a0f1
YO
2417 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2418 if ( stopmin > minnext)
2419 stopmin = min + min1;
2420 flags &= ~SCF_DO_SUBSTR;
2421 if (data)
2422 data->flags |= SCF_SEEN_ACCEPT;
2423 }
3dab1dad
YO
2424 if (data) {
2425 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2426 data->flags |= SF_HAS_EVAL;
2c2d71f5 2427 data->whilem_c = data_fake.whilem_c;
3dab1dad 2428 }
653099ff 2429 if (flags & SCF_DO_STCLASS)
830247a4 2430 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2431 if (code == SUSPEND)
c277df42
IZ
2432 break;
2433 }
2434 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2435 min1 = 0;
2436 if (flags & SCF_DO_SUBSTR) {
2437 data->pos_min += min1;
2438 data->pos_delta += max1 - min1;
2439 if (max1 != min1 || is_inf)
2440 data->longest = &(data->longest_float);
2441 }
2442 min += min1;
2443 delta += max1 - min1;
653099ff 2444 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2445 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2446 if (min1) {
40d049e4 2447 cl_and(data->start_class, and_withp);
653099ff
GS
2448 flags &= ~SCF_DO_STCLASS;
2449 }
2450 }
2451 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2452 if (min1) {
2453 cl_and(data->start_class, &accum);
653099ff 2454 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2455 }
2456 else {
b81d288d 2457 /* Switch to OR mode: cache the old value of
de0c8cb8 2458 * data->start_class */
40d049e4
YO
2459 INIT_AND_WITHP;
2460 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2461 struct regnode_charclass_class);
2462 flags &= ~SCF_DO_STCLASS_AND;
2463 StructCopy(&accum, data->start_class,
2464 struct regnode_charclass_class);
2465 flags |= SCF_DO_STCLASS_OR;
2466 data->start_class->flags |= ANYOF_EOS;
2467 }
653099ff 2468 }
a3621e74 2469
786e8c11 2470 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2471 /* demq.
2472
2473 Assuming this was/is a branch we are dealing with: 'scan' now
2474 points at the item that follows the branch sequence, whatever
2475 it is. We now start at the beginning of the sequence and look
2476 for subsequences of
2477
786e8c11
YO
2478 BRANCH->EXACT=>x1
2479 BRANCH->EXACT=>x2
2480 tail
a3621e74
YO
2481
2482 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2483
2484 If we can find such a subseqence we need to turn the first
2485 element into a trie and then add the subsequent branch exact
2486 strings to the trie.
2487
2488 We have two cases
2489
786e8c11 2490 1. patterns where the whole set of branch can be converted.
a3621e74 2491
786e8c11 2492 2. patterns where only a subset can be converted.
a3621e74
YO
2493
2494 In case 1 we can replace the whole set with a single regop
2495 for the trie. In case 2 we need to keep the start and end
2496 branchs so
2497
2498 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2499 becomes BRANCH TRIE; BRANCH X;
2500
786e8c11
YO
2501 There is an additional case, that being where there is a
2502 common prefix, which gets split out into an EXACT like node
2503 preceding the TRIE node.
2504
2505 If x(1..n)==tail then we can do a simple trie, if not we make
2506 a "jump" trie, such that when we match the appropriate word
2507 we "jump" to the appopriate tail node. Essentailly we turn
2508 a nested if into a case structure of sorts.
a3621e74
YO
2509
2510 */
786e8c11 2511
3dab1dad 2512 int made=0;
0111c4fd
RGS
2513 if (!re_trie_maxbuff) {
2514 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2515 if (!SvIOK(re_trie_maxbuff))
2516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2517 }
786e8c11 2518 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2519 regnode *cur;
2520 regnode *first = (regnode *)NULL;
2521 regnode *last = (regnode *)NULL;
2522 regnode *tail = scan;
2523 U8 optype = 0;
2524 U32 count=0;
2525
2526#ifdef DEBUGGING
c445ea15 2527 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2528#endif
2529 /* var tail is used because there may be a TAIL
2530 regop in the way. Ie, the exacts will point to the
2531 thing following the TAIL, but the last branch will
2532 point at the TAIL. So we advance tail. If we
2533 have nested (?:) we may have to move through several
2534 tails.
2535 */
2536
2537 while ( OP( tail ) == TAIL ) {
2538 /* this is the TAIL generated by (?:) */
2539 tail = regnext( tail );
2540 }
2541
3dab1dad 2542
a3621e74 2543 DEBUG_OPTIMISE_r({
32fc9b6a 2544 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2545 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2546 (int)depth * 2 + 2, "",
2547 "Looking for TRIE'able sequences. Tail node is: ",
2548 SvPV_nolen_const( mysv )
a3621e74
YO
2549 );
2550 });
3dab1dad 2551
a3621e74
YO
2552 /*
2553
2554 step through the branches, cur represents each
2555 branch, noper is the first thing to be matched
2556 as part of that branch and noper_next is the
2557 regnext() of that node. if noper is an EXACT
2558 and noper_next is the same as scan (our current
2559 position in the regex) then the EXACT branch is
2560 a possible optimization target. Once we have
2561 two or more consequetive such branches we can
2562 create a trie of the EXACT's contents and stich
2563 it in place. If the sequence represents all of
2564 the branches we eliminate the whole thing and
2565 replace it with a single TRIE. If it is a
2566 subsequence then we need to stitch it in. This
2567 means the first branch has to remain, and needs
2568 to be repointed at the item on the branch chain
2569 following the last branch optimized. This could
2570 be either a BRANCH, in which case the
2571 subsequence is internal, or it could be the
2572 item following the branch sequence in which
2573 case the subsequence is at the end.
2574
2575 */
2576
2577 /* dont use tail as the end marker for this traverse */
2578 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2579 regnode * const noper = NEXTOPER( cur );
be981c67 2580#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2581 regnode * const noper_next = regnext( noper );
be981c67 2582#endif
a3621e74 2583
a3621e74 2584 DEBUG_OPTIMISE_r({
32fc9b6a 2585 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2586 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2587 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2588
32fc9b6a 2589 regprop(RExC_rx, mysv, noper);
a3621e74 2590 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2591 SvPV_nolen_const(mysv));
a3621e74
YO
2592
2593 if ( noper_next ) {
32fc9b6a 2594 regprop(RExC_rx, mysv, noper_next );
a3621e74 2595 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2596 SvPV_nolen_const(mysv));
a3621e74 2597 }
3dab1dad
YO
2598 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2599 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2600 });
3dab1dad
YO
2601 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2602 : PL_regkind[ OP( noper ) ] == EXACT )
2603 || OP(noper) == NOTHING )
786e8c11
YO
2604#ifdef NOJUMPTRIE
2605 && noper_next == tail
2606#endif
2607 && count < U16_MAX)
a3621e74
YO
2608 {
2609 count++;
3dab1dad
YO
2610 if ( !first || optype == NOTHING ) {
2611 if (!first) first = cur;
a3621e74
YO
2612 optype = OP( noper );
2613 } else {
a3621e74 2614 last = cur;
a3621e74
YO
2615 }
2616 } else {
2617 if ( last ) {
786e8c11
YO
2618 make_trie( pRExC_state,
2619 startbranch, first, cur, tail, count,
2620 optype, depth+1 );
a3621e74 2621 }
3dab1dad 2622 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2623#ifdef NOJUMPTRIE
2624 && noper_next == tail
2625#endif
2626 ){
a3621e74
YO
2627 count = 1;
2628 first = cur;
2629 optype = OP( noper );
2630 } else {
2631 count = 0;
2632 first = NULL;
2633 optype = 0;
2634 }
2635 last = NULL;
2636 }
2637 }
2638 DEBUG_OPTIMISE_r({
32fc9b6a 2639 regprop(RExC_rx, mysv, cur);
a3621e74 2640 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2641 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2642 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2643
2644 });
2645 if ( last ) {
786e8c11 2646 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2647#ifdef TRIE_STUDY_OPT
786e8c11
YO
2648 if ( ((made == MADE_EXACT_TRIE &&
2649 startbranch == first)
2650 || ( first_non_open == first )) &&
2651 depth==0 )
2652 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2653#endif
07be1b83 2654 }
a3621e74 2655 }
3dab1dad
YO
2656
2657 } /* do trie */
786e8c11 2658
a0ed51b3 2659 }
a3621e74 2660 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2661 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2662 } else /* single branch is optimized. */
c277df42
IZ
2663 scan = NEXTOPER(scan);
2664 continue;
a0ed51b3
LW
2665 }
2666 else if (OP(scan) == EXACT) {
cd439c50 2667 I32 l = STR_LEN(scan);
c445ea15 2668 UV uc;
a0ed51b3 2669 if (UTF) {
a3b680e6 2670 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2671 l = utf8_length(s, s + l);
9041c2e3 2672 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2673 } else {
2674 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2675 }
2676 min += l;
c277df42 2677 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2678 /* The code below prefers earlier match for fixed
2679 offset, later match for variable offset. */
2680 if (data->last_end == -1) { /* Update the start info. */
2681 data->last_start_min = data->pos_min;
2682 data->last_start_max = is_inf
b81d288d 2683 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2684 }
cd439c50 2685 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2686 if (UTF)
2687 SvUTF8_on(data->last_found);
0eda9292 2688 {
9a957fbc 2689 SV * const sv = data->last_found;
a28509cc 2690 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2691 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2692 if (mg && mg->mg_len >= 0)
5e43f467
JH
2693 mg->mg_len += utf8_length((U8*)STRING(scan),
2694 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2695 }
c277df42
IZ
2696 data->last_end = data->pos_min + l;
2697 data->pos_min += l; /* As in the first entry. */
2698 data->flags &= ~SF_BEFORE_EOL;
2699 }
653099ff
GS
2700 if (flags & SCF_DO_STCLASS_AND) {
2701 /* Check whether it is compatible with what we know already! */
2702 int compat = 1;
2703
1aa99e6b 2704 if (uc >= 0x100 ||
516a5887 2705 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2706 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2707 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2708 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2709 )
653099ff
GS
2710 compat = 0;
2711 ANYOF_CLASS_ZERO(data->start_class);
2712 ANYOF_BITMAP_ZERO(data->start_class);
2713 if (compat)
1aa99e6b 2714 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2715 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2716 if (uc < 0x100)
2717 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2718 }
2719 else if (flags & SCF_DO_STCLASS_OR) {
2720 /* false positive possible if the class is case-folded */
1aa99e6b 2721 if (uc < 0x100)
9b877dbb
IH
2722 ANYOF_BITMAP_SET(data->start_class, uc);
2723 else
2724 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2725 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2726 cl_and(data->start_class, and_withp);
653099ff
GS
2727 }
2728 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2729 }
3dab1dad 2730 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2731 I32 l = STR_LEN(scan);
1aa99e6b 2732 UV uc = *((U8*)STRING(scan));
653099ff
GS
2733
2734 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2735 if (flags & SCF_DO_SUBSTR) {
2736 assert(data);
1de06328 2737 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2738 }
a0ed51b3 2739 if (UTF) {
6136c704 2740 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2741 l = utf8_length(s, s + l);
9041c2e3 2742 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2743 }
2744 min += l;
ecaa9b9c 2745 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2746 data->pos_min += l;
653099ff
GS
2747 if (flags & SCF_DO_STCLASS_AND) {
2748 /* Check whether it is compatible with what we know already! */
2749 int compat = 1;
2750
1aa99e6b 2751 if (uc >= 0x100 ||
516a5887 2752 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2753 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2754 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2755 compat = 0;
2756 ANYOF_CLASS_ZERO(data->start_class);
2757 ANYOF_BITMAP_ZERO(data->start_class);
2758 if (compat) {
1aa99e6b 2759 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2760 data->start_class->flags &= ~ANYOF_EOS;
2761 data->start_class->flags |= ANYOF_FOLD;
2762 if (OP(scan) == EXACTFL)
2763 data->start_class->flags |= ANYOF_LOCALE;
2764 }
2765 }
2766 else if (flags & SCF_DO_STCLASS_OR) {
2767 if (data->start_class->flags & ANYOF_FOLD) {
2768 /* false positive possible if the class is case-folded.
2769 Assume that the locale settings are the same... */
1aa99e6b
IH
2770 if (uc < 0x100)
2771 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2772 data->start_class->flags &= ~ANYOF_EOS;
2773 }
40d049e4 2774 cl_and(data->start_class, and_withp);
653099ff
GS
2775 }
2776 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2777 }
bfed75c6 2778 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2779 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2780 I32 f = flags, pos_before = 0;
d4c19fe8 2781 regnode * const oscan = scan;
653099ff
GS
2782 struct regnode_charclass_class this_class;
2783 struct regnode_charclass_class *oclass = NULL;
727f22e3 2784 I32 next_is_eval = 0;
653099ff 2785
3dab1dad 2786 switch (PL_regkind[OP(scan)]) {
653099ff 2787 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2788 scan = NEXTOPER(scan);
2789 goto finish;
2790 case PLUS:
653099ff 2791 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2792 next = NEXTOPER(scan);
653099ff 2793 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2794 mincount = 1;
2795 maxcount = REG_INFTY;
c277df42
IZ
2796 next = regnext(scan);
2797 scan = NEXTOPER(scan);
2798 goto do_curly;
2799 }
2800 }
2801 if (flags & SCF_DO_SUBSTR)
2802 data->pos_min++;
2803 min++;
2804 /* Fall through. */
2805 case STAR:
653099ff
GS
2806 if (flags & SCF_DO_STCLASS) {
2807 mincount = 0;
b81d288d 2808 maxcount = REG_INFTY;
653099ff
GS
2809 next = regnext(scan);
2810 scan = NEXTOPER(scan);
2811 goto do_curly;
2812 }
b81d288d 2813 is_inf = is_inf_internal = 1;
c277df42
IZ
2814 scan = regnext(scan);
2815 if (flags & SCF_DO_SUBSTR) {
1de06328 2816 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2817 data->longest = &(data->longest_float);
2818 }
2819 goto optimize_curly_tail;
2820 case CURLY:
40d049e4
YO
2821 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2822 && (scan->flags == stopparen))
2823 {
2824 mincount = 1;
2825 maxcount = 1;
2826 } else {
2827 mincount = ARG1(scan);
2828 maxcount = ARG2(scan);
2829 }
c277df42 2830 next = regnext(scan);
cb434fcc
IZ
2831 if (OP(scan) == CURLYX) {
2832 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2833 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2834 }
c277df42 2835 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2836 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2837 do_curly:
2838 if (flags & SCF_DO_SUBSTR) {
1de06328 2839 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2840 pos_before = data->pos_min;
2841 }
2842 if (data) {
2843 fl = data->flags;
2844 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2845 if (is_inf)
2846 data->flags |= SF_IS_INF;
2847 }
653099ff 2848 if (flags & SCF_DO_STCLASS) {
830247a4 2849 cl_init(pRExC_state, &this_class);
653099ff
GS
2850 oclass = data->start_class;
2851 data->start_class = &this_class;
2852 f |= SCF_DO_STCLASS_AND;
2853 f &= ~SCF_DO_STCLASS_OR;
2854 }
e1901655
IZ
2855 /* These are the cases when once a subexpression
2856 fails at a particular position, it cannot succeed
2857 even after backtracking at the enclosing scope.
b81d288d 2858
e1901655
IZ
2859 XXXX what if minimal match and we are at the
2860 initial run of {n,m}? */
2861 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2862 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2863
c277df42 2864 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2865 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2866 last, data, stopparen, recursed, NULL,
a3621e74
YO
2867 (mincount == 0
2868 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2869
2870 if (flags & SCF_DO_STCLASS)
2871 data->start_class = oclass;
2872 if (mincount == 0 || minnext == 0) {
2873 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2874 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2875 }
2876 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2877 /* Switch to OR mode: cache the old value of
653099ff 2878 * data->start_class */
40d049e4
YO
2879 INIT_AND_WITHP;
2880 StructCopy(data->start_class, and_withp,
653099ff
GS
2881 struct regnode_charclass_class);
2882 flags &= ~SCF_DO_STCLASS_AND;
2883 StructCopy(&this_class, data->start_class,
2884 struct regnode_charclass_class);
2885 flags |= SCF_DO_STCLASS_OR;
2886 data->start_class->flags |= ANYOF_EOS;
2887 }
2888 } else { /* Non-zero len */
2889 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2890 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2891 cl_and(data->start_class, and_withp);
653099ff
GS
2892 }
2893 else if (flags & SCF_DO_STCLASS_AND)
2894 cl_and(data->start_class, &this_class);
2895 flags &= ~SCF_DO_STCLASS;
2896 }
c277df42
IZ
2897 if (!scan) /* It was not CURLYX, but CURLY. */
2898 scan = next;
041457d9
DM
2899 if ( /* ? quantifier ok, except for (?{ ... }) */
2900 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2901 && (minnext == 0) && (deltanext == 0)
99799961 2902 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2903 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2904 && ckWARN(WARN_REGEXP))
b45f050a 2905 {
830247a4 2906 vWARN(RExC_parse,
b45f050a
JF
2907 "Quantifier unexpected on zero-length expression");
2908 }
2909
c277df42 2910 min += minnext * mincount;
b81d288d 2911 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2912 && (minnext + deltanext) > 0)
2913 || deltanext == I32_MAX);
aca2d497 2914 is_inf |= is_inf_internal;
c277df42
IZ
2915 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2916
2917 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2918 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2919 && data->flags & SF_IN_PAR
2920 && !(data->flags & SF_HAS_EVAL)
2921 && !deltanext && minnext == 1 ) {
2922 /* Try to optimize to CURLYN. */
2923 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2924 regnode * const nxt1 = nxt;
497b47a8
JH
2925#ifdef DEBUGGING
2926 regnode *nxt2;
2927#endif
c277df42
IZ
2928
2929 /* Skip open. */
2930 nxt = regnext(nxt);
bfed75c6 2931 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2932 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2933 && STR_LEN(nxt) == 1))
c277df42 2934 goto nogo;
497b47a8 2935#ifdef DEBUGGING
c277df42 2936 nxt2 = nxt;
497b47a8 2937#endif
c277df42 2938 nxt = regnext(nxt);
b81d288d 2939 if (OP(nxt) != CLOSE)
c277df42 2940 goto nogo;
40d049e4
YO
2941 if (RExC_open_parens) {
2942 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2943 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2944 }
c277df42 2945 /* Now we know that nxt2 is the only contents: */
eb160463 2946 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2947 OP(oscan) = CURLYN;
2948 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2949
c277df42
IZ
2950#ifdef DEBUGGING
2951 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2952 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2953 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2954 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2955 OP(nxt + 1) = OPTIMIZED; /* was count. */
2956 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2957#endif
c277df42 2958 }
c277df42
IZ
2959 nogo:
2960
2961 /* Try optimization CURLYX => CURLYM. */
b81d288d 2962 if ( OP(oscan) == CURLYX && data
c277df42 2963 && !(data->flags & SF_HAS_PAR)
c277df42 2964 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2965 && !deltanext /* atom is fixed width */
2966 && minnext != 0 /* CURLYM can't handle zero width */
2967 ) {
c277df42
IZ
2968 /* XXXX How to optimize if data == 0? */
2969 /* Optimize to a simpler form. */
2970 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2971 regnode *nxt2;
2972
2973 OP(oscan) = CURLYM;
2974 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2975 && (OP(nxt2) != WHILEM))
c277df42
IZ
2976 nxt = nxt2;
2977 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2978 /* Need to optimize away parenths. */
2979 if (data->flags & SF_IN_PAR) {
2980 /* Set the parenth number. */
2981 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2982
b81d288d 2983 if (OP(nxt) != CLOSE)
b45f050a 2984 FAIL("Panic opt close");
eb160463 2985 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2986 if (RExC_open_parens) {
2987 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2988 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2989 }
c277df42
IZ
2990 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2991 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2992
c277df42
IZ
2993#ifdef DEBUGGING
2994 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2995 OP(nxt + 1) = OPTIMIZED; /* was count. */
2996 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2997 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2998#endif
c277df42
IZ
2999#if 0
3000 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3001 regnode *nnxt = regnext(nxt1);
b81d288d 3002
c277df42
IZ
3003 if (nnxt == nxt) {
3004 if (reg_off_by_arg[OP(nxt1)])
3005 ARG_SET(nxt1, nxt2 - nxt1);
3006 else if (nxt2 - nxt1 < U16_MAX)
3007 NEXT_OFF(nxt1) = nxt2 - nxt1;
3008 else
3009 OP(nxt) = NOTHING; /* Cannot beautify */
3010 }
3011 nxt1 = nnxt;
3012 }
3013#endif
3014 /* Optimize again: */
1de06328 3015 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3016 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3017 }
3018 else
c277df42 3019 oscan->flags = 0;
c277df42 3020 }
e1901655
IZ
3021 else if ((OP(oscan) == CURLYX)
3022 && (flags & SCF_WHILEM_VISITED_POS)
3023 /* See the comment on a similar expression above.
3024 However, this time it not a subexpression
3025 we care about, but the expression itself. */
3026 && (maxcount == REG_INFTY)
3027 && data && ++data->whilem_c < 16) {
3028 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3029 /* Find WHILEM (as in regexec.c) */
3030 regnode *nxt = oscan + NEXT_OFF(oscan);
3031
3032 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3033 nxt += ARG(nxt);
eb160463
GS
3034 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3035 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3036 }
b81d288d 3037 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3038 pars++;
3039 if (flags & SCF_DO_SUBSTR) {
c445ea15 3040 SV *last_str = NULL;
c277df42
IZ
3041 int counted = mincount != 0;
3042
3043 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3044#if defined(SPARC64_GCC_WORKAROUND)
3045 I32 b = 0;
3046 STRLEN l = 0;
cfd0369c 3047 const char *s = NULL;
5d1c421c
JH
3048 I32 old = 0;
3049
3050 if (pos_before >= data->last_start_min)
3051 b = pos_before;
3052 else
3053 b = data->last_start_min;
3054
3055 l = 0;
cfd0369c 3056 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3057 old = b - data->last_start_min;
3058
3059#else
b81d288d 3060 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3061 ? pos_before : data->last_start_min;
3062 STRLEN l;
d4c19fe8 3063 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3064 I32 old = b - data->last_start_min;
5d1c421c 3065#endif
a0ed51b3
LW
3066
3067 if (UTF)
3068 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3069
a0ed51b3 3070 l -= old;
c277df42 3071 /* Get the added string: */
79cb57f6 3072 last_str = newSVpvn(s + old, l);
0e933229
IH
3073 if (UTF)
3074 SvUTF8_on(last_str);
c277df42
IZ
3075 if (deltanext == 0 && pos_before == b) {
3076 /* What was added is a constant string */
3077 if (mincount > 1) {
3078 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3079 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3080 SvPVX_const(last_str), l, mincount - 1);
b162af07 3081 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3082 /* Add additional parts. */
b81d288d 3083 SvCUR_set(data->last_found,
c277df42
IZ
3084 SvCUR(data->last_found) - l);
3085 sv_catsv(data->last_found, last_str);
0eda9292
JH
3086 {
3087 SV * sv = data->last_found;
3088 MAGIC *mg =
3089 SvUTF8(sv) && SvMAGICAL(sv) ?
3090 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3091 if (mg && mg->mg_len >= 0)
3092 mg->mg_len += CHR_SVLEN(last_str);
3093 }
c277df42
IZ
3094 data->last_end += l * (mincount - 1);
3095 }
2a8d9689
HS
3096 } else {
3097 /* start offset must point into the last copy */
3098 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3099 data->last_start_max += is_inf ? I32_MAX
3100 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3101 }
3102 }
3103 /* It is counted once already... */
3104 data->pos_min += minnext * (mincount - counted);
3105 data->pos_delta += - counted * deltanext +
3106 (minnext + deltanext) * maxcount - minnext * mincount;
3107 if (mincount != maxcount) {
653099ff
GS
3108 /* Cannot extend fixed substrings found inside
3109 the group. */
1de06328 3110 scan_commit(pRExC_state,data,minlenp);
c277df42 3111 if (mincount && last_str) {
d4c19fe8
AL
3112 SV * const sv = data->last_found;
3113 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3114 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3115
3116 if (mg)
3117 mg->mg_len = -1;
3118 sv_setsv(sv, last_str);
c277df42 3119 data->last_end = data->pos_min;
b81d288d 3120 data->last_start_min =
a0ed51b3 3121 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3122 data->last_start_max = is_inf
3123 ? I32_MAX
c277df42 3124 : data->pos_min + data->pos_delta
a0ed51b3 3125 - CHR_SVLEN(last_str);
c277df42
IZ
3126 }
3127 data->longest = &(data->longest_float);
3128 }
aca2d497 3129 SvREFCNT_dec(last_str);
c277df42 3130 }
405ff068 3131 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3132 data->flags |= SF_HAS_EVAL;
3133 optimize_curly_tail:
c277df42 3134 if (OP(oscan) != CURLYX) {
3dab1dad 3135 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3136 && NEXT_OFF(next))
3137 NEXT_OFF(oscan) += NEXT_OFF(next);
3138 }
c277df42 3139 continue;
653099ff 3140 default: /* REF and CLUMP only? */
c277df42 3141 if (flags & SCF_DO_SUBSTR) {
1de06328 3142 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3143 data->longest = &(data->longest_float);
3144 }
aca2d497 3145 is_inf = is_inf_internal = 1;
653099ff 3146 if (flags & SCF_DO_STCLASS_OR)
830247a4 3147 cl_anything(pRExC_state, data->start_class);
653099ff 3148 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3149 break;
3150 }
a0ed51b3 3151 }
bfed75c6 3152 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3153 int value = 0;
653099ff 3154
c277df42 3155 if (flags & SCF_DO_SUBSTR) {
1de06328 3156 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3157 data->pos_min++;
3158 }
3159 min++;
653099ff
GS
3160 if (flags & SCF_DO_STCLASS) {
3161 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3162
3163 /* Some of the logic below assumes that switching
3164 locale on will only add false positives. */
3dab1dad 3165 switch (PL_regkind[OP(scan)]) {
653099ff 3166 case SANY:
653099ff
GS
3167 default:
3168 do_default:
3169 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3170 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3171 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3172 break;
3173 case REG_ANY:
3174 if (OP(scan) == SANY)
3175 goto do_default;
3176 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3177 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3178 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3179 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3180 }
3181 if (flags & SCF_DO_STCLASS_AND || !value)
3182 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3183 break;
3184 case ANYOF:
3185 if (flags & SCF_DO_STCLASS_AND)
3186 cl_and(data->start_class,
3187 (struct regnode_charclass_class*)scan);
3188 else
830247a4 3189 cl_or(pRExC_state, data->start_class,
653099ff
GS
3190 (struct regnode_charclass_class*)scan);
3191 break;
3192 case ALNUM:
3193 if (flags & SCF_DO_STCLASS_AND) {
3194 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3195 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3196 for (value = 0; value < 256; value++)
3197 if (!isALNUM(value))
3198 ANYOF_BITMAP_CLEAR(data->start_class, value);
3199 }
3200 }
3201 else {
3202 if (data->start_class->flags & ANYOF_LOCALE)
3203 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3204 else {
3205 for (value = 0; value < 256; value++)
3206 if (isALNUM(value))
b81d288d 3207 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3208 }
3209 }
3210 break;
3211 case ALNUML:
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (data->start_class->flags & ANYOF_LOCALE)
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3215 }
3216 else {
3217 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3218 data->start_class->flags |= ANYOF_LOCALE;
3219 }
3220 break;
3221 case NALNUM:
3222 if (flags & SCF_DO_STCLASS_AND) {
3223 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3224 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3225 for (value = 0; value < 256; value++)
3226 if (isALNUM(value))
3227 ANYOF_BITMAP_CLEAR(data->start_class, value);
3228 }
3229 }
3230 else {
3231 if (data->start_class->flags & ANYOF_LOCALE)
3232 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3233 else {
3234 for (value = 0; value < 256; value++)
3235 if (!isALNUM(value))
b81d288d 3236 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3237 }
3238 }
3239 break;
3240 case NALNUML:
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (data->start_class->flags & ANYOF_LOCALE)
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3244 }
3245 else {
3246 data->start_class->flags |= ANYOF_LOCALE;
3247 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3248 }
3249 break;
3250 case SPACE:
3251 if (flags & SCF_DO_STCLASS_AND) {
3252 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3253 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3254 for (value = 0; value < 256; value++)
3255 if (!isSPACE(value))
3256 ANYOF_BITMAP_CLEAR(data->start_class, value);
3257 }
3258 }
3259 else {
3260 if (data->start_class->flags & ANYOF_LOCALE)
3261 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3262 else {
3263 for (value = 0; value < 256; value++)
3264 if (isSPACE(value))
b81d288d 3265 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3266 }
3267 }
3268 break;
3269 case SPACEL:
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (data->start_class->flags & ANYOF_LOCALE)
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3273 }
3274 else {
3275 data->start_class->flags |= ANYOF_LOCALE;
3276 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3277 }
3278 break;
3279 case NSPACE:
3280 if (flags & SCF_DO_STCLASS_AND) {
3281 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3282 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3283 for (value = 0; value < 256; value++)
3284 if (isSPACE(value))
3285 ANYOF_BITMAP_CLEAR(data->start_class, value);
3286 }
3287 }
3288 else {
3289 if (data->start_class->flags & ANYOF_LOCALE)
3290 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3291 else {
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
b81d288d 3294 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3295 }
3296 }
3297 break;
3298 case NSPACEL:
3299 if (flags & SCF_DO_STCLASS_AND) {
3300 if (data->start_class->flags & ANYOF_LOCALE) {
3301 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3302 for (value = 0; value < 256; value++)
3303 if (!isSPACE(value))
3304 ANYOF_BITMAP_CLEAR(data->start_class, value);
3305 }
3306 }
3307 else {
3308 data->start_class->flags |= ANYOF_LOCALE;
3309 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3310 }
3311 break;
3312 case DIGIT:
3313 if (flags & SCF_DO_STCLASS_AND) {
3314 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3315 for (value = 0; value < 256; value++)
3316 if (!isDIGIT(value))
3317 ANYOF_BITMAP_CLEAR(data->start_class, value);
3318 }
3319 else {
3320 if (data->start_class->flags & ANYOF_LOCALE)
3321 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3322 else {
3323 for (value = 0; value < 256; value++)
3324 if (isDIGIT(value))
b81d288d 3325 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3326 }
3327 }
3328 break;
3329 case NDIGIT:
3330 if (flags & SCF_DO_STCLASS_AND) {
3331 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3332 for (value = 0; value < 256; value++)
3333 if (isDIGIT(value))
3334 ANYOF_BITMAP_CLEAR(data->start_class, value);
3335 }
3336 else {
3337 if (data->start_class->flags & ANYOF_LOCALE)
3338 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3339 else {
3340 for (value = 0; value < 256; value++)
3341 if (!isDIGIT(value))
b81d288d 3342 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3343 }
3344 }
3345 break;
3346 }
3347 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3348 cl_and(data->start_class, and_withp);
653099ff
GS
3349 flags &= ~SCF_DO_STCLASS;
3350 }
a0ed51b3 3351 }
3dab1dad 3352 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3353 data->flags |= (OP(scan) == MEOL
3354 ? SF_BEFORE_MEOL
3355 : SF_BEFORE_SEOL);
a0ed51b3 3356 }
3dab1dad 3357 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3358 /* Lookbehind, or need to calculate parens/evals/stclass: */
3359 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3360 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3361 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3362 || OP(scan) == UNLESSM )
3363 {
3364 /* Negative Lookahead/lookbehind
3365 In this case we can't do fixed string optimisation.
3366 */
3367
3368 I32 deltanext, minnext, fake = 0;
3369 regnode *nscan;
3370 struct regnode_charclass_class intrnl;
3371 int f = 0;
3372
3373 data_fake.flags = 0;
3374 if (data) {
3375 data_fake.whilem_c = data->whilem_c;
3376 data_fake.last_closep = data->last_closep;
a0ed51b3 3377 }
1de06328
YO
3378 else
3379 data_fake.last_closep = &fake;
3380 if ( flags & SCF_DO_STCLASS && !scan->flags
3381 && OP(scan) == IFMATCH ) { /* Lookahead */
3382 cl_init(pRExC_state, &intrnl);
3383 data_fake.start_class = &intrnl;
3384 f |= SCF_DO_STCLASS_AND;
c277df42 3385 }
1de06328
YO
3386 if (flags & SCF_WHILEM_VISITED_POS)
3387 f |= SCF_WHILEM_VISITED_POS;
3388 next = regnext(scan);
3389 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3390 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3391 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3392 if (scan->flags) {
3393 if (deltanext) {
3394 vFAIL("Variable length lookbehind not implemented");
3395 }
3396 else if (minnext > (I32)U8_MAX) {
3397 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3398 }
3399 scan->flags = (U8)minnext;
3400 }
3401 if (data) {
3402 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3403 pars++;
3404 if (data_fake.flags & SF_HAS_EVAL)
3405 data->flags |= SF_HAS_EVAL;
3406 data->whilem_c = data_fake.whilem_c;
3407 }
3408 if (f & SCF_DO_STCLASS_AND) {
3409 const int was = (data->start_class->flags & ANYOF_EOS);
3410
3411 cl_and(data->start_class, &intrnl);
3412 if (was)
3413 data->start_class->flags |= ANYOF_EOS;
3414 }
be8e71aa 3415 }
1de06328
YO
3416#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3417 else {
3418 /* Positive Lookahead/lookbehind
3419 In this case we can do fixed string optimisation,
3420 but we must be careful about it. Note in the case of
3421 lookbehind the positions will be offset by the minimum
3422 length of the pattern, something we won't know about
3423 until after the recurse.
3424 */
3425 I32 deltanext, fake = 0;
3426 regnode *nscan;
3427 struct regnode_charclass_class intrnl;
3428 int f = 0;
3429 /* We use SAVEFREEPV so that when the full compile
3430 is finished perl will clean up the allocated
3431 minlens when its all done. This was we don't
3432 have to worry about freeing them when we know
3433 they wont be used, which would be a pain.
3434 */
3435 I32 *minnextp;
3436 Newx( minnextp, 1, I32 );
3437 SAVEFREEPV(minnextp);
3438
3439 if (data) {
3440 StructCopy(data, &data_fake, scan_data_t);
3441 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3442 f |= SCF_DO_SUBSTR;
3443 if (scan->flags)
3444 scan_commit(pRExC_state, &data_fake,minlenp);
3445 data_fake.last_found=newSVsv(data->last_found);
3446 }
3447 }
3448 else
3449 data_fake.last_closep = &fake;
3450 data_fake.flags = 0;
3451 if (is_inf)
3452 data_fake.flags |= SF_IS_INF;
3453 if ( flags & SCF_DO_STCLASS && !scan->flags
3454 && OP(scan) == IFMATCH ) { /* Lookahead */
3455 cl_init(pRExC_state, &intrnl);
3456 data_fake.start_class = &intrnl;
3457 f |= SCF_DO_STCLASS_AND;
3458 }
3459 if (flags & SCF_WHILEM_VISITED_POS)
3460 f |= SCF_WHILEM_VISITED_POS;
3461 next = regnext(scan);
3462 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3463
3464 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3465 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3466 if (scan->flags) {
3467 if (deltanext) {
3468 vFAIL("Variable length lookbehind not implemented");
3469 }
3470 else if (*minnextp > (I32)U8_MAX) {
3471 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3472 }
3473 scan->flags = (U8)*minnextp;
3474 }
40d049e4 3475
1de06328 3476 *minnextp += min;
40d049e4 3477
1de06328
YO
3478 if (f & SCF_DO_STCLASS_AND) {
3479 const int was = (data->start_class->flags & ANYOF_EOS);
3480
3481 cl_and(data->start_class, &intrnl);
3482 if (was)
3483 data->start_class->flags |= ANYOF_EOS;
40d049e4 3484 }
1de06328
YO
3485 if (data) {
3486 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3487 pars++;
3488 if (data_fake.flags & SF_HAS_EVAL)
3489 data->flags |= SF_HAS_EVAL;
3490 data->whilem_c = data_fake.whilem_c;
3491 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3492 if (RExC_rx->minlen<*minnextp)
3493 RExC_rx->minlen=*minnextp;
3494 scan_commit(pRExC_state, &data_fake, minnextp);
3495 SvREFCNT_dec(data_fake.last_found);
3496
3497 if ( data_fake.minlen_fixed != minlenp )
3498 {
3499 data->offset_fixed= data_fake.offset_fixed;
3500 data->minlen_fixed= data_fake.minlen_fixed;
3501 data->lookbehind_fixed+= scan->flags;
3502 }
3503 if ( data_fake.minlen_float != minlenp )
3504 {
3505 data->minlen_float= data_fake.minlen_float;
3506 data->offset_float_min=data_fake.offset_float_min;
3507 data->offset_float_max=data_fake.offset_float_max;
3508 data->lookbehind_float+= scan->flags;
3509 }
3510 }
3511 }
3512
653099ff 3513
653099ff 3514 }
1de06328 3515#endif
a0ed51b3
LW
3516 }
3517 else if (OP(scan) == OPEN) {
40d049e4
YO
3518 if (stopparen != (I32)ARG(scan))
3519 pars++;
a0ed51b3 3520 }
cb434fcc 3521 else if (OP(scan) == CLOSE) {
40d049e4
YO
3522 if (stopparen == (I32)ARG(scan)) {
3523 break;
3524 }
eb160463 3525 if ((I32)ARG(scan) == is_par) {
cb434fcc 3526 next = regnext(scan);
c277df42 3527
cb434fcc
IZ
3528 if ( next && (OP(next) != WHILEM) && next < last)
3529 is_par = 0; /* Disable optimization */
3530 }
3531 if (data)
3532 *(data->last_closep) = ARG(scan);
a0ed51b3 3533 }
1a147d38 3534 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3535 /* set the pointer */
3536 I32 paren;
3537 regnode *start;
3538 regnode *end;
1a147d38 3539 if (OP(scan) == GOSUB) {
40d049e4
YO
3540 paren = ARG(scan);
3541 RExC_recurse[ARG2L(scan)] = scan;
3542 start = RExC_open_parens[paren-1];
3543 end = RExC_close_parens[paren-1];
3544 } else {
3545 paren = 0;
3546 start = RExC_rx->program + 1;
3547 end = RExC_opend;
3548 }
3549 assert(start);
3550 assert(end);
3551 if (!recursed) {
3552 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3553 SAVEFREEPV(recursed);
3554 }
3555 if (!PAREN_TEST(recursed,paren+1)) {
3556 I32 deltanext = 0;
3557 PAREN_SET(recursed,paren+1);
3558
3559 DEBUG_PEEP("goto",start,depth);
3560 min += study_chunk(
3561 pRExC_state,
3562 &start,
3563 minlenp,
3564 &deltanext,
3565 end+1,
3566 data,
3567 paren,
3568 recursed,
3569 and_withp,
3570 flags,depth+1);
3571 delta+=deltanext;
3572 if (deltanext == I32_MAX) {
3573 is_inf = is_inf_internal = 1;
3574 delta=deltanext;
3575 }
3576 DEBUG_PEEP("rtrn",end,depth);
3577 PAREN_UNSET(recursed,paren+1);
3578 } else {
3579 if (flags & SCF_DO_SUBSTR) {
3580 scan_commit(pRExC_state,data,minlenp);
3581 data->longest = &(data->longest_float);
3582 }
3583 is_inf = is_inf_internal = 1;
3584 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3585 cl_anything(pRExC_state, data->start_class);
3586 flags &= ~SCF_DO_STCLASS;
3587 }
3588 }
a0ed51b3 3589 else if (OP(scan) == EVAL) {
c277df42
IZ
3590 if (data)
3591 data->flags |= SF_HAS_EVAL;
3592 }
e2e6a0f1 3593 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3594 if (flags & SCF_DO_SUBSTR) {
3595 scan_commit(pRExC_state,data,minlenp);
3596 flags &= ~SCF_DO_SUBSTR;
3597 }
e2e6a0f1
YO
3598 if (data && OP(scan)==ACCEPT) {
3599 data->flags |= SCF_SEEN_ACCEPT;
3600 if (stopmin > min)
3601 stopmin = min;
3602 }
40d049e4
YO
3603 }
3604 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3605 {
0f5d15d6 3606 if (flags & SCF_DO_SUBSTR) {
1de06328 3607 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3608 data->longest = &(data->longest_float);
3609 }
3610 is_inf = is_inf_internal = 1;
653099ff 3611 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3612 cl_anything(pRExC_state, data->start_class);
96776eda 3613 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3614 }
786e8c11 3615#ifdef TRIE_STUDY_OPT
40d049e4 3616#ifdef FULL_TRIE_STUDY
786e8c11
YO
3617 else if (PL_regkind[OP(scan)] == TRIE) {
3618 /* NOTE - There is similar code to this block above for handling
40d049e4 3619 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3620 check there too. */
7f69552c 3621 regnode *trie_node= scan;
786e8c11
YO
3622 regnode *tail= regnext(scan);
3623 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3624 I32 max1 = 0, min1 = I32_MAX;
3625 struct regnode_charclass_class accum;
3626
3627 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3628 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3629 if (flags & SCF_DO_STCLASS)
3630 cl_init_zero(pRExC_state, &accum);
3631
3632 if (!trie->jump) {
3633 min1= trie->minlen;
3634 max1= trie->maxlen;
3635 } else {
3636 const regnode *nextbranch= NULL;
3637 U32 word;
3638
3639 for ( word=1 ; word <= trie->wordcount ; word++)
3640 {
3641 I32 deltanext=0, minnext=0, f = 0, fake;
3642 struct regnode_charclass_class this_class;
3643
3644 data_fake.flags = 0;
3645 if (data) {
3646 data_fake.whilem_c = data->whilem_c;
3647 data_fake.last_closep = data->last_closep;
3648 }
3649 else
3650 data_fake.last_closep = &fake;
3651
3652 if (flags & SCF_DO_STCLASS) {
3653 cl_init(pRExC_state, &this_class);
3654 data_fake.start_class = &this_class;
3655 f = SCF_DO_STCLASS_AND;
3656 }
3657 if (flags & SCF_WHILEM_VISITED_POS)
3658 f |= SCF_WHILEM_VISITED_POS;
3659
3660 if (trie->jump[word]) {
3661 if (!nextbranch)
7f69552c
YO
3662 nextbranch = trie_node + trie->jump[0];
3663 scan= trie_node + trie->jump[word];
786e8c11
YO
3664 /* We go from the jump point to the branch that follows
3665 it. Note this means we need the vestigal unused branches
3666 even though they arent otherwise used.
3667 */
40d049e4
YO
3668 minnext = study_chunk(pRExC_state, &scan, minlenp,
3669 &deltanext, (regnode *)nextbranch, &data_fake,
3670 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3671 }
3672 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3673 nextbranch= regnext((regnode*)nextbranch);
3674
3675 if (min1 > (I32)(minnext + trie->minlen))
3676 min1 = minnext + trie->minlen;
3677 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3678 max1 = minnext + deltanext + trie->maxlen;
3679 if (deltanext == I32_MAX)
3680 is_inf = is_inf_internal = 1;
3681
3682 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3683 pars++;
e2e6a0f1
YO
3684 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3685 if ( stopmin > min + min1)
3686 stopmin = min + min1;
3687 flags &= ~SCF_DO_SUBSTR;
3688 if (data)
3689 data->flags |= SCF_SEEN_ACCEPT;
3690 }
786e8c11
YO
3691 if (data) {
3692 if (data_fake.flags & SF_HAS_EVAL)
3693 data->flags |= SF_HAS_EVAL;
3694 data->whilem_c = data_fake.whilem_c;
3695 }
3696 if (flags & SCF_DO_STCLASS)
3697 cl_or(pRExC_state, &accum, &this_class);
3698 }
3699 }
3700 if (flags & SCF_DO_SUBSTR) {
3701 data->pos_min += min1;
3702 data->pos_delta += max1 - min1;
3703 if (max1 != min1 || is_inf)
3704 data->longest = &(data->longest_float);
3705 }
3706 min += min1;
3707 delta += max1 - min1;
3708 if (flags & SCF_DO_STCLASS_OR) {
3709 cl_or(pRExC_state, data->start_class, &accum);
3710 if (min1) {
40d049e4 3711 cl_and(data->start_class, and_withp);
786e8c11
YO
3712 flags &= ~SCF_DO_STCLASS;
3713 }
3714 }
3715 else if (flags & SCF_DO_STCLASS_AND) {
3716 if (min1) {
3717 cl_and(data->start_class, &accum);
3718 flags &= ~SCF_DO_STCLASS;
3719 }
3720 else {
3721 /* Switch to OR mode: cache the old value of
3722 * data->start_class */
40d049e4
YO
3723 INIT_AND_WITHP;
3724 StructCopy(data->start_class, and_withp,
786e8c11
YO
3725 struct regnode_charclass_class);
3726 flags &= ~SCF_DO_STCLASS_AND;
3727 StructCopy(&accum, data->start_class,
3728 struct regnode_charclass_class);
3729 flags |= SCF_DO_STCLASS_OR;
3730 data->start_class->flags |= ANYOF_EOS;
3731 }
3732 }
3733 scan= tail;
3734 continue;
3735 }
3736#else
3737 else if (PL_regkind[OP(scan)] == TRIE) {
3738 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3739 U8*bang=NULL;
3740
3741 min += trie->minlen;
3742 delta += (trie->maxlen - trie->minlen);
3743 flags &= ~SCF_DO_STCLASS; /* xxx */
3744 if (flags & SCF_DO_SUBSTR) {
1de06328 3745 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3746 data->pos_min += trie->minlen;
3747 data->pos_delta += (trie->maxlen - trie->minlen);
3748 if (trie->maxlen != trie->minlen)
3749 data->longest = &(data->longest_float);
3750 }
3751 if (trie->jump) /* no more substrings -- for now /grr*/
3752 flags &= ~SCF_DO_SUBSTR;
3753 }
3754#endif /* old or new */
3755#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3756 /* Else: zero-length, ignore. */
3757 scan = regnext(scan);
3758 }
3759
3760 finish:
3761 *scanp = scan;
aca2d497 3762 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3763 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3764 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3765 if (is_par > (I32)U8_MAX)
c277df42
IZ
3766 is_par = 0;
3767 if (is_par && pars==1 && data) {
3768 data->flags |= SF_IN_PAR;
3769 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3770 }
3771 else if (pars && data) {
c277df42
IZ
3772 data->flags |= SF_HAS_PAR;
3773 data->flags &= ~SF_IN_PAR;
3774 }
653099ff 3775 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3776 cl_and(data->start_class, and_withp);
786e8c11
YO
3777 if (flags & SCF_TRIE_RESTUDY)
3778 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3779
3780 DEBUG_STUDYDATA(data,depth);
3781
e2e6a0f1 3782 return min < stopmin ? min : stopmin;
c277df42
IZ
3783}
3784
76e3520e 3785STATIC I32
5f66b61c 3786S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3787{
830247a4 3788 if (RExC_rx->data) {
2eb97020 3789 const U32 count = RExC_rx->data->count;
b81d288d 3790 Renewc(RExC_rx->data,
2eb97020 3791 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3792 char, struct reg_data);
2eb97020 3793 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3794 RExC_rx->data->count += n;
a0ed51b3
LW
3795 }
3796 else {
a02a5408 3797 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3798 char, struct reg_data);
a02a5408 3799 Newx(RExC_rx->data->what, n, U8);
830247a4 3800 RExC_rx->data->count = n;
c277df42 3801 }
830247a4
IZ
3802 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3803 return RExC_rx->data->count - n;
c277df42
IZ
3804}
3805
76234dfb 3806#ifndef PERL_IN_XSUB_RE
d88dccdf 3807void
864dbfa3 3808Perl_reginitcolors(pTHX)
d88dccdf 3809{
97aff369 3810 dVAR;
1df70142 3811 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3812 if (s) {
1df70142
AL
3813 char *t = savepv(s);
3814 int i = 0;
3815 PL_colors[0] = t;
d88dccdf 3816 while (++i < 6) {
1df70142
AL
3817 t = strchr(t, '\t');
3818 if (t) {
3819 *t = '\0';
3820 PL_colors[i] = ++t;
d88dccdf
IZ
3821 }
3822 else
1df70142 3823 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3824 }
3825 } else {
1df70142 3826 int i = 0;
b81d288d 3827 while (i < 6)
06b5626a 3828 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3829 }
3830 PL_colorset = 1;
3831}
76234dfb 3832#endif
8615cb43 3833
07be1b83 3834
786e8c11
YO
3835#ifdef TRIE_STUDY_OPT
3836#define CHECK_RESTUDY_GOTO \
3837 if ( \
3838 (data.flags & SCF_TRIE_RESTUDY) \
3839 && ! restudied++ \
3840 ) goto reStudy
3841#else
3842#define CHECK_RESTUDY_GOTO
3843#endif
f9f4320a 3844
a687059c 3845/*
e50aee73 3846 - pregcomp - compile a regular expression into internal code
a687059c
LW
3847 *
3848 * We can't allocate space until we know how big the compiled form will be,
3849 * but we can't compile it (and thus know how big it is) until we've got a
3850 * place to put the code. So we cheat: we compile it twice, once with code
3851 * generation turned off and size counting turned on, and once "for real".
3852 * This also means that we don't allocate space until we are sure that the
3853 * thing really will compile successfully, and we never have to move the
3854 * code and thus invalidate pointers into it. (Note that it has to be in
3855 * one piece because free() must be able to free it all.) [NB: not true in perl]
3856 *
3857 * Beware that the optimization-preparation code in here knows about some
3858 * of the structure of the compiled regexp. [I'll say.]
3859 */
b9b4dddf
YO
3860
3861
3862
f9f4320a 3863#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3864#define RE_ENGINE_PTR &PL_core_reg_engine
3865#else
f9f4320a
YO
3866extern const struct regexp_engine my_reg_engine;
3867#define RE_ENGINE_PTR &my_reg_engine
3868#endif
b9b4dddf
YO
3869/* these make a few things look better, to avoid indentation */
3870#define BEGIN_BLOCK {
f9f4320a
YO
3871#define END_BLOCK }
3872
a687059c 3873regexp *
864dbfa3 3874Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3875{
97aff369 3876 dVAR;
f9f4320a
YO
3877 GET_RE_DEBUG_FLAGS_DECL;
3878 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3879#ifndef PERL_IN_XSUB_RE
3880 BEGIN_BLOCK
f9f4320a
YO
3881 /* Dispatch a request to compile a regexp to correct
3882 regexp engine. */
3883 HV * const table = GvHV(PL_hintgv);
3884 if (table) {
3885 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3886 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3887 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3888 DEBUG_COMPILE_r({
8d8756e7 3889 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3890 SvIV(*ptr));
3891 });
f2f78491 3892 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3893 }
b9b4dddf
YO
3894 }
3895 END_BLOCK
3896#endif
3897 BEGIN_BLOCK
a0d0e21e 3898 register regexp *r;
c277df42 3899 regnode *scan;
c277df42 3900 regnode *first;
a0d0e21e 3901 I32 flags;
a0d0e21e
LW
3902 I32 minlen = 0;
3903 I32 sawplus = 0;
3904 I32 sawopen = 0;
2c2d71f5 3905 scan_data_t data;
830247a4 3906 RExC_state_t RExC_state;
be8e71aa 3907 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3908#ifdef TRIE_STUDY_OPT
3909 int restudied= 0;
3910 RExC_state_t copyRExC_state;
3911#endif
a0d0e21e 3912 if (exp == NULL)
c277df42 3913 FAIL("NULL regexp argument");
a0d0e21e 3914
a5961de5 3915 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3916
5cfc7842 3917 RExC_precomp = exp;
a3621e74 3918 DEBUG_COMPILE_r({
ab3bbdeb
YO
3919 SV *dsv= sv_newmortal();
3920 RE_PV_QUOTED_DECL(s, RExC_utf8,
3921 dsv, RExC_precomp, (xend - exp), 60);
3922 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3923 PL_colors[4],PL_colors[5],s);
a5961de5 3924 });
e2509266 3925 RExC_flags = pm->op_pmflags;
830247a4 3926 RExC_sawback = 0;
bbce6d69 3927
830247a4
IZ
3928 RExC_seen = 0;
3929 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3930 RExC_seen_evals = 0;
3931 RExC_extralen = 0;
c277df42 3932
bbce6d69 3933 /* First pass: determine size, legality. */
830247a4 3934 RExC_parse = exp;
fac92740 3935 RExC_start = exp;
830247a4
IZ
3936 RExC_end = xend;
3937 RExC_naughty = 0;
3938 RExC_npar = 1;
e2e6a0f1 3939 RExC_nestroot = 0;
830247a4
IZ
3940 RExC_size = 0L;
3941 RExC_emit = &PL_regdummy;
3942 RExC_whilem_seen = 0;
fc8cd66c 3943 RExC_charnames = NULL;
40d049e4
YO
3944 RExC_open_parens = NULL;
3945 RExC_close_parens = NULL;
3946 RExC_opend = NULL;
81714fb9 3947 RExC_paren_names = NULL;
40d049e4
YO
3948 RExC_recurse = NULL;
3949 RExC_recurse_count = 0;
81714fb9 3950
85ddcde9
JH
3951#if 0 /* REGC() is (currently) a NOP at the first pass.
3952 * Clever compilers notice this and complain. --jhi */
830247a4 3953 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3954#endif
3dab1dad
YO
3955 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3956 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3957 RExC_precomp = NULL;
a0d0e21e
LW
3958 return(NULL);
3959 }
07be1b83 3960 DEBUG_PARSE_r({
81714fb9
YO
3961 PerlIO_printf(Perl_debug_log,
3962 "Required size %"IVdf" nodes\n"
3963 "Starting second pass (creation)\n",
3964 (IV)RExC_size);
07be1b83
YO
3965 RExC_lastnum=0;
3966 RExC_lastparse=NULL;
3967 });
c277df42
IZ
3968 /* Small enough for pointer-storage convention?
3969 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3970 if (RExC_size >= 0x10000L && RExC_extralen)
3971 RExC_size += RExC_extralen;
c277df42 3972 else
830247a4
IZ
3973 RExC_extralen = 0;
3974 if (RExC_whilem_seen > 15)
3975 RExC_whilem_seen = 15;
a0d0e21e 3976
e2e6a0f1
YO
3977#ifdef DEBUGGING
3978 /* Make room for a sentinel value at the end of the program */
3979 RExC_size++;
3980#endif
3981
f9f4320a
YO
3982 /* Allocate space and zero-initialize. Note, the two step process
3983 of zeroing when in debug mode, thus anything assigned has to
3984 happen after that */
a02a5408 3985 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3986 char, regexp);
a0d0e21e 3987 if (r == NULL)
b45f050a 3988 FAIL("Regexp out of space");
0f79a09d
GS
3989#ifdef DEBUGGING
3990 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3991 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3992#endif
f9f4320a
YO
3993 /* initialization begins here */
3994 r->engine= RE_ENGINE_PTR;
c277df42 3995 r->refcnt = 1;
bbce6d69 3996 r->prelen = xend - exp;
5cfc7842 3997 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3998 r->subbeg = NULL;
f8c7b90f 3999#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4000 r->saved_copy = NULL;
ed252734 4001#endif
cf93c79d 4002 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4003 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4004 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4005
4006 r->substrs = 0; /* Useful during FAIL. */
4007 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4008 r->endp = 0;
81714fb9
YO
4009 r->paren_names = 0;
4010
6bda09f9 4011 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4012 Newxz(RExC_open_parens, RExC_npar,regnode *);
4013 SAVEFREEPV(RExC_open_parens);
4014 Newxz(RExC_close_parens,RExC_npar,regnode *);
4015 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4016 }
4017
4018 /* Useful during FAIL. */
a02a5408 4019 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4020 if (r->offsets) {
2af232bd 4021 r->offsets[0] = RExC_size;
fac92740 4022 }
a3621e74 4023 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4024 "%s %"UVuf" bytes for offset annotations.\n",
4025 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4026 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4027
830247a4 4028 RExC_rx = r;
bbce6d69 4029
4030 /* Second pass: emit code. */
e2509266 4031 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4032 RExC_parse = exp;
4033 RExC_end = xend;
4034 RExC_naughty = 0;
4035 RExC_npar = 1;
fac92740 4036 RExC_emit_start = r->program;
830247a4 4037 RExC_emit = r->program;
e2e6a0f1
YO
4038#ifdef DEBUGGING
4039 /* put a sentinal on the end of the program so we can check for
4040 overwrites */
4041 r->program[RExC_size].type = 255;
4042#endif
2cd61cdb 4043 /* Store the count of eval-groups for security checks: */
786e8c11 4044 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4045 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4046 r->data = 0;
3dab1dad 4047 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4048 return(NULL);
6bda09f9 4049
07be1b83
YO
4050 /* XXXX To minimize changes to RE engine we always allocate
4051 3-units-long substrs field. */
4052 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4053 if (RExC_recurse_count) {
4054 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4055 SAVEFREEPV(RExC_recurse);
4056 }
a0d0e21e 4057
07be1b83 4058reStudy:
1de06328 4059 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4060 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4061
07be1b83
YO
4062#ifdef TRIE_STUDY_OPT
4063 if ( restudied ) {
4064 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4065 RExC_state=copyRExC_state;
1de06328 4066 if (data.last_found) {
07be1b83 4067 SvREFCNT_dec(data.longest_fixed);
07be1b83 4068 SvREFCNT_dec(data.longest_float);
07be1b83 4069 SvREFCNT_dec(data.last_found);
1de06328 4070 }
40d049e4 4071 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4072 } else {
40d049e4 4073 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83
YO
4074 copyRExC_state=RExC_state;
4075 }
40d049e4
YO
4076#else
4077 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4078#endif
fc8cd66c 4079
a0d0e21e 4080 /* Dig out information for optimizations. */
cf93c79d 4081 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4082 pm->op_pmflags = RExC_flags;
a0ed51b3 4083 if (UTF)
5ff6fc6d 4084 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4085 r->regstclass = NULL;
830247a4 4086 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4087 r->reganch |= ROPT_NAUGHTY;
c277df42 4088 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4089
1de06328
YO
4090 /* testing for BRANCH here tells us whether there is "must appear"
4091 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4092 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4093 I32 fake;
c5254dd6 4094 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4095 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4096 int stclass_flag;
07be1b83 4097 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4098
4099 first = scan;
c277df42 4100 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4101 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4102 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4103 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4104 /* for now we can't handle lookbehind IFMATCH*/
4105 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4106 (OP(first) == PLUS) ||
4107 (OP(first) == MINMOD) ||
653099ff 4108 /* An {n,m} with n>0 */
07be1b83
YO
4109 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4110 {
786e8c11 4111
a0d0e21e
LW
4112 if (OP(first) == PLUS)
4113 sawplus = 1;
4114 else
3dab1dad 4115 first += regarglen[OP(first)];
07be1b83
YO
4116 if (OP(first) == IFMATCH) {
4117 first = NEXTOPER(first);
4118 first += EXTRA_STEP_2ARGS;
7c167cea 4119 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4120 first = NEXTOPER(first);
a687059c
LW
4121 }
4122
a0d0e21e
LW
4123 /* Starting-point info. */
4124 again:
786e8c11 4125 DEBUG_PEEP("first:",first,0);
07be1b83 4126 /* Ignore EXACT as we deal with it later. */
3dab1dad 4127 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4128 if (OP(first) == EXACT)
6f207bd3 4129 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4130 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4131 r->regstclass = first;
4132 }
07be1b83 4133#ifdef TRIE_STCLASS
786e8c11 4134 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4135 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4136 {
786e8c11 4137 regnode *trie_op;
07be1b83 4138 /* this can happen only on restudy */
786e8c11
YO
4139 if ( OP(first) == TRIE ) {
4140 struct regnode_1 *trieop;
4141 Newxz(trieop,1,struct regnode_1);
4142 StructCopy(first,trieop,struct regnode_1);
4143 trie_op=(regnode *)trieop;
4144 } else {
4145 struct regnode_charclass *trieop;
4146 Newxz(trieop,1,struct regnode_charclass);
4147 StructCopy(first,trieop,struct regnode_charclass);
4148 trie_op=(regnode *)trieop;
4149 }
1de06328 4150 OP(trie_op)+=2;
786e8c11
YO
4151 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4152 r->regstclass = trie_op;
07be1b83
YO
4153 }
4154#endif
bfed75c6 4155 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4156 r->regstclass = first;
3dab1dad
YO
4157 else if (PL_regkind[OP(first)] == BOUND ||
4158 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4159 r->regstclass = first;
3dab1dad 4160 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4161 r->reganch |= (OP(first) == MBOL
4162 ? ROPT_ANCH_MBOL
4163 : (OP(first) == SBOL
4164 ? ROPT_ANCH_SBOL
4165 : ROPT_ANCH_BOL));
a0d0e21e 4166 first = NEXTOPER(first);
774d564b 4167 goto again;
4168 }
4169 else if (OP(first) == GPOS) {
4170 r->reganch |= ROPT_ANCH_GPOS;
4171 first = NEXTOPER(first);
4172 goto again;
a0d0e21e 4173 }
e09294f4 4174 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4175 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4176 !(r->reganch & ROPT_ANCH) )
4177 {
4178 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4179 const int type =
4180 (OP(NEXTOPER(first)) == REG_ANY)
4181 ? ROPT_ANCH_MBOL
4182 : ROPT_ANCH_SBOL;
cad2e5aa 4183 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4184 first = NEXTOPER(first);
774d564b 4185 goto again;
a0d0e21e 4186 }
b81d288d 4187 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4188 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4189 /* x+ must match at the 1st pos of run of x's */
4190 r->reganch |= ROPT_SKIP;
a0d0e21e 4191
c277df42 4192 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4193#ifdef TRIE_STUDY_OPT
81714fb9 4194 DEBUG_PARSE_r(
be8e71aa
YO
4195 if (!restudied)
4196 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4197 (IV)(first - scan + 1))
4198 );
4199#else
81714fb9 4200 DEBUG_PARSE_r(
be8e71aa
YO
4201 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4202 (IV)(first - scan + 1))
4203 );
4204#endif
4205
4206
a0d0e21e
LW
4207 /*
4208 * If there's something expensive in the r.e., find the
4209 * longest literal string that must appear and make it the
4210 * regmust. Resolve ties in favor of later strings, since
4211 * the regstart check works with the beginning of the r.e.
4212 * and avoiding duplication strengthens checking. Not a
4213 * strong reason, but sufficient in the absence of others.
4214 * [Now we resolve ties in favor of the earlier string if
c277df42 4215 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4216 * earlier string may buy us something the later one won't.]
4217 */
a0d0e21e 4218 minlen = 0;
a687059c 4219
396482e1
GA
4220 data.longest_fixed = newSVpvs("");
4221 data.longest_float = newSVpvs("");
4222 data.last_found = newSVpvs("");
c277df42
IZ
4223 data.longest = &(data.longest_fixed);
4224 first = scan;
653099ff 4225 if (!r->regstclass) {
830247a4 4226 cl_init(pRExC_state, &ch_class);
653099ff
GS
4227 data.start_class = &ch_class;
4228 stclass_flag = SCF_DO_STCLASS_AND;
4229 } else /* XXXX Check for BOUND? */
4230 stclass_flag = 0;
cb434fcc 4231 data.last_closep = &last_close;
653099ff 4232
1de06328 4233 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4234 &data, -1, NULL, NULL,
4235 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4236
07be1b83 4237
786e8c11
YO
4238 CHECK_RESTUDY_GOTO;
4239
4240
830247a4 4241 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4242 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4243 && !RExC_seen_zerolen
4244 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4245 r->reganch |= ROPT_CHECK_ALL;
1de06328 4246 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4247 SvREFCNT_dec(data.last_found);
4248
1de06328
YO
4249 /* Note that code very similar to this but for anchored string
4250 follows immediately below, changes may need to be made to both.
4251 Be careful.
4252 */
a0ed51b3 4253 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4254 if (longest_float_length
c277df42
IZ
4255 || (data.flags & SF_FL_BEFORE_EOL
4256 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4257 || (RExC_flags & PMf_MULTILINE))))
4258 {
1182767e 4259 I32 t,ml;
cf93c79d 4260
1de06328 4261 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4262 && data.offset_fixed == data.offset_float_min
4263 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4264 goto remove_float; /* As in (a)+. */
4265
1de06328
YO
4266 /* copy the information about the longest float from the reg_scan_data
4267 over to the program. */
33b8afdf
JH
4268 if (SvUTF8(data.longest_float)) {
4269 r->float_utf8 = data.longest_float;
c445ea15 4270 r->float_substr = NULL;
33b8afdf
JH
4271 } else {
4272 r->float_substr = data.longest_float;
c445ea15 4273 r->float_utf8 = NULL;
33b8afdf 4274 }
1de06328
YO
4275 /* float_end_shift is how many chars that must be matched that
4276 follow this item. We calculate it ahead of time as once the
4277 lookbehind offset is added in we lose the ability to correctly
4278 calculate it.*/
4279 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4280 : (I32)longest_float_length;
1de06328
YO
4281 r->float_end_shift = ml - data.offset_float_min
4282 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4283 + data.lookbehind_float;
4284 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4285 r->float_max_offset = data.offset_float_max;
1182767e 4286 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4287 r->float_max_offset -= data.lookbehind_float;
4288
cf93c79d
IZ
4289 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4290 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4291 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4292 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4293 }
4294 else {
aca2d497 4295 remove_float:
c445ea15 4296 r->float_substr = r->float_utf8 = NULL;
c277df42 4297 SvREFCNT_dec(data.longest_float);
c5254dd6 4298 longest_float_length = 0;
a0d0e21e 4299 }
c277df42 4300
1de06328
YO
4301 /* Note that code very similar to this but for floating string
4302 is immediately above, changes may need to be made to both.
4303 Be careful.
4304 */
a0ed51b3 4305 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4306 if (longest_fixed_length
c277df42
IZ
4307 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4308 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4309 || (RExC_flags & PMf_MULTILINE))))
4310 {
1182767e 4311 I32 t,ml;
cf93c79d 4312
1de06328
YO
4313 /* copy the information about the longest fixed
4314 from the reg_scan_data over to the program. */
33b8afdf
JH
4315 if (SvUTF8(data.longest_fixed)) {
4316 r->anchored_utf8 = data.longest_fixed;
c445ea15 4317 r->anchored_substr = NULL;
33b8afdf
JH
4318 } else {
4319 r->anchored_substr = data.longest_fixed;
c445ea15 4320 r->anchored_utf8 = NULL;
33b8afdf 4321 }
1de06328
YO
4322 /* fixed_end_shift is how many chars that must be matched that
4323 follow this item. We calculate it ahead of time as once the
4324 lookbehind offset is added in we lose the ability to correctly
4325 calculate it.*/
4326 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4327 : (I32)longest_fixed_length;
1de06328
YO
4328 r->anchored_end_shift = ml - data.offset_fixed
4329 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4330 + data.lookbehind_fixed;
4331 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4332
cf93c79d
IZ
4333 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4334 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4335 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4336 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4337 }
4338 else {
c445ea15 4339 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4340 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4341 longest_fixed_length = 0;
a0d0e21e 4342 }
b81d288d 4343 if (r->regstclass
ffc61ed2 4344 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4345 r->regstclass = NULL;
33b8afdf
JH
4346 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4347 && stclass_flag
653099ff 4348 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4349 && !cl_is_anything(data.start_class))
4350 {
1df70142 4351 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4352
a02a5408 4353 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4354 struct regnode_charclass_class);
4355 StructCopy(data.start_class,
830247a4 4356 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4357 struct regnode_charclass_class);
830247a4 4358 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4359 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4360 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4361 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4362 PerlIO_printf(Perl_debug_log,
a0288114 4363 "synthetic stclass \"%s\".\n",
3f7c398e 4364 SvPVX_const(sv));});
653099ff 4365 }
c277df42
IZ
4366
4367 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4368 if (longest_fixed_length > longest_float_length) {
1de06328 4369 r->check_end_shift = r->anchored_end_shift;
c277df42 4370 r->check_substr = r->anchored_substr;
33b8afdf 4371 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4372 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4373 if (r->reganch & ROPT_ANCH_SINGLE)
4374 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4375 }
4376 else {
1de06328 4377 r->check_end_shift = r->float_end_shift;
c277df42 4378 r->check_substr = r->float_substr;
33b8afdf 4379 r->check_utf8 = r->float_utf8;
1de06328
YO
4380 r->check_offset_min = r->float_min_offset;
4381 r->check_offset_max = r->float_max_offset;
a0d0e21e 4382 }
30382c73
IZ
4383 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4384 This should be changed ASAP! */
33b8afdf 4385 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4386 r->reganch |= RE_USE_INTUIT;
33b8afdf 4387 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4388 r->reganch |= RE_INTUIT_TAIL;
4389 }
1de06328
YO
4390 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4391 if ( (STRLEN)minlen < longest_float_length )
4392 minlen= longest_float_length;
4393 if ( (STRLEN)minlen < longest_fixed_length )
4394 minlen= longest_fixed_length;
4395 */
a0ed51b3
LW
4396 }
4397 else {
c277df42
IZ
4398 /* Several toplevels. Best we can is to set minlen. */
4399 I32 fake;
653099ff 4400 struct regnode_charclass_class ch_class;
cb434fcc 4401 I32 last_close = 0;
c277df42 4402
a3621e74 4403 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4404
c277df42 4405 scan = r->program + 1;
830247a4 4406 cl_init(pRExC_state, &ch_class);
653099ff 4407 data.start_class = &ch_class;
cb434fcc 4408 data.last_closep = &last_close;
07be1b83 4409
1de06328 4410 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4411 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
07be1b83 4412
786e8c11 4413 CHECK_RESTUDY_GOTO;
07be1b83 4414
33b8afdf 4415 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4416 = r->float_substr = r->float_utf8 = NULL;
653099ff 4417 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4418 && !cl_is_anything(data.start_class))
4419 {
1df70142 4420 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4421
a02a5408 4422 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4423 struct regnode_charclass_class);
4424 StructCopy(data.start_class,
830247a4 4425 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4426 struct regnode_charclass_class);
830247a4 4427 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4428 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4429 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4430 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4431 PerlIO_printf(Perl_debug_log,
a0288114 4432 "synthetic stclass \"%s\".\n",
3f7c398e 4433 SvPVX_const(sv));});
653099ff 4434 }
a0d0e21e
LW
4435 }
4436
1de06328
YO
4437 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4438 the "real" pattern. */
4439 if (r->minlen < minlen)
4440 r->minlen = minlen;
4441
b81d288d 4442 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4443 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4444 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4445 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4446 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4447 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4448 if (RExC_seen & REG_SEEN_CANY)
4449 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4450 if (RExC_seen & REG_SEEN_VERBARG)
4451 r->reganch |= ROPT_VERBARG_SEEN;
81714fb9
YO
4452 if (RExC_paren_names)
4453 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4454 else
4455 r->paren_names = NULL;
4456
40d049e4
YO
4457 if (RExC_recurse_count) {
4458 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4459 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4460 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4461 }
4462 }
a02a5408
JC
4463 Newxz(r->startp, RExC_npar, I32);
4464 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4465
f2278c82 4466 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4467 DEBUG_DUMP_r({
4468 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4469 regdump(r);
4470 });
8e9a8a48
YO
4471 DEBUG_OFFSETS_r(if (r->offsets) {
4472 const U32 len = r->offsets[0];
4473 U32 i;
4474 GET_RE_DEBUG_FLAGS_DECL;
4475 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4476 for (i = 1; i <= len; i++) {
4477 if (r->offsets[i*2-1] || r->offsets[i*2])
4478 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4479 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4480 }
4481 PerlIO_printf(Perl_debug_log, "\n");
4482 });
a0d0e21e 4483 return(r);
f9f4320a 4484 END_BLOCK
a687059c
LW
4485}
4486
f9f4320a
YO
4487#undef CORE_ONLY_BLOCK
4488#undef END_BLOCK
4489#undef RE_ENGINE_PTR
3dab1dad 4490
9af228c6 4491#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4492SV*
4493Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4494{
4495 I32 parno = 0; /* no match */
4496 if (PL_curpm) {
4497 const REGEXP * const rx = PM_GETRE(PL_curpm);
4498 if (rx && rx->paren_names) {
4499 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4500 if (he_str) {
4501 IV i;
4502 SV* sv_dat=HeVAL(he_str);
4503 I32 *nums=(I32*)SvPVX(sv_dat);
4504 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4505 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4506 rx->endp[nums[i]] != -1)
4507 {
4508 parno = nums[i];
4509 break;
4510 }
4511 }
4512 }
4513 }
4514 }
4515 if ( !parno ) {
4516 return 0;
4517 } else {
4518 GV *gv_paren;
4519 SV *sv= sv_newmortal();
4520 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4521 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4522 return GvSVn(gv_paren);
4523 }
4524}
9af228c6 4525#endif
0a4db386 4526
894be9b7 4527/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4528 * If flags is REG_RSN_RETURN_NULL returns null.
4529 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4530 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4531 * to the parsed name as looked up in the RExC_paren_names hash.
4532 * If there is an error throws a vFAIL().. type exception.
894be9b7 4533 */
0a4db386
YO
4534
4535#define REG_RSN_RETURN_NULL 0
4536#define REG_RSN_RETURN_NAME 1
4537#define REG_RSN_RETURN_DATA 2
4538
894be9b7
YO
4539STATIC SV*
4540S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4541 char *name_start = RExC_parse;
0a4db386 4542 if ( UTF ) {
894be9b7 4543 STRLEN numlen;
0a4db386
YO
4544 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4545 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4546 {
4547 RExC_parse += numlen;
4548 }
4549 } else {
4550 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4551 RExC_parse++;
4552 }
0a4db386
YO
4553 if ( flags ) {
4554 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4555 (int)(RExC_parse - name_start)));
894be9b7 4556 if (UTF)
0a4db386
YO
4557 SvUTF8_on(sv_name);
4558 if ( flags == REG_RSN_RETURN_NAME)
4559 return sv_name;
4560 else if (flags==REG_RSN_RETURN_DATA) {
4561 HE *he_str = NULL;
4562 SV *sv_dat = NULL;
4563 if ( ! sv_name ) /* should not happen*/
4564 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4565 if (RExC_paren_names)
4566 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4567 if ( he_str )
4568 sv_dat = HeVAL(he_str);
4569 if ( ! sv_dat )
4570 vFAIL("Reference to nonexistent named group");
4571 return sv_dat;
4572 }
4573 else {
4574 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4575 }
4576 /* NOT REACHED */
894be9b7 4577 }
0a4db386 4578 return NULL;
894be9b7
YO
4579}
4580
3dab1dad
YO
4581#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4582 int rem=(int)(RExC_end - RExC_parse); \
4583 int cut; \
4584 int num; \
4585 int iscut=0; \
4586 if (rem>10) { \
4587 rem=10; \
4588 iscut=1; \
4589 } \
4590 cut=10-rem; \
4591 if (RExC_lastparse!=RExC_parse) \
4592 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4593 rem, RExC_parse, \
4594 cut + 4, \
4595 iscut ? "..." : "<" \
4596 ); \
4597 else \
4598 PerlIO_printf(Perl_debug_log,"%16s",""); \
4599 \
4600 if (SIZE_ONLY) \
4601 num=RExC_size; \
4602 else \
4603 num=REG_NODE_NUM(RExC_emit); \
4604 if (RExC_lastnum!=num) \
0a4db386 4605 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4606 else \
0a4db386 4607 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4608 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4609 (int)((depth*2)), "", \
3dab1dad
YO
4610 (funcname) \
4611 ); \
4612 RExC_lastnum=num; \
4613 RExC_lastparse=RExC_parse; \
4614})
4615
07be1b83
YO
4616
4617
3dab1dad
YO
4618#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4619 DEBUG_PARSE_MSG((funcname)); \
4620 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4621})
6bda09f9
YO
4622#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4623 DEBUG_PARSE_MSG((funcname)); \
4624 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4625})
a687059c
LW
4626/*
4627 - reg - regular expression, i.e. main body or parenthesized thing
4628 *
4629 * Caller must absorb opening parenthesis.
4630 *
4631 * Combining parenthesis handling with the base level of regular expression
4632 * is a trifle forced, but the need to tie the tails of the branches to what
4633 * follows makes it hard to avoid.
4634 */
07be1b83
YO
4635#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4636#ifdef DEBUGGING
4637#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4638#else
4639#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4640#endif
3dab1dad 4641
e2e6a0f1
YO
4642/* this idea is borrowed from STR_WITH_LEN in handy.h */
4643#define CHECK_WORD(s,v,l) \
4644 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4645
76e3520e 4646STATIC regnode *
3dab1dad 4647S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4648 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4649{
27da23d5 4650 dVAR;
c277df42
IZ
4651 register regnode *ret; /* Will be the head of the group. */
4652 register regnode *br;
4653 register regnode *lastbr;
cbbf8932 4654 register regnode *ender = NULL;
a0d0e21e 4655 register I32 parno = 0;
cbbf8932
AL
4656 I32 flags;
4657 const I32 oregflags = RExC_flags;
6136c704
AL
4658 bool have_branch = 0;
4659 bool is_open = 0;
9d1d55b5
JP
4660
4661 /* for (?g), (?gc), and (?o) warnings; warning
4662 about (?c) will warn about (?g) -- japhy */
4663
6136c704
AL
4664#define WASTED_O 0x01
4665#define WASTED_G 0x02
4666#define WASTED_C 0x04
4667#define WASTED_GC (0x02|0x04)
cbbf8932 4668 I32 wastedflags = 0x00;
9d1d55b5 4669
fac92740 4670 char * parse_start = RExC_parse; /* MJD */
a28509cc 4671 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4672
3dab1dad
YO
4673 GET_RE_DEBUG_FLAGS_DECL;
4674 DEBUG_PARSE("reg ");
4675
4676
821b33a5 4677 *flagp = 0; /* Tentatively. */
a0d0e21e 4678
9d1d55b5 4679
a0d0e21e
LW
4680 /* Make an OPEN node, if parenthesized. */
4681 if (paren) {
e2e6a0f1
YO
4682 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4683 char *start_verb = RExC_parse;
4684 STRLEN verb_len = 0;
4685 char *start_arg = NULL;
4686 unsigned char op = 0;
4687 int argok = 1;
4688 int internal_argval = 0; /* internal_argval is only useful if !argok */
4689 while ( *RExC_parse && *RExC_parse != ')' ) {
4690 if ( *RExC_parse == ':' ) {
4691 start_arg = RExC_parse + 1;
4692 break;
4693 }
4694 RExC_parse++;
4695 }
4696 ++start_verb;
4697 verb_len = RExC_parse - start_verb;
4698 if ( start_arg ) {
4699 RExC_parse++;
4700 while ( *RExC_parse && *RExC_parse != ')' )
4701 RExC_parse++;
4702 if ( *RExC_parse != ')' )
4703 vFAIL("Unterminated verb pattern argument");
4704 if ( RExC_parse == start_arg )
4705 start_arg = NULL;
4706 } else {
4707 if ( *RExC_parse != ')' )
4708 vFAIL("Unterminated verb pattern");
4709 }
4710 switch ( *start_verb ) {
4711 case 'A': /* (*ACCEPT) */
4712 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4713 op = ACCEPT;
4714 internal_argval = RExC_nestroot;
4715 }
4716 break;
4717 case 'C': /* (*COMMIT) */
4718 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4719 op = COMMIT;
4720 else if ( CHECK_WORD("CUT",start_verb,verb_len) )
4721 op = CUT;
4722 break;
4723 case 'F': /* (*FAIL) */
4724 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4725 op = OPFAIL;
4726 argok = 0;
4727 }
4728 break;
4729 case 'M':
4730 if ( CHECK_WORD("MARK",start_verb,verb_len) )
4731 op = MARKPOINT;
4732 break;
4733 case 'N': /* (*NOMATCH) */
4734 if ( CHECK_WORD("NOMATCH",start_verb,verb_len) )
4735 op = NOMATCH;
4736 break;
4737 }
4738 if ( ! op ) {
4739 RExC_parse++;
4740 vFAIL3("Unknown verb pattern '%.*s'",
4741 verb_len, start_verb);
4742 }
4743 if ( argok ) {
4744 if ( start_arg && internal_argval ) {
4745 vFAIL3("Verb pattern '%.*s' may not have an argument",
4746 verb_len, start_verb);
4747 } else if ( argok < 0 && !start_arg ) {
4748 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4749 verb_len, start_verb);
4750 } else {
4751 ret = reganode(pRExC_state, op, internal_argval);
4752 if ( ! internal_argval && ! SIZE_ONLY ) {
4753 if (start_arg) {
4754 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4755 ARG(ret) = add_data( pRExC_state, 1, "S" );
4756 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4757 ret->flags = 0;
4758 } else {
4759 ret->flags = 1;
4760 }
4761 }
4762 }
4763 if (!internal_argval)
4764 RExC_seen |= REG_SEEN_VERBARG;
4765 } else if ( start_arg ) {
4766 vFAIL3("Verb pattern '%.*s' may not have an argument",
4767 verb_len, start_verb);
4768 } else {
4769 ret = reg_node(pRExC_state, op);
4770 }
4771 nextchar(pRExC_state);
4772 return ret;
4773 } else
fac92740 4774 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4775 U32 posflags = 0, negflags = 0;
4776 U32 *flagsp = &posflags;
6136c704 4777 bool is_logical = 0;
a28509cc 4778 const char * const seqstart = RExC_parse;
ca9dfc88 4779
830247a4
IZ
4780 RExC_parse++;
4781 paren = *RExC_parse++;
c277df42 4782 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4783 switch (paren) {
894be9b7 4784
fac92740 4785 case '<': /* (?<...) */
b81d288d 4786 if (*RExC_parse == '!')
c277df42 4787 paren = ',';
0a4db386
YO
4788 else if (*RExC_parse != '=')
4789 { /* (?<...>) */
81714fb9 4790 char *name_start;
894be9b7 4791 SV *svname;
81714fb9
YO
4792 paren= '>';
4793 case '\'': /* (?'...') */
4794 name_start= RExC_parse;
0a4db386
YO
4795 svname = reg_scan_name(pRExC_state,
4796 SIZE_ONLY ? /* reverse test from the others */
4797 REG_RSN_RETURN_NAME :
4798 REG_RSN_RETURN_NULL);
81714fb9
YO
4799 if (RExC_parse == name_start)
4800 goto unknown;
4801 if (*RExC_parse != paren)
4802 vFAIL2("Sequence (?%c... not terminated",
4803 paren=='>' ? '<' : paren);
4804 if (SIZE_ONLY) {
e62cc96a
YO
4805 HE *he_str;
4806 SV *sv_dat = NULL;
894be9b7
YO
4807 if (!svname) /* shouldnt happen */
4808 Perl_croak(aTHX_
4809 "panic: reg_scan_name returned NULL");
81714fb9
YO
4810 if (!RExC_paren_names) {
4811 RExC_paren_names= newHV();
4812 sv_2mortal((SV*)RExC_paren_names);
4813 }
4814 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4815 if ( he_str )
81714fb9 4816 sv_dat = HeVAL(he_str);
e62cc96a 4817 if ( ! sv_dat ) {
81714fb9 4818 /* croak baby croak */
e62cc96a
YO
4819 Perl_croak(aTHX_
4820 "panic: paren_name hash element allocation failed");
4821 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4822 IV count=SvIV(sv_dat);
4823 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4824 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4825 pv[count]=RExC_npar;
4826 SvIVX(sv_dat)++;
4827 } else {
4828 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4829 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4830 SvIOK_on(sv_dat);
4831 SvIVX(sv_dat)= 1;
e62cc96a
YO
4832 }
4833
81714fb9
YO
4834 /*sv_dump(sv_dat);*/
4835 }
4836 nextchar(pRExC_state);
4837 paren = 1;
4838 goto capturing_parens;
4839 }
4840 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4841 RExC_parse++;
fac92740
MJD
4842 case '=': /* (?=...) */
4843 case '!': /* (?!...) */
830247a4 4844 RExC_seen_zerolen++;
e2e6a0f1
YO
4845 if (*RExC_parse == ')') {
4846 ret=reg_node(pRExC_state, OPFAIL);
4847 nextchar(pRExC_state);
4848 return ret;
4849 }
fac92740
MJD
4850 case ':': /* (?:...) */
4851 case '>': /* (?>...) */
a0d0e21e 4852 break;
fac92740
MJD
4853 case '$': /* (?$...) */
4854 case '@': /* (?@...) */
8615cb43 4855 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4856 break;
fac92740 4857 case '#': /* (?#...) */
830247a4
IZ
4858 while (*RExC_parse && *RExC_parse != ')')
4859 RExC_parse++;
4860 if (*RExC_parse != ')')
c277df42 4861 FAIL("Sequence (?#... not terminated");
830247a4 4862 nextchar(pRExC_state);
a0d0e21e
LW
4863 *flagp = TRYAGAIN;
4864 return NULL;
894be9b7
YO
4865 case '0' : /* (?0) */
4866 case 'R' : /* (?R) */
4867 if (*RExC_parse != ')')
6bda09f9 4868 FAIL("Sequence (?R) not terminated");
1a147d38 4869 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4870 nextchar(pRExC_state);
4871 return ret;
4872 /*notreached*/
894be9b7
YO
4873 { /* named and numeric backreferences */
4874 I32 num;
4875 char * parse_start;
4876 case '&': /* (?&NAME) */
4877 parse_start = RExC_parse - 1;
4878 {
0a4db386
YO
4879 SV *sv_dat = reg_scan_name(pRExC_state,
4880 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4881 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4882 }
4883 goto gen_recurse_regop;
4884 /* NOT REACHED */
6bda09f9
YO
4885 case '1': case '2': case '3': case '4': /* (?1) */
4886 case '5': case '6': case '7': case '8': case '9':
4887 RExC_parse--;
894be9b7
YO
4888 num = atoi(RExC_parse);
4889 parse_start = RExC_parse - 1; /* MJD */
6bda09f9
YO
4890 while (isDIGIT(*RExC_parse))
4891 RExC_parse++;
4892 if (*RExC_parse!=')')
4893 vFAIL("Expecting close bracket");
894be9b7
YO
4894
4895 gen_recurse_regop:
1a147d38 4896 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4897 if (!SIZE_ONLY) {
4898 if (num > (I32)RExC_rx->nparens) {
4899 RExC_parse++;
4900 vFAIL("Reference to nonexistent group");
4901 }
40d049e4 4902 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4903 RExC_emit++;
226de585 4904 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4905 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4906 } else {
6bda09f9 4907 RExC_size++;
6bda09f9 4908 }
0a4db386 4909 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4910 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4911 Set_Node_Offset(ret, parse_start); /* MJD */
4912
6bda09f9
YO
4913 nextchar(pRExC_state);
4914 return ret;
894be9b7
YO
4915 } /* named and numeric backreferences */
4916 /* NOT REACHED */
4917
fac92740 4918 case 'p': /* (?p...) */
9014280d 4919 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4920 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4921 /* FALL THROUGH*/
fac92740 4922 case '?': /* (??...) */
6136c704 4923 is_logical = 1;
438a3801
YST
4924 if (*RExC_parse != '{')
4925 goto unknown;
830247a4 4926 paren = *RExC_parse++;
0f5d15d6 4927 /* FALL THROUGH */
fac92740 4928 case '{': /* (?{...}) */
c277df42 4929 {
c277df42
IZ
4930 I32 count = 1, n = 0;
4931 char c;
830247a4 4932 char *s = RExC_parse;
c277df42 4933
830247a4
IZ
4934 RExC_seen_zerolen++;
4935 RExC_seen |= REG_SEEN_EVAL;
4936 while (count && (c = *RExC_parse)) {
6136c704
AL
4937 if (c == '\\') {
4938 if (RExC_parse[1])
4939 RExC_parse++;
4940 }
b81d288d 4941 else if (c == '{')
c277df42 4942 count++;
b81d288d 4943 else if (c == '}')
c277df42 4944 count--;
830247a4 4945 RExC_parse++;
c277df42 4946 }
6136c704 4947 if (*RExC_parse != ')') {
b81d288d 4948 RExC_parse = s;
b45f050a
JF
4949 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4950 }
c277df42 4951 if (!SIZE_ONLY) {
f3548bdc 4952 PAD *pad;
6136c704
AL
4953 OP_4tree *sop, *rop;
4954 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4955
569233ed
SB
4956 ENTER;
4957 Perl_save_re_context(aTHX);
f3548bdc 4958 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4959 sop->op_private |= OPpREFCOUNTED;
4960 /* re_dup will OpREFCNT_inc */
4961 OpREFCNT_set(sop, 1);
569233ed 4962 LEAVE;
c277df42 4963
830247a4
IZ
4964 n = add_data(pRExC_state, 3, "nop");
4965 RExC_rx->data->data[n] = (void*)rop;
4966 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4967 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4968 SvREFCNT_dec(sv);
a0ed51b3 4969 }
e24b16f9 4970 else { /* First pass */
830247a4 4971 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4972 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4973 /* No compiled RE interpolated, has runtime
4974 components ===> unsafe. */
4975 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4976 if (PL_tainting && PL_tainted)
cc6b7395 4977 FAIL("Eval-group in insecure regular expression");
54df2634 4978#if PERL_VERSION > 8
923e4eb5 4979 if (IN_PERL_COMPILETIME)
b5c19bd7 4980 PL_cv_has_eval = 1;
54df2634 4981#endif
c277df42 4982 }
b5c19bd7 4983
830247a4 4984 nextchar(pRExC_state);
6136c704 4985 if (is_logical) {
830247a4 4986 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4987 if (!SIZE_ONLY)
4988 ret->flags = 2;
3dab1dad 4989 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4990 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4991 return ret;
4992 }
ccb2c380
MP
4993 ret = reganode(pRExC_state, EVAL, n);
4994 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4995 Set_Node_Offset(ret, parse_start);
4996 return ret;
c277df42 4997 }
fac92740 4998 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4999 {
0a4db386 5000 int is_define= 0;
fac92740 5001 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5002 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5003 || RExC_parse[1] == '<'
830247a4 5004 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5005 I32 flag;
5006
830247a4 5007 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5008 if (!SIZE_ONLY)
5009 ret->flags = 1;
3dab1dad 5010 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5011 goto insert_if;
b81d288d 5012 }
a0ed51b3 5013 }
0a4db386
YO
5014 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5015 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5016 {
5017 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5018 char *name_start= RExC_parse++;
5019 I32 num = 0;
5020 SV *sv_dat=reg_scan_name(pRExC_state,
5021 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5022 if (RExC_parse == name_start || *RExC_parse != ch)
5023 vFAIL2("Sequence (?(%c... not terminated",
5024 (ch == '>' ? '<' : ch));
5025 RExC_parse++;
5026 if (!SIZE_ONLY) {
5027 num = add_data( pRExC_state, 1, "S" );
5028 RExC_rx->data->data[num]=(void*)sv_dat;
5029 SvREFCNT_inc(sv_dat);
5030 }
5031 ret = reganode(pRExC_state,NGROUPP,num);
5032 goto insert_if_check_paren;
5033 }
5034 else if (RExC_parse[0] == 'D' &&
5035 RExC_parse[1] == 'E' &&
5036 RExC_parse[2] == 'F' &&
5037 RExC_parse[3] == 'I' &&
5038 RExC_parse[4] == 'N' &&
5039 RExC_parse[5] == 'E')
5040 {
5041 ret = reganode(pRExC_state,DEFINEP,0);
5042 RExC_parse +=6 ;
5043 is_define = 1;
5044 goto insert_if_check_paren;
5045 }
5046 else if (RExC_parse[0] == 'R') {
5047 RExC_parse++;
5048 parno = 0;
5049 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5050 parno = atoi(RExC_parse++);
5051 while (isDIGIT(*RExC_parse))
5052 RExC_parse++;
5053 } else if (RExC_parse[0] == '&') {
5054 SV *sv_dat;
5055 RExC_parse++;
5056 sv_dat = reg_scan_name(pRExC_state,
5057 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5058 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5059 }
1a147d38 5060 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5061 goto insert_if_check_paren;
5062 }
830247a4 5063 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5064 /* (?(1)...) */
6136c704 5065 char c;
830247a4 5066 parno = atoi(RExC_parse++);
c277df42 5067
830247a4
IZ
5068 while (isDIGIT(*RExC_parse))
5069 RExC_parse++;
fac92740 5070 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5071
0a4db386 5072 insert_if_check_paren:
830247a4 5073 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5074 vFAIL("Switch condition not recognized");
c277df42 5075 insert_if:
3dab1dad
YO
5076 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5077 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5078 if (br == NULL)
830247a4 5079 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5080 else
3dab1dad 5081 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5082 c = *nextchar(pRExC_state);
d1b80229
IZ
5083 if (flags&HASWIDTH)
5084 *flagp |= HASWIDTH;
c277df42 5085 if (c == '|') {
0a4db386
YO
5086 if (is_define)
5087 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5088 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5089 regbranch(pRExC_state, &flags, 1,depth+1);
5090 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5091 if (flags&HASWIDTH)
5092 *flagp |= HASWIDTH;
830247a4 5093 c = *nextchar(pRExC_state);
a0ed51b3
LW
5094 }
5095 else
c277df42
IZ
5096 lastbr = NULL;
5097 if (c != ')')
8615cb43 5098 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5099 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5100 REGTAIL(pRExC_state, br, ender);
c277df42 5101 if (lastbr) {
3dab1dad
YO
5102 REGTAIL(pRExC_state, lastbr, ender);
5103 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5104 }
5105 else
3dab1dad 5106 REGTAIL(pRExC_state, ret, ender);
c277df42 5107 return ret;
a0ed51b3
LW
5108 }
5109 else {
830247a4 5110 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5111 }
5112 }
1b1626e4 5113 case 0:
830247a4 5114 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5115 vFAIL("Sequence (? incomplete");
1b1626e4 5116 break;
a0d0e21e 5117 default:
830247a4 5118 --RExC_parse;
fac92740 5119 parse_flags: /* (?i) */
830247a4 5120 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5121 /* (?g), (?gc) and (?o) are useless here
5122 and must be globally applied -- japhy */
5123
5124 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5125 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5126 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5127 if (! (wastedflags & wflagbit) ) {
5128 wastedflags |= wflagbit;
5129 vWARN5(
5130 RExC_parse + 1,
5131 "Useless (%s%c) - %suse /%c modifier",
5132 flagsp == &negflags ? "?-" : "?",
5133 *RExC_parse,
5134 flagsp == &negflags ? "don't " : "",
5135 *RExC_parse
5136 );
5137 }
5138 }
5139 }
5140 else if (*RExC_parse == 'c') {
5141 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5142 if (! (wastedflags & WASTED_C) ) {
5143 wastedflags |= WASTED_GC;
9d1d55b5
JP
5144 vWARN3(
5145 RExC_parse + 1,
5146 "Useless (%sc) - %suse /gc modifier",
5147 flagsp == &negflags ? "?-" : "?",
5148 flagsp == &negflags ? "don't " : ""
5149 );
5150 }
5151 }
5152 }
5153 else { pmflag(flagsp, *RExC_parse); }
5154
830247a4 5155 ++RExC_parse;
ca9dfc88 5156 }
830247a4 5157 if (*RExC_parse == '-') {
ca9dfc88 5158 flagsp = &negflags;
9d1d55b5 5159 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5160 ++RExC_parse;
ca9dfc88 5161 goto parse_flags;
48c036b1 5162 }
e2509266
JH
5163 RExC_flags |= posflags;
5164 RExC_flags &= ~negflags;
830247a4
IZ
5165 if (*RExC_parse == ':') {
5166 RExC_parse++;
ca9dfc88
IZ
5167 paren = ':';
5168 break;
5169 }
c277df42 5170 unknown:
830247a4
IZ
5171 if (*RExC_parse != ')') {
5172 RExC_parse++;
5173 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5174 }
830247a4 5175 nextchar(pRExC_state);
a0d0e21e
LW
5176 *flagp = TRYAGAIN;
5177 return NULL;
5178 }
5179 }
fac92740 5180 else { /* (...) */
81714fb9 5181 capturing_parens:
830247a4
IZ
5182 parno = RExC_npar;
5183 RExC_npar++;
e2e6a0f1 5184
830247a4 5185 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5186 if (!SIZE_ONLY ){
5187 if (!RExC_nestroot)
5188 RExC_nestroot = parno;
5189 if (RExC_seen & REG_SEEN_RECURSE) {
5190 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5191 "Setting open paren #%"IVdf" to %d\n",
5192 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5193 RExC_open_parens[parno-1]= ret;
5194 }
6bda09f9 5195 }
fac92740
MJD
5196 Set_Node_Length(ret, 1); /* MJD */
5197 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5198 is_open = 1;
a0d0e21e 5199 }
a0ed51b3 5200 }
fac92740 5201 else /* ! paren */
a0d0e21e
LW
5202 ret = NULL;
5203
5204 /* Pick up the branches, linking them together. */
fac92740 5205 parse_start = RExC_parse; /* MJD */
3dab1dad 5206 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5207 /* branch_len = (paren != 0); */
2af232bd 5208
a0d0e21e
LW
5209 if (br == NULL)
5210 return(NULL);
830247a4
IZ
5211 if (*RExC_parse == '|') {
5212 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5213 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5214 }
fac92740 5215 else { /* MJD */
6bda09f9 5216 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5217 Set_Node_Length(br, paren != 0);
5218 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5219 }
c277df42
IZ
5220 have_branch = 1;
5221 if (SIZE_ONLY)
830247a4 5222 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5223 }
5224 else if (paren == ':') {
c277df42
IZ
5225 *flagp |= flags&SIMPLE;
5226 }
6136c704 5227 if (is_open) { /* Starts with OPEN. */
3dab1dad 5228 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5229 }
5230 else if (paren != '?') /* Not Conditional */
a0d0e21e 5231 ret = br;
32a0ca98 5232 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5233 lastbr = br;
830247a4
IZ
5234 while (*RExC_parse == '|') {
5235 if (!SIZE_ONLY && RExC_extralen) {
5236 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5237 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5238 }
5239 if (SIZE_ONLY)
830247a4
IZ
5240 RExC_extralen += 2; /* Account for LONGJMP. */
5241 nextchar(pRExC_state);
3dab1dad 5242 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5243
a687059c 5244 if (br == NULL)
a0d0e21e 5245 return(NULL);
3dab1dad 5246 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5247 lastbr = br;
821b33a5
IZ
5248 if (flags&HASWIDTH)
5249 *flagp |= HASWIDTH;
a687059c 5250 *flagp |= flags&SPSTART;
a0d0e21e
LW
5251 }
5252
c277df42
IZ
5253 if (have_branch || paren != ':') {
5254 /* Make a closing node, and hook it on the end. */
5255 switch (paren) {
5256 case ':':
830247a4 5257 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5258 break;
5259 case 1:
830247a4 5260 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5261 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5262 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5263 "Setting close paren #%"IVdf" to %d\n",
5264 (IV)parno, REG_NODE_NUM(ender)));
5265 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5266 if (RExC_nestroot == parno)
5267 RExC_nestroot = 0;
40d049e4 5268 }
fac92740
MJD
5269 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5270 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5271 break;
5272 case '<':
c277df42
IZ
5273 case ',':
5274 case '=':
5275 case '!':
c277df42 5276 *flagp &= ~HASWIDTH;
821b33a5
IZ
5277 /* FALL THROUGH */
5278 case '>':
830247a4 5279 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5280 break;
5281 case 0:
830247a4 5282 ender = reg_node(pRExC_state, END);
40d049e4
YO
5283 if (!SIZE_ONLY) {
5284 assert(!RExC_opend); /* there can only be one! */
5285 RExC_opend = ender;
5286 }
c277df42
IZ
5287 break;
5288 }
eaf3ca90 5289 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5290
9674d46a 5291 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5292 if (depth==1)
5293 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5294
c277df42 5295 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5296 for (br = ret; br; br = regnext(br)) {
5297 const U8 op = PL_regkind[OP(br)];
5298 if (op == BRANCH) {
07be1b83 5299 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5300 }
5301 else if (op == BRANCHJ) {
07be1b83 5302 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5303 }
c277df42
IZ
5304 }
5305 }
a0d0e21e 5306 }
c277df42
IZ
5307
5308 {
e1ec3a88
AL
5309 const char *p;
5310 static const char parens[] = "=!<,>";
c277df42
IZ
5311
5312 if (paren && (p = strchr(parens, paren))) {
eb160463 5313 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5314 int flag = (p - parens) > 1;
5315
5316 if (paren == '>')
5317 node = SUSPEND, flag = 0;
6bda09f9 5318 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5319 Set_Node_Cur_Length(ret);
5320 Set_Node_Offset(ret, parse_start + 1);
c277df42 5321 ret->flags = flag;
07be1b83 5322 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5323 }
a0d0e21e
LW
5324 }
5325
5326 /* Check for proper termination. */
ce3e6498 5327 if (paren) {
e2509266 5328 RExC_flags = oregflags;
830247a4
IZ
5329 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5330 RExC_parse = oregcomp_parse;
380a0633 5331 vFAIL("Unmatched (");
ce3e6498 5332 }
a0ed51b3 5333 }
830247a4
IZ
5334 else if (!paren && RExC_parse < RExC_end) {
5335 if (*RExC_parse == ')') {
5336 RExC_parse++;
380a0633 5337 vFAIL("Unmatched )");
a0ed51b3
LW
5338 }
5339 else
b45f050a 5340 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5341 /* NOTREACHED */
5342 }
a687059c 5343
a0d0e21e 5344 return(ret);
a687059c
LW
5345}
5346
5347/*
5348 - regbranch - one alternative of an | operator
5349 *
5350 * Implements the concatenation operator.
5351 */
76e3520e 5352STATIC regnode *
3dab1dad 5353S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5354{
97aff369 5355 dVAR;
c277df42
IZ
5356 register regnode *ret;
5357 register regnode *chain = NULL;
5358 register regnode *latest;
5359 I32 flags = 0, c = 0;
3dab1dad
YO
5360 GET_RE_DEBUG_FLAGS_DECL;
5361 DEBUG_PARSE("brnc");
b81d288d 5362 if (first)
c277df42
IZ
5363 ret = NULL;
5364 else {
b81d288d 5365 if (!SIZE_ONLY && RExC_extralen)
830247a4 5366 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5367 else {
830247a4 5368 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5369 Set_Node_Length(ret, 1);
5370 }
c277df42
IZ
5371 }
5372
b81d288d 5373 if (!first && SIZE_ONLY)
830247a4 5374 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5375
c277df42 5376 *flagp = WORST; /* Tentatively. */
a0d0e21e 5377
830247a4
IZ
5378 RExC_parse--;
5379 nextchar(pRExC_state);
5380 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5381 flags &= ~TRYAGAIN;
3dab1dad 5382 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5383 if (latest == NULL) {
5384 if (flags & TRYAGAIN)
5385 continue;
5386 return(NULL);
a0ed51b3
LW
5387 }
5388 else if (ret == NULL)
c277df42 5389 ret = latest;
a0d0e21e 5390 *flagp |= flags&HASWIDTH;
c277df42 5391 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5392 *flagp |= flags&SPSTART;
5393 else {
830247a4 5394 RExC_naughty++;
3dab1dad 5395 REGTAIL(pRExC_state, chain, latest);
a687059c 5396 }
a0d0e21e 5397 chain = latest;
c277df42
IZ
5398 c++;
5399 }
5400 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5401 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5402 if (ret == NULL)
5403 ret = chain;
5404 }
5405 if (c == 1) {
5406 *flagp |= flags&SIMPLE;
a0d0e21e 5407 }
a687059c 5408
d4c19fe8 5409 return ret;
a687059c
LW
5410}
5411
5412/*
5413 - regpiece - something followed by possible [*+?]
5414 *
5415 * Note that the branching code sequences used for ? and the general cases
5416 * of * and + are somewhat optimized: they use the same NOTHING node as
5417 * both the endmarker for their branch list and the body of the last branch.
5418 * It might seem that this node could be dispensed with entirely, but the
5419 * endmarker role is not redundant.
5420 */
76e3520e 5421STATIC regnode *
3dab1dad 5422S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5423{
97aff369 5424 dVAR;
c277df42 5425 register regnode *ret;
a0d0e21e
LW
5426 register char op;
5427 register char *next;
5428 I32 flags;
1df70142 5429 const char * const origparse = RExC_parse;
a0d0e21e 5430 I32 min;
c277df42 5431 I32 max = REG_INFTY;
fac92740 5432 char *parse_start;
10edeb5d 5433 const char *maxpos = NULL;
3dab1dad
YO
5434 GET_RE_DEBUG_FLAGS_DECL;
5435 DEBUG_PARSE("piec");
a0d0e21e 5436
3dab1dad 5437 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5438 if (ret == NULL) {
5439 if (flags & TRYAGAIN)
5440 *flagp |= TRYAGAIN;
5441 return(NULL);
5442 }
5443
830247a4 5444 op = *RExC_parse;
a0d0e21e 5445
830247a4 5446 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5447 maxpos = NULL;
fac92740 5448 parse_start = RExC_parse; /* MJD */
830247a4 5449 next = RExC_parse + 1;
a0d0e21e
LW
5450 while (isDIGIT(*next) || *next == ',') {
5451 if (*next == ',') {
5452 if (maxpos)
5453 break;
5454 else
5455 maxpos = next;
a687059c 5456 }
a0d0e21e
LW
5457 next++;
5458 }
5459 if (*next == '}') { /* got one */
5460 if (!maxpos)
5461 maxpos = next;
830247a4
IZ
5462 RExC_parse++;
5463 min = atoi(RExC_parse);
a0d0e21e
LW
5464 if (*maxpos == ',')
5465 maxpos++;
5466 else
830247a4 5467 maxpos = RExC_parse;
a0d0e21e
LW
5468 max = atoi(maxpos);
5469 if (!max && *maxpos != '0')
c277df42
IZ
5470 max = REG_INFTY; /* meaning "infinity" */
5471 else if (max >= REG_INFTY)
8615cb43 5472 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5473 RExC_parse = next;
5474 nextchar(pRExC_state);
a0d0e21e
LW
5475
5476 do_curly:
5477 if ((flags&SIMPLE)) {
830247a4 5478 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5479 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5480 Set_Node_Offset(ret, parse_start+1); /* MJD */
5481 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5482 }
5483 else {
3dab1dad 5484 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5485
5486 w->flags = 0;
3dab1dad 5487 REGTAIL(pRExC_state, ret, w);
830247a4 5488 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5489 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5490 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5491 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5492 }
6bda09f9 5493 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5494 /* MJD hk */
5495 Set_Node_Offset(ret, parse_start+1);
2af232bd 5496 Set_Node_Length(ret,
fac92740 5497 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5498
830247a4 5499 if (!SIZE_ONLY && RExC_extralen)
c277df42 5500 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5501 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5502 if (SIZE_ONLY)
830247a4
IZ
5503 RExC_whilem_seen++, RExC_extralen += 3;
5504 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5505 }
c277df42 5506 ret->flags = 0;
a0d0e21e
LW
5507
5508 if (min > 0)
821b33a5
IZ
5509 *flagp = WORST;
5510 if (max > 0)
5511 *flagp |= HASWIDTH;
a0d0e21e 5512 if (max && max < min)
8615cb43 5513 vFAIL("Can't do {n,m} with n > m");
c277df42 5514 if (!SIZE_ONLY) {
eb160463
GS
5515 ARG1_SET(ret, (U16)min);
5516 ARG2_SET(ret, (U16)max);
a687059c 5517 }
a687059c 5518
a0d0e21e 5519 goto nest_check;
a687059c 5520 }
a0d0e21e 5521 }
a687059c 5522
a0d0e21e
LW
5523 if (!ISMULT1(op)) {
5524 *flagp = flags;
a687059c 5525 return(ret);
a0d0e21e 5526 }
bb20fd44 5527
c277df42 5528#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5529
5530 /* if this is reinstated, don't forget to put this back into perldiag:
5531
5532 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5533
5534 (F) The part of the regexp subject to either the * or + quantifier
5535 could match an empty string. The {#} shows in the regular
5536 expression about where the problem was discovered.
5537
5538 */
5539
bb20fd44 5540 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5541 vFAIL("Regexp *+ operand could be empty");
b81d288d 5542#endif
bb20fd44 5543
fac92740 5544 parse_start = RExC_parse;
830247a4 5545 nextchar(pRExC_state);
a0d0e21e 5546
821b33a5 5547 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5548
5549 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5550 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5551 ret->flags = 0;
830247a4 5552 RExC_naughty += 4;
a0d0e21e
LW
5553 }
5554 else if (op == '*') {
5555 min = 0;
5556 goto do_curly;
a0ed51b3
LW
5557 }
5558 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5559 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5560 ret->flags = 0;
830247a4 5561 RExC_naughty += 3;
a0d0e21e
LW
5562 }
5563 else if (op == '+') {
5564 min = 1;
5565 goto do_curly;
a0ed51b3
LW
5566 }
5567 else if (op == '?') {
a0d0e21e
LW
5568 min = 0; max = 1;
5569 goto do_curly;
5570 }
5571 nest_check:
041457d9 5572 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5573 vWARN3(RExC_parse,
b45f050a 5574 "%.*s matches null string many times",
afd78fd5 5575 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5576 origparse);
a0d0e21e
LW
5577 }
5578
b9b4dddf 5579 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5580 nextchar(pRExC_state);
6bda09f9 5581 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5582 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5583 }
b9b4dddf
YO
5584#ifndef REG_ALLOW_MINMOD_SUSPEND
5585 else
5586#endif
5587 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5588 regnode *ender;
5589 nextchar(pRExC_state);
5590 ender = reg_node(pRExC_state, SUCCEED);
5591 REGTAIL(pRExC_state, ret, ender);
5592 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5593 ret->flags = 0;
5594 ender = reg_node(pRExC_state, TAIL);
5595 REGTAIL(pRExC_state, ret, ender);
5596 /*ret= ender;*/
5597 }
5598
5599 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5600 RExC_parse++;
b45f050a
JF
5601 vFAIL("Nested quantifiers");
5602 }
a0d0e21e
LW
5603
5604 return(ret);
a687059c
LW
5605}
5606
fc8cd66c
YO
5607
5608/* reg_namedseq(pRExC_state,UVp)
5609
5610 This is expected to be called by a parser routine that has
5611 recognized'\N' and needs to handle the rest. RExC_parse is
5612 expected to point at the first char following the N at the time
5613 of the call.
5614
5615 If valuep is non-null then it is assumed that we are parsing inside
5616 of a charclass definition and the first codepoint in the resolved
5617 string is returned via *valuep and the routine will return NULL.
5618 In this mode if a multichar string is returned from the charnames
5619 handler a warning will be issued, and only the first char in the
5620 sequence will be examined. If the string returned is zero length
5621 then the value of *valuep is undefined and NON-NULL will
5622 be returned to indicate failure. (This will NOT be a valid pointer
5623 to a regnode.)
5624
5625 If value is null then it is assumed that we are parsing normal text
5626 and inserts a new EXACT node into the program containing the resolved
5627 string and returns a pointer to the new node. If the string is
5628 zerolength a NOTHING node is emitted.
5629
5630 On success RExC_parse is set to the char following the endbrace.
5631 Parsing failures will generate a fatal errorvia vFAIL(...)
5632
5633 NOTE: We cache all results from the charnames handler locally in
5634 the RExC_charnames hash (created on first use) to prevent a charnames
5635 handler from playing silly-buggers and returning a short string and
5636 then a long string for a given pattern. Since the regexp program
5637 size is calculated during an initial parse this would result
5638 in a buffer overrun so we cache to prevent the charname result from
5639 changing during the course of the parse.
5640
5641 */
5642STATIC regnode *
5643S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5644{
5645 char * name; /* start of the content of the name */
5646 char * endbrace; /* endbrace following the name */
5647 SV *sv_str = NULL;
5648 SV *sv_name = NULL;
5649 STRLEN len; /* this has various purposes throughout the code */
5650 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5651 regnode *ret = NULL;
5652
5653 if (*RExC_parse != '{') {
5654 vFAIL("Missing braces on \\N{}");
5655 }
5656 name = RExC_parse+1;
5657 endbrace = strchr(RExC_parse, '}');
5658 if ( ! endbrace ) {
5659 RExC_parse++;
5660 vFAIL("Missing right brace on \\N{}");
5661 }
5662 RExC_parse = endbrace + 1;
5663
5664
5665 /* RExC_parse points at the beginning brace,
5666 endbrace points at the last */
5667 if ( name[0]=='U' && name[1]=='+' ) {
5668 /* its a "unicode hex" notation {U+89AB} */
5669 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5670 | PERL_SCAN_DISALLOW_PREFIX
5671 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5672 UV cp;
196f1508 5673 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5674 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5675 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5676 cp = 0xFFFD;
5677 }
5678 if (cp > 0xff)
5679 RExC_utf8 = 1;
5680 if ( valuep ) {
5681 *valuep = cp;
5682 return NULL;
5683 }
5684 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5685 } else {
5686 /* fetch the charnames handler for this scope */
5687 HV * const table = GvHV(PL_hintgv);
5688 SV **cvp= table ?
5689 hv_fetchs(table, "charnames", FALSE) :
5690 NULL;
5691 SV *cv= cvp ? *cvp : NULL;
5692 HE *he_str;
5693 int count;
5694 /* create an SV with the name as argument */
5695 sv_name = newSVpvn(name, endbrace - name);
5696
5697 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5698 vFAIL2("Constant(\\N{%s}) unknown: "
5699 "(possibly a missing \"use charnames ...\")",
5700 SvPVX(sv_name));
5701 }
5702 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5703 vFAIL2("Constant(\\N{%s}): "
5704 "$^H{charnames} is not defined",SvPVX(sv_name));
5705 }
5706
5707
5708
5709 if (!RExC_charnames) {
5710 /* make sure our cache is allocated */
5711 RExC_charnames = newHV();
6bda09f9 5712 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5713 }
5714 /* see if we have looked this one up before */
5715 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5716 if ( he_str ) {
5717 sv_str = HeVAL(he_str);
5718 cached = 1;
5719 } else {
5720 dSP ;
5721
5722 ENTER ;
5723 SAVETMPS ;
5724 PUSHMARK(SP) ;
5725
5726 XPUSHs(sv_name);
5727
5728 PUTBACK ;
5729
5730 count= call_sv(cv, G_SCALAR);
5731
5732 if (count == 1) { /* XXXX is this right? dmq */
5733 sv_str = POPs;
5734 SvREFCNT_inc_simple_void(sv_str);
5735 }
5736
5737 SPAGAIN ;
5738 PUTBACK ;
5739 FREETMPS ;
5740 LEAVE ;
5741
5742 if ( !sv_str || !SvOK(sv_str) ) {
5743 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5744 "did not return a defined value",SvPVX(sv_name));
5745 }
5746 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5747 cached = 1;
5748 }
5749 }
5750 if (valuep) {
5751 char *p = SvPV(sv_str, len);
5752 if (len) {
5753 STRLEN numlen = 1;
5754 if ( SvUTF8(sv_str) ) {
196f1508 5755 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5756 if (*valuep > 0x7F)
5757 RExC_utf8 = 1;
5758 /* XXXX
5759 We have to turn on utf8 for high bit chars otherwise
5760 we get failures with
5761
5762 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5763 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5764
5765 This is different from what \x{} would do with the same
5766 codepoint, where the condition is > 0xFF.
5767 - dmq
5768 */
5769
5770
5771 } else {
5772 *valuep = (UV)*p;
5773 /* warn if we havent used the whole string? */
5774 }
5775 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5776 vWARN2(RExC_parse,
5777 "Ignoring excess chars from \\N{%s} in character class",
5778 SvPVX(sv_name)
5779 );
5780 }
5781 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5782 vWARN2(RExC_parse,
5783 "Ignoring zero length \\N{%s} in character class",
5784 SvPVX(sv_name)
5785 );
5786 }
5787 if (sv_name)
5788 SvREFCNT_dec(sv_name);
5789 if (!cached)
5790 SvREFCNT_dec(sv_str);
5791 return len ? NULL : (regnode *)&len;
5792 } else if(SvCUR(sv_str)) {
5793
5794 char *s;
5795 char *p, *pend;
5796 STRLEN charlen = 1;
5797 char * parse_start = name-3; /* needed for the offsets */
5798 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5799
5800 ret = reg_node(pRExC_state,
5801 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5802 s= STRING(ret);
5803
5804 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5805 sv_utf8_upgrade(sv_str);
5806 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5807 RExC_utf8= 1;
5808 }
5809
5810 p = SvPV(sv_str, len);
5811 pend = p + len;
5812 /* len is the length written, charlen is the size the char read */
5813 for ( len = 0; p < pend; p += charlen ) {
5814 if (UTF) {
196f1508 5815 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5816 if (FOLD) {
5817 STRLEN foldlen,numlen;
5818 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5819 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5820 /* Emit all the Unicode characters. */
5821
5822 for (foldbuf = tmpbuf;
5823 foldlen;
5824 foldlen -= numlen)
5825 {
5826 uvc = utf8_to_uvchr(foldbuf, &numlen);
5827 if (numlen > 0) {
5828 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5829 s += unilen;
5830 len += unilen;
5831 /* In EBCDIC the numlen
5832 * and unilen can differ. */
5833 foldbuf += numlen;
5834 if (numlen >= foldlen)
5835 break;
5836 }
5837 else
5838 break; /* "Can't happen." */
5839 }
5840 } else {
5841 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5842 if (unilen > 0) {
5843 s += unilen;
5844 len += unilen;
5845 }
5846 }
5847 } else {
5848 len++;
5849 REGC(*p, s++);
5850 }
5851 }
5852 if (SIZE_ONLY) {
5853 RExC_size += STR_SZ(len);
5854 } else {
5855 STR_LEN(ret) = len;
5856 RExC_emit += STR_SZ(len);
5857 }
5858 Set_Node_Cur_Length(ret); /* MJD */
5859 RExC_parse--;
5860 nextchar(pRExC_state);
5861 } else {
5862 ret = reg_node(pRExC_state,NOTHING);
5863 }
5864 if (!cached) {
5865 SvREFCNT_dec(sv_str);
5866 }
5867 if (sv_name) {
5868 SvREFCNT_dec(sv_name);
5869 }
5870 return ret;
5871
5872}
5873
5874
9e08bc66
TS
5875/*
5876 * reg_recode
5877 *
5878 * It returns the code point in utf8 for the value in *encp.
5879 * value: a code value in the source encoding
5880 * encp: a pointer to an Encode object
5881 *
5882 * If the result from Encode is not a single character,
5883 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5884 */
5885STATIC UV
5886S_reg_recode(pTHX_ const char value, SV **encp)
5887{
5888 STRLEN numlen = 1;
5889 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5890 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5891 : SvPVX(sv);
5892 const STRLEN newlen = SvCUR(sv);
5893 UV uv = UNICODE_REPLACEMENT;
5894
5895 if (newlen)
5896 uv = SvUTF8(sv)
5897 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5898 : *(U8*)s;
5899
5900 if (!newlen || numlen != newlen) {
5901 uv = UNICODE_REPLACEMENT;
5902 if (encp)
5903 *encp = NULL;
5904 }
5905 return uv;
5906}
5907
fc8cd66c 5908
a687059c
LW
5909/*
5910 - regatom - the lowest level
5911 *
5912 * Optimization: gobbles an entire sequence of ordinary characters so that
5913 * it can turn them into a single node, which is smaller to store and
5914 * faster to run. Backslashed characters are exceptions, each becoming a
5915 * separate node; the code is simpler that way and it's not worth fixing.
5916 *
7f6f358c
YO
5917 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5918 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5919 */
76e3520e 5920STATIC regnode *
3dab1dad 5921S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5922{
97aff369 5923 dVAR;
cbbf8932 5924 register regnode *ret = NULL;
a0d0e21e 5925 I32 flags;
45948336 5926 char *parse_start = RExC_parse;
3dab1dad
YO
5927 GET_RE_DEBUG_FLAGS_DECL;
5928 DEBUG_PARSE("atom");
a0d0e21e
LW
5929 *flagp = WORST; /* Tentatively. */
5930
5931tryagain:
830247a4 5932 switch (*RExC_parse) {
a0d0e21e 5933 case '^':
830247a4
IZ
5934 RExC_seen_zerolen++;
5935 nextchar(pRExC_state);
e2509266 5936 if (RExC_flags & PMf_MULTILINE)
830247a4 5937 ret = reg_node(pRExC_state, MBOL);
e2509266 5938 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5939 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5940 else
830247a4 5941 ret = reg_node(pRExC_state, BOL);
fac92740 5942 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5943 break;
5944 case '$':
830247a4 5945 nextchar(pRExC_state);
b81d288d 5946 if (*RExC_parse)
830247a4 5947 RExC_seen_zerolen++;
e2509266 5948 if (RExC_flags & PMf_MULTILINE)
830247a4 5949 ret = reg_node(pRExC_state, MEOL);
e2509266 5950 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5951 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5952 else
830247a4 5953 ret = reg_node(pRExC_state, EOL);
fac92740 5954 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5955 break;
5956 case '.':
830247a4 5957 nextchar(pRExC_state);
e2509266 5958 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5959 ret = reg_node(pRExC_state, SANY);
5960 else
5961 ret = reg_node(pRExC_state, REG_ANY);
5962 *flagp |= HASWIDTH|SIMPLE;
830247a4 5963 RExC_naughty++;
fac92740 5964 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5965 break;
5966 case '[':
b45f050a 5967 {
3dab1dad
YO
5968 char * const oregcomp_parse = ++RExC_parse;
5969 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5970 if (*RExC_parse != ']') {
5971 RExC_parse = oregcomp_parse;
b45f050a
JF
5972 vFAIL("Unmatched [");
5973 }
830247a4 5974 nextchar(pRExC_state);
a0d0e21e 5975 *flagp |= HASWIDTH|SIMPLE;
fac92740 5976 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5977 break;
b45f050a 5978 }
a0d0e21e 5979 case '(':
830247a4 5980 nextchar(pRExC_state);
3dab1dad 5981 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5982 if (ret == NULL) {
bf93d4cc 5983 if (flags & TRYAGAIN) {
830247a4 5984 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5985 /* Make parent create an empty node if needed. */
5986 *flagp |= TRYAGAIN;
5987 return(NULL);
5988 }
a0d0e21e 5989 goto tryagain;
bf93d4cc 5990 }
a0d0e21e
LW
5991 return(NULL);
5992 }
c277df42 5993 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5994 break;
5995 case '|':
5996 case ')':
5997 if (flags & TRYAGAIN) {
5998 *flagp |= TRYAGAIN;
5999 return NULL;
6000 }
b45f050a 6001 vFAIL("Internal urp");
a0d0e21e
LW
6002 /* Supposed to be caught earlier. */
6003 break;
85afd4ae 6004 case '{':
830247a4
IZ
6005 if (!regcurly(RExC_parse)) {
6006 RExC_parse++;
85afd4ae
CS
6007 goto defchar;
6008 }
6009 /* FALL THROUGH */
a0d0e21e
LW
6010 case '?':
6011 case '+':
6012 case '*':
830247a4 6013 RExC_parse++;
b45f050a 6014 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6015 break;
6016 case '\\':
830247a4 6017 switch (*++RExC_parse) {
a0d0e21e 6018 case 'A':
830247a4
IZ
6019 RExC_seen_zerolen++;
6020 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6021 *flagp |= SIMPLE;
830247a4 6022 nextchar(pRExC_state);
fac92740 6023 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6024 break;
6025 case 'G':
830247a4
IZ
6026 ret = reg_node(pRExC_state, GPOS);
6027 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6028 *flagp |= SIMPLE;
830247a4 6029 nextchar(pRExC_state);
fac92740 6030 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6031 break;
6032 case 'Z':
830247a4 6033 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6034 *flagp |= SIMPLE;
a1917ab9 6035 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6036 nextchar(pRExC_state);
a0d0e21e 6037 break;
b85d18e9 6038 case 'z':
830247a4 6039 ret = reg_node(pRExC_state, EOS);
b85d18e9 6040 *flagp |= SIMPLE;
830247a4
IZ
6041 RExC_seen_zerolen++; /* Do not optimize RE away */
6042 nextchar(pRExC_state);
fac92740 6043 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6044 break;
4a2d328f 6045 case 'C':
f33976b4
DB
6046 ret = reg_node(pRExC_state, CANY);
6047 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6048 *flagp |= HASWIDTH|SIMPLE;
830247a4 6049 nextchar(pRExC_state);
fac92740 6050 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6051 break;
6052 case 'X':
830247a4 6053 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6054 *flagp |= HASWIDTH;
830247a4 6055 nextchar(pRExC_state);
fac92740 6056 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6057 break;
a0d0e21e 6058 case 'w':
eb160463 6059 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6060 *flagp |= HASWIDTH|SIMPLE;
830247a4 6061 nextchar(pRExC_state);
fac92740 6062 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6063 break;
6064 case 'W':
eb160463 6065 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6066 *flagp |= HASWIDTH|SIMPLE;
830247a4 6067 nextchar(pRExC_state);
fac92740 6068 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6069 break;
6070 case 'b':
830247a4
IZ
6071 RExC_seen_zerolen++;
6072 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6073 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6074 *flagp |= SIMPLE;
830247a4 6075 nextchar(pRExC_state);
fac92740 6076 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6077 break;
6078 case 'B':
830247a4
IZ
6079 RExC_seen_zerolen++;
6080 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6081 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6082 *flagp |= SIMPLE;
830247a4 6083 nextchar(pRExC_state);
fac92740 6084 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6085 break;
6086 case 's':
eb160463 6087 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6088 *flagp |= HASWIDTH|SIMPLE;
830247a4 6089 nextchar(pRExC_state);
fac92740 6090 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6091 break;
6092 case 'S':
eb160463 6093 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6094 *flagp |= HASWIDTH|SIMPLE;
830247a4 6095 nextchar(pRExC_state);
fac92740 6096 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6097 break;
6098 case 'd':
ffc61ed2 6099 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6100 *flagp |= HASWIDTH|SIMPLE;
830247a4 6101 nextchar(pRExC_state);
fac92740 6102 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6103 break;
6104 case 'D':
ffc61ed2 6105 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6106 *flagp |= HASWIDTH|SIMPLE;
830247a4 6107 nextchar(pRExC_state);
fac92740 6108 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6109 break;
a14b48bc
LW
6110 case 'p':
6111 case 'P':
3568d838 6112 {
3dab1dad 6113 char* const oldregxend = RExC_end;
ccb2c380 6114 char* parse_start = RExC_parse - 2;
a14b48bc 6115
830247a4 6116 if (RExC_parse[1] == '{') {
3568d838 6117 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6118 RExC_end = strchr(RExC_parse, '}');
6119 if (!RExC_end) {
3dab1dad 6120 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6121 RExC_parse += 2;
6122 RExC_end = oldregxend;
0da60cf5 6123 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6124 }
830247a4 6125 RExC_end++;
a14b48bc 6126 }
af6f566e 6127 else {
830247a4 6128 RExC_end = RExC_parse + 2;
af6f566e
HS
6129 if (RExC_end > oldregxend)
6130 RExC_end = oldregxend;
6131 }
830247a4 6132 RExC_parse--;
a14b48bc 6133
3dab1dad 6134 ret = regclass(pRExC_state,depth+1);
a14b48bc 6135
830247a4
IZ
6136 RExC_end = oldregxend;
6137 RExC_parse--;
ccb2c380
MP
6138
6139 Set_Node_Offset(ret, parse_start + 2);
6140 Set_Node_Cur_Length(ret);
830247a4 6141 nextchar(pRExC_state);
a14b48bc
LW
6142 *flagp |= HASWIDTH|SIMPLE;
6143 }
6144 break;
fc8cd66c
YO
6145 case 'N':
6146 /* Handle \N{NAME} here and not below because it can be
6147 multicharacter. join_exact() will join them up later on.
6148 Also this makes sure that things like /\N{BLAH}+/ and
6149 \N{BLAH} being multi char Just Happen. dmq*/
6150 ++RExC_parse;
6151 ret= reg_namedseq(pRExC_state, NULL);
6152 break;
0a4db386 6153 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6154 {
6155 char ch= RExC_parse[1];
6156 if (ch != '<' && ch != '\'') {
6157 if (SIZE_ONLY)
6158 vWARN( RExC_parse + 1,
6159 "Possible broken named back reference treated as literal k");
6160 parse_start--;
6161 goto defchar;
6162 } else {
6163 char* name_start = (RExC_parse += 2);
6164 I32 num = 0;
0a4db386
YO
6165 SV *sv_dat = reg_scan_name(pRExC_state,
6166 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6167 ch= (ch == '<') ? '>' : '\'';
6168
81714fb9
YO
6169 if (RExC_parse == name_start || *RExC_parse != ch)
6170 vFAIL2("Sequence \\k%c... not terminated",
6171 (ch == '>' ? '<' : ch));
6172
6173 RExC_sawback = 1;
6174 ret = reganode(pRExC_state,
6175 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6176 num);
6177 *flagp |= HASWIDTH;
6178
6179
6180 if (!SIZE_ONLY) {
81714fb9
YO
6181 num = add_data( pRExC_state, 1, "S" );
6182 ARG_SET(ret,num);
6183 RExC_rx->data->data[num]=(void*)sv_dat;
6184 SvREFCNT_inc(sv_dat);
6185 }
6186 /* override incorrect value set in reganode MJD */
6187 Set_Node_Offset(ret, parse_start+1);
6188 Set_Node_Cur_Length(ret); /* MJD */
6189 nextchar(pRExC_state);
6190
6191 }
6192 break;
6193 }
a0d0e21e
LW
6194 case 'n':
6195 case 'r':
6196 case 't':
6197 case 'f':
6198 case 'e':
6199 case 'a':
6200 case 'x':
6201 case 'c':
6202 case '0':
6203 goto defchar;
6204 case '1': case '2': case '3': case '4':
6205 case '5': case '6': case '7': case '8': case '9':
6206 {
1df70142 6207 const I32 num = atoi(RExC_parse);
a0d0e21e 6208
830247a4 6209 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6210 goto defchar;
6211 else {
3dab1dad 6212 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6213 while (isDIGIT(*RExC_parse))
6214 RExC_parse++;
b45f050a 6215
eb160463 6216 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 6217 vFAIL("Reference to nonexistent group");
830247a4 6218 RExC_sawback = 1;
eb160463
GS
6219 ret = reganode(pRExC_state,
6220 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6221 num);
a0d0e21e 6222 *flagp |= HASWIDTH;
2af232bd 6223
fac92740 6224 /* override incorrect value set in reganode MJD */
2af232bd 6225 Set_Node_Offset(ret, parse_start+1);
fac92740 6226 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6227 RExC_parse--;
6228 nextchar(pRExC_state);
a0d0e21e
LW
6229 }
6230 }
6231 break;
6232 case '\0':
830247a4 6233 if (RExC_parse >= RExC_end)
b45f050a 6234 FAIL("Trailing \\");
a0d0e21e
LW
6235 /* FALL THROUGH */
6236 default:
a0288114 6237 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6238 back into the quick-grab loop below */
45948336 6239 parse_start--;
a0d0e21e
LW
6240 goto defchar;
6241 }
6242 break;
4633a7c4
LW
6243
6244 case '#':
e2509266 6245 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6246 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6247 RExC_parse++;
830247a4 6248 if (RExC_parse < RExC_end)
4633a7c4
LW
6249 goto tryagain;
6250 }
6251 /* FALL THROUGH */
6252
a0d0e21e 6253 default: {
ba210ebe 6254 register STRLEN len;
58ae7d3f 6255 register UV ender;
a0d0e21e 6256 register char *p;
3dab1dad 6257 char *s;
80aecb99 6258 STRLEN foldlen;
89ebb4a3 6259 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6260
6261 parse_start = RExC_parse - 1;
a0d0e21e 6262
830247a4 6263 RExC_parse++;
a0d0e21e
LW
6264
6265 defchar:
58ae7d3f 6266 ender = 0;
eb160463
GS
6267 ret = reg_node(pRExC_state,
6268 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6269 s = STRING(ret);
830247a4
IZ
6270 for (len = 0, p = RExC_parse - 1;
6271 len < 127 && p < RExC_end;
a0d0e21e
LW
6272 len++)
6273 {
3dab1dad 6274 char * const oldp = p;
5b5a24f7 6275
e2509266 6276 if (RExC_flags & PMf_EXTENDED)
830247a4 6277 p = regwhite(p, RExC_end);
a0d0e21e
LW
6278 switch (*p) {
6279 case '^':
6280 case '$':
6281 case '.':
6282 case '[':
6283 case '(':
6284 case ')':
6285 case '|':
6286 goto loopdone;
6287 case '\\':
6288 switch (*++p) {
6289 case 'A':
1ed8eac0
JF
6290 case 'C':
6291 case 'X':
a0d0e21e
LW
6292 case 'G':
6293 case 'Z':
b85d18e9 6294 case 'z':
a0d0e21e
LW
6295 case 'w':
6296 case 'W':
6297 case 'b':
6298 case 'B':
6299 case 's':
6300 case 'S':
6301 case 'd':
6302 case 'D':
a14b48bc
LW
6303 case 'p':
6304 case 'P':
fc8cd66c 6305 case 'N':
a0d0e21e
LW
6306 --p;
6307 goto loopdone;
6308 case 'n':
6309 ender = '\n';
6310 p++;
a687059c 6311 break;
a0d0e21e
LW
6312 case 'r':
6313 ender = '\r';
6314 p++;
a687059c 6315 break;
a0d0e21e
LW
6316 case 't':
6317 ender = '\t';
6318 p++;
a687059c 6319 break;
a0d0e21e
LW
6320 case 'f':
6321 ender = '\f';
6322 p++;
a687059c 6323 break;
a0d0e21e 6324 case 'e':
c7f1f016 6325 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6326 p++;
a687059c 6327 break;
a0d0e21e 6328 case 'a':
c7f1f016 6329 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6330 p++;
a687059c 6331 break;
a0d0e21e 6332 case 'x':
a0ed51b3 6333 if (*++p == '{') {
1df70142 6334 char* const e = strchr(p, '}');
b81d288d 6335
b45f050a 6336 if (!e) {
830247a4 6337 RExC_parse = p + 1;
b45f050a
JF
6338 vFAIL("Missing right brace on \\x{}");
6339 }
de5f0749 6340 else {
a4c04bdc
NC
6341 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6342 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6343 STRLEN numlen = e - p - 1;
53305cf1 6344 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6345 if (ender > 0xff)
6346 RExC_utf8 = 1;
a0ed51b3
LW
6347 p = e + 1;
6348 }
a0ed51b3
LW
6349 }
6350 else {
a4c04bdc 6351 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6352 STRLEN numlen = 2;
53305cf1 6353 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6354 p += numlen;
6355 }
9e08bc66
TS
6356 if (PL_encoding && ender < 0x100)
6357 goto recode_encoding;
a687059c 6358 break;
a0d0e21e
LW
6359 case 'c':
6360 p++;
bbce6d69 6361 ender = UCHARAT(p++);
6362 ender = toCTRL(ender);
a687059c 6363 break;
a0d0e21e
LW
6364 case '0': case '1': case '2': case '3':case '4':
6365 case '5': case '6': case '7': case '8':case '9':
6366 if (*p == '0' ||
830247a4 6367 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6368 I32 flags = 0;
1df70142 6369 STRLEN numlen = 3;
53305cf1 6370 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6371 p += numlen;
6372 }
6373 else {
6374 --p;
6375 goto loopdone;
a687059c 6376 }
9e08bc66
TS
6377 if (PL_encoding && ender < 0x100)
6378 goto recode_encoding;
6379 break;
6380 recode_encoding:
6381 {
6382 SV* enc = PL_encoding;
6383 ender = reg_recode((const char)(U8)ender, &enc);
6384 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6385 vWARN(p, "Invalid escape in the specified encoding");
6386 RExC_utf8 = 1;
6387 }
a687059c 6388 break;
a0d0e21e 6389 case '\0':
830247a4 6390 if (p >= RExC_end)
b45f050a 6391 FAIL("Trailing \\");
a687059c 6392 /* FALL THROUGH */
a0d0e21e 6393 default:
041457d9 6394 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6395 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6396 goto normal_default;
a0d0e21e
LW
6397 }
6398 break;
a687059c 6399 default:
a0ed51b3 6400 normal_default:
fd400ab9 6401 if (UTF8_IS_START(*p) && UTF) {
1df70142 6402 STRLEN numlen;
5e12f4fb 6403 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6404 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6405 p += numlen;
6406 }
6407 else
6408 ender = *p++;
a0d0e21e 6409 break;
a687059c 6410 }
e2509266 6411 if (RExC_flags & PMf_EXTENDED)
830247a4 6412 p = regwhite(p, RExC_end);
60a8b682
JH
6413 if (UTF && FOLD) {
6414 /* Prime the casefolded buffer. */
ac7e0132 6415 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6416 }
a0d0e21e
LW
6417 if (ISMULT2(p)) { /* Back off on ?+*. */
6418 if (len)
6419 p = oldp;
16ea2a2e 6420 else if (UTF) {
80aecb99 6421 if (FOLD) {
60a8b682 6422 /* Emit all the Unicode characters. */
1df70142 6423 STRLEN numlen;
80aecb99
JH
6424 for (foldbuf = tmpbuf;
6425 foldlen;
6426 foldlen -= numlen) {
6427 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6428 if (numlen > 0) {
71207a34 6429 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6430 s += unilen;
6431 len += unilen;
6432 /* In EBCDIC the numlen
6433 * and unilen can differ. */
9dc45d57 6434 foldbuf += numlen;
47654450
JH
6435 if (numlen >= foldlen)
6436 break;
9dc45d57
JH
6437 }
6438 else
6439 break; /* "Can't happen." */
80aecb99
JH
6440 }
6441 }
6442 else {
71207a34 6443 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6444 if (unilen > 0) {
0ebc6274
JH
6445 s += unilen;
6446 len += unilen;
9dc45d57 6447 }
80aecb99 6448 }
a0ed51b3 6449 }
a0d0e21e
LW
6450 else {
6451 len++;
eb160463 6452 REGC((char)ender, s++);
a0d0e21e
LW
6453 }
6454 break;
a687059c 6455 }
16ea2a2e 6456 if (UTF) {
80aecb99 6457 if (FOLD) {
60a8b682 6458 /* Emit all the Unicode characters. */
1df70142 6459 STRLEN numlen;
80aecb99
JH
6460 for (foldbuf = tmpbuf;
6461 foldlen;
6462 foldlen -= numlen) {
6463 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6464 if (numlen > 0) {
71207a34 6465 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6466 len += unilen;
6467 s += unilen;
6468 /* In EBCDIC the numlen
6469 * and unilen can differ. */
9dc45d57 6470 foldbuf += numlen;
47654450
JH
6471 if (numlen >= foldlen)
6472 break;
9dc45d57
JH
6473 }
6474 else
6475 break;
80aecb99
JH
6476 }
6477 }
6478 else {
71207a34 6479 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6480 if (unilen > 0) {
0ebc6274
JH
6481 s += unilen;
6482 len += unilen;
9dc45d57 6483 }
80aecb99
JH
6484 }
6485 len--;
a0ed51b3
LW
6486 }
6487 else
eb160463 6488 REGC((char)ender, s++);
a0d0e21e
LW
6489 }
6490 loopdone:
830247a4 6491 RExC_parse = p - 1;
fac92740 6492 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6493 nextchar(pRExC_state);
793db0cb
JH
6494 {
6495 /* len is STRLEN which is unsigned, need to copy to signed */
6496 IV iv = len;
6497 if (iv < 0)
6498 vFAIL("Internal disaster");
6499 }
a0d0e21e
LW
6500 if (len > 0)
6501 *flagp |= HASWIDTH;
090f7165 6502 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6503 *flagp |= SIMPLE;
3dab1dad 6504
cd439c50 6505 if (SIZE_ONLY)
830247a4 6506 RExC_size += STR_SZ(len);
3dab1dad
YO
6507 else {
6508 STR_LEN(ret) = len;
830247a4 6509 RExC_emit += STR_SZ(len);
07be1b83 6510 }
3dab1dad 6511 }
a0d0e21e
LW
6512 break;
6513 }
a687059c 6514
a0d0e21e 6515 return(ret);
a687059c
LW
6516}
6517
873ef191 6518STATIC char *
5f66b61c 6519S_regwhite(char *p, const char *e)
5b5a24f7
CS
6520{
6521 while (p < e) {
6522 if (isSPACE(*p))
6523 ++p;
6524 else if (*p == '#') {
6525 do {
6526 p++;
6527 } while (p < e && *p != '\n');
6528 }
6529 else
6530 break;
6531 }
6532 return p;
6533}
6534
b8c5462f
JH
6535/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6536 Character classes ([:foo:]) can also be negated ([:^foo:]).
6537 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6538 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6539 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6540
6541#define POSIXCC_DONE(c) ((c) == ':')
6542#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6543#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6544
b8c5462f 6545STATIC I32
830247a4 6546S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6547{
97aff369 6548 dVAR;
936ed897 6549 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6550
830247a4 6551 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6552 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6553 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6554 const char c = UCHARAT(RExC_parse);
097eb12c 6555 char* const s = RExC_parse++;
b81d288d 6556
9a86a77b 6557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6558 RExC_parse++;
6559 if (RExC_parse == RExC_end)
620e46c5 6560 /* Grandfather lone [:, [=, [. */
830247a4 6561 RExC_parse = s;
620e46c5 6562 else {
3dab1dad 6563 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6564 assert(*t == c);
6565
9a86a77b 6566 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6567 const char *posixcc = s + 1;
830247a4 6568 RExC_parse++; /* skip over the ending ] */
3dab1dad 6569
b8c5462f 6570 if (*s == ':') {
1df70142
AL
6571 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6572 const I32 skip = t - posixcc;
80916619
NC
6573
6574 /* Initially switch on the length of the name. */
6575 switch (skip) {
6576 case 4:
3dab1dad
YO
6577 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6578 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6579 break;
80916619
NC
6580 case 5:
6581 /* Names all of length 5. */
6582 /* alnum alpha ascii blank cntrl digit graph lower
6583 print punct space upper */
6584 /* Offset 4 gives the best switch position. */
6585 switch (posixcc[4]) {
6586 case 'a':
3dab1dad
YO
6587 if (memEQ(posixcc, "alph", 4)) /* alpha */
6588 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6589 break;
6590 case 'e':
3dab1dad
YO
6591 if (memEQ(posixcc, "spac", 4)) /* space */
6592 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6593 break;
6594 case 'h':
3dab1dad
YO
6595 if (memEQ(posixcc, "grap", 4)) /* graph */
6596 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6597 break;
6598 case 'i':
3dab1dad
YO
6599 if (memEQ(posixcc, "asci", 4)) /* ascii */
6600 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6601 break;
6602 case 'k':
3dab1dad
YO
6603 if (memEQ(posixcc, "blan", 4)) /* blank */
6604 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6605 break;
6606 case 'l':
3dab1dad
YO
6607 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6608 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6609 break;
6610 case 'm':
3dab1dad
YO
6611 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6612 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6613 break;
6614 case 'r':
3dab1dad
YO
6615 if (memEQ(posixcc, "lowe", 4)) /* lower */
6616 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6617 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6618 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6619 break;
6620 case 't':
3dab1dad
YO
6621 if (memEQ(posixcc, "digi", 4)) /* digit */
6622 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6623 else if (memEQ(posixcc, "prin", 4)) /* print */
6624 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6625 else if (memEQ(posixcc, "punc", 4)) /* punct */
6626 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6627 break;
b8c5462f
JH
6628 }
6629 break;
80916619 6630 case 6:
3dab1dad
YO
6631 if (memEQ(posixcc, "xdigit", 6))
6632 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6633 break;
6634 }
80916619
NC
6635
6636 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6637 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6638 t - s - 1, s + 1);
80916619
NC
6639 assert (posixcc[skip] == ':');
6640 assert (posixcc[skip+1] == ']');
b45f050a 6641 } else if (!SIZE_ONLY) {
b8c5462f 6642 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6643
830247a4 6644 /* adjust RExC_parse so the warning shows after
b45f050a 6645 the class closes */
9a86a77b 6646 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6647 RExC_parse++;
b45f050a
JF
6648 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6649 }
b8c5462f
JH
6650 } else {
6651 /* Maternal grandfather:
6652 * "[:" ending in ":" but not in ":]" */
830247a4 6653 RExC_parse = s;
767d463e 6654 }
620e46c5
JH
6655 }
6656 }
6657
b8c5462f
JH
6658 return namedclass;
6659}
6660
6661STATIC void
830247a4 6662S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6663{
97aff369 6664 dVAR;
3dab1dad 6665 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6666 const char *s = RExC_parse;
6667 const char c = *s++;
b8c5462f 6668
3dab1dad 6669 while (isALNUM(*s))
b8c5462f
JH
6670 s++;
6671 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6672 if (ckWARN(WARN_REGEXP))
6673 vWARN3(s+2,
6674 "POSIX syntax [%c %c] belongs inside character classes",
6675 c, c);
b45f050a
JF
6676
6677 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6678 if (POSIXCC_NOTYET(c)) {
830247a4 6679 /* adjust RExC_parse so the error shows after
b45f050a 6680 the class closes */
9a86a77b 6681 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6682 NOOP;
b45f050a
JF
6683 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6684 }
b8c5462f
JH
6685 }
6686 }
620e46c5
JH
6687}
6688
7f6f358c
YO
6689
6690/*
6691 parse a class specification and produce either an ANYOF node that
6692 matches the pattern. If the pattern matches a single char only and
6693 that char is < 256 then we produce an EXACT node instead.
6694*/
76e3520e 6695STATIC regnode *
3dab1dad 6696S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6697{
97aff369 6698 dVAR;
9ef43ace 6699 register UV value = 0;
9a86a77b 6700 register UV nextvalue;
3568d838 6701 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6702 register IV range = 0;
c277df42 6703 register regnode *ret;
ba210ebe 6704 STRLEN numlen;
ffc61ed2 6705 IV namedclass;
cbbf8932 6706 char *rangebegin = NULL;
936ed897 6707 bool need_class = 0;
c445ea15 6708 SV *listsv = NULL;
ffc61ed2 6709 UV n;
9e55ce06 6710 bool optimize_invert = TRUE;
cbbf8932 6711 AV* unicode_alternate = NULL;
1b2d223b
JH
6712#ifdef EBCDIC
6713 UV literal_endpoint = 0;
6714#endif
7f6f358c 6715 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6716
3dab1dad 6717 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6718 case we need to change the emitted regop to an EXACT. */
07be1b83 6719 const char * orig_parse = RExC_parse;
72f13be8 6720 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6721#ifndef DEBUGGING
6722 PERL_UNUSED_ARG(depth);
6723#endif
72f13be8 6724
3dab1dad 6725 DEBUG_PARSE("clas");
7f6f358c
YO
6726
6727 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6728 ret = reganode(pRExC_state, ANYOF, 0);
6729
6730 if (!SIZE_ONLY)
6731 ANYOF_FLAGS(ret) = 0;
6732
9a86a77b 6733 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6734 RExC_naughty++;
6735 RExC_parse++;
6736 if (!SIZE_ONLY)
6737 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6738 }
a0d0e21e 6739
73060fc4 6740 if (SIZE_ONLY) {
830247a4 6741 RExC_size += ANYOF_SKIP;
73060fc4
JH
6742 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6743 }
936ed897 6744 else {
830247a4 6745 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6746 if (FOLD)
6747 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6748 if (LOC)
6749 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6750 ANYOF_BITMAP_ZERO(ret);
396482e1 6751 listsv = newSVpvs("# comment\n");
a0d0e21e 6752 }
b8c5462f 6753
9a86a77b
JH
6754 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6755
b938889d 6756 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6757 checkposixcc(pRExC_state);
b8c5462f 6758
f064b6ad
HS
6759 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6760 if (UCHARAT(RExC_parse) == ']')
6761 goto charclassloop;
ffc61ed2 6762
fc8cd66c 6763parseit:
9a86a77b 6764 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6765
6766 charclassloop:
6767
6768 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6769
73b437c8 6770 if (!range)
830247a4 6771 rangebegin = RExC_parse;
ffc61ed2 6772 if (UTF) {
5e12f4fb 6773 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6774 RExC_end - RExC_parse,
9f7f3913 6775 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6776 RExC_parse += numlen;
6777 }
6778 else
6779 value = UCHARAT(RExC_parse++);
7f6f358c 6780
9a86a77b
JH
6781 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6782 if (value == '[' && POSIXCC(nextvalue))
830247a4 6783 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6784 else if (value == '\\') {
ffc61ed2 6785 if (UTF) {
5e12f4fb 6786 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6787 RExC_end - RExC_parse,
9f7f3913 6788 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6789 RExC_parse += numlen;
6790 }
6791 else
6792 value = UCHARAT(RExC_parse++);
470c3474 6793 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6794 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6795 * be a problem later if we want switch on Unicode.
6796 * A similar issue a little bit later when switching on
6797 * namedclass. --jhi */
ffc61ed2 6798 switch ((I32)value) {
b8c5462f
JH
6799 case 'w': namedclass = ANYOF_ALNUM; break;
6800 case 'W': namedclass = ANYOF_NALNUM; break;
6801 case 's': namedclass = ANYOF_SPACE; break;
6802 case 'S': namedclass = ANYOF_NSPACE; break;
6803 case 'd': namedclass = ANYOF_DIGIT; break;
6804 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6805 case 'N': /* Handle \N{NAME} in class */
6806 {
6807 /* We only pay attention to the first char of
6808 multichar strings being returned. I kinda wonder
6809 if this makes sense as it does change the behaviour
6810 from earlier versions, OTOH that behaviour was broken
6811 as well. */
6812 UV v; /* value is register so we cant & it /grrr */
6813 if (reg_namedseq(pRExC_state, &v)) {
6814 goto parseit;
6815 }
6816 value= v;
6817 }
6818 break;
ffc61ed2
JH
6819 case 'p':
6820 case 'P':
3dab1dad
YO
6821 {
6822 char *e;
af6f566e 6823 if (RExC_parse >= RExC_end)
2a4859cd 6824 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6825 if (*RExC_parse == '{') {
1df70142 6826 const U8 c = (U8)value;
ffc61ed2
JH
6827 e = strchr(RExC_parse++, '}');
6828 if (!e)
0da60cf5 6829 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6830 while (isSPACE(UCHARAT(RExC_parse)))
6831 RExC_parse++;
6832 if (e == RExC_parse)
0da60cf5 6833 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6834 n = e - RExC_parse;
ab13f0c7
JH
6835 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6836 n--;
ffc61ed2
JH
6837 }
6838 else {
6839 e = RExC_parse;
6840 n = 1;
6841 }
6842 if (!SIZE_ONLY) {
ab13f0c7
JH
6843 if (UCHARAT(RExC_parse) == '^') {
6844 RExC_parse++;
6845 n--;
6846 value = value == 'p' ? 'P' : 'p'; /* toggle */
6847 while (isSPACE(UCHARAT(RExC_parse))) {
6848 RExC_parse++;
6849 n--;
6850 }
6851 }
097eb12c
AL
6852 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6853 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6854 }
6855 RExC_parse = e + 1;
6856 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6857 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6858 }
f81125e2 6859 break;
b8c5462f
JH
6860 case 'n': value = '\n'; break;
6861 case 'r': value = '\r'; break;
6862 case 't': value = '\t'; break;
6863 case 'f': value = '\f'; break;
6864 case 'b': value = '\b'; break;
c7f1f016
NIS
6865 case 'e': value = ASCII_TO_NATIVE('\033');break;
6866 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6867 case 'x':
ffc61ed2 6868 if (*RExC_parse == '{') {
a4c04bdc
NC
6869 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6870 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6871 char * const e = strchr(RExC_parse++, '}');
b81d288d 6872 if (!e)
ffc61ed2 6873 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6874
6875 numlen = e - RExC_parse;
6876 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6877 RExC_parse = e + 1;
6878 }
6879 else {
a4c04bdc 6880 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6881 numlen = 2;
6882 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6883 RExC_parse += numlen;
6884 }
9e08bc66
TS
6885 if (PL_encoding && value < 0x100)
6886 goto recode_encoding;
b8c5462f
JH
6887 break;
6888 case 'c':
830247a4 6889 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6890 value = toCTRL(value);
6891 break;
6892 case '0': case '1': case '2': case '3': case '4':
6893 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
6894 {
6895 I32 flags = 0;
6896 numlen = 3;
6897 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6898 RExC_parse += numlen;
6899 if (PL_encoding && value < 0x100)
6900 goto recode_encoding;
6901 break;
6902 }
6903 recode_encoding:
6904 {
6905 SV* enc = PL_encoding;
6906 value = reg_recode((const char)(U8)value, &enc);
6907 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6908 vWARN(RExC_parse,
6909 "Invalid escape in the specified encoding");
6910 break;
6911 }
1028017a 6912 default:
041457d9 6913 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6914 vWARN2(RExC_parse,
6915 "Unrecognized escape \\%c in character class passed through",
6916 (int)value);
1028017a 6917 break;
b8c5462f 6918 }
ffc61ed2 6919 } /* end of \blah */
1b2d223b
JH
6920#ifdef EBCDIC
6921 else
6922 literal_endpoint++;
6923#endif
ffc61ed2
JH
6924
6925 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6926
6927 if (!SIZE_ONLY && !need_class)
936ed897 6928 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6929
936ed897 6930 need_class = 1;
ffc61ed2
JH
6931
6932 /* a bad range like a-\d, a-[:digit:] ? */
6933 if (range) {
73b437c8 6934 if (!SIZE_ONLY) {
afd78fd5 6935 if (ckWARN(WARN_REGEXP)) {
097eb12c 6936 const int w =
afd78fd5
JH
6937 RExC_parse >= rangebegin ?
6938 RExC_parse - rangebegin : 0;
830247a4 6939 vWARN4(RExC_parse,
b45f050a 6940 "False [] range \"%*.*s\"",
097eb12c 6941 w, w, rangebegin);
afd78fd5 6942 }
3568d838
JH
6943 if (prevvalue < 256) {
6944 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6945 ANYOF_BITMAP_SET(ret, '-');
6946 }
6947 else {
6948 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6949 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6950 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6951 }
b8c5462f 6952 }
ffc61ed2
JH
6953
6954 range = 0; /* this was not a true range */
73b437c8 6955 }
ffc61ed2 6956
73b437c8 6957 if (!SIZE_ONLY) {
c49a72a9
NC
6958 const char *what = NULL;
6959 char yesno = 0;
6960
3568d838
JH
6961 if (namedclass > OOB_NAMEDCLASS)
6962 optimize_invert = FALSE;
e2962f66
JH
6963 /* Possible truncation here but in some 64-bit environments
6964 * the compiler gets heartburn about switch on 64-bit values.
6965 * A similar issue a little earlier when switching on value.
98f323fa 6966 * --jhi */
e2962f66 6967 switch ((I32)namedclass) {
73b437c8
JH
6968 case ANYOF_ALNUM:
6969 if (LOC)
936ed897 6970 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6971 else {
6972 for (value = 0; value < 256; value++)
6973 if (isALNUM(value))
936ed897 6974 ANYOF_BITMAP_SET(ret, value);
73b437c8 6975 }
c49a72a9
NC
6976 yesno = '+';
6977 what = "Word";
73b437c8
JH
6978 break;
6979 case ANYOF_NALNUM:
6980 if (LOC)
936ed897 6981 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6982 else {
6983 for (value = 0; value < 256; value++)
6984 if (!isALNUM(value))
936ed897 6985 ANYOF_BITMAP_SET(ret, value);
73b437c8 6986 }
c49a72a9
NC
6987 yesno = '!';
6988 what = "Word";
73b437c8 6989 break;
ffc61ed2 6990 case ANYOF_ALNUMC:
73b437c8 6991 if (LOC)
ffc61ed2 6992 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6993 else {
6994 for (value = 0; value < 256; value++)
ffc61ed2 6995 if (isALNUMC(value))
936ed897 6996 ANYOF_BITMAP_SET(ret, value);
73b437c8 6997 }
c49a72a9
NC
6998 yesno = '+';
6999 what = "Alnum";
73b437c8
JH
7000 break;
7001 case ANYOF_NALNUMC:
7002 if (LOC)
936ed897 7003 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7004 else {
7005 for (value = 0; value < 256; value++)
7006 if (!isALNUMC(value))
936ed897 7007 ANYOF_BITMAP_SET(ret, value);
73b437c8 7008 }
c49a72a9
NC
7009 yesno = '!';
7010 what = "Alnum";
73b437c8
JH
7011 break;
7012 case ANYOF_ALPHA:
7013 if (LOC)
936ed897 7014 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7015 else {
7016 for (value = 0; value < 256; value++)
7017 if (isALPHA(value))
936ed897 7018 ANYOF_BITMAP_SET(ret, value);
73b437c8 7019 }
c49a72a9
NC
7020 yesno = '+';
7021 what = "Alpha";
73b437c8
JH
7022 break;
7023 case ANYOF_NALPHA:
7024 if (LOC)
936ed897 7025 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7026 else {
7027 for (value = 0; value < 256; value++)
7028 if (!isALPHA(value))
936ed897 7029 ANYOF_BITMAP_SET(ret, value);
73b437c8 7030 }
c49a72a9
NC
7031 yesno = '!';
7032 what = "Alpha";
73b437c8
JH
7033 break;
7034 case ANYOF_ASCII:
7035 if (LOC)
936ed897 7036 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7037 else {
c7f1f016 7038#ifndef EBCDIC
1ba5c669
JH
7039 for (value = 0; value < 128; value++)
7040 ANYOF_BITMAP_SET(ret, value);
7041#else /* EBCDIC */
ffbc6a93 7042 for (value = 0; value < 256; value++) {
3a3c4447
JH
7043 if (isASCII(value))
7044 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7045 }
1ba5c669 7046#endif /* EBCDIC */
73b437c8 7047 }
c49a72a9
NC
7048 yesno = '+';
7049 what = "ASCII";
73b437c8
JH
7050 break;
7051 case ANYOF_NASCII:
7052 if (LOC)
936ed897 7053 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7054 else {
c7f1f016 7055#ifndef EBCDIC
1ba5c669
JH
7056 for (value = 128; value < 256; value++)
7057 ANYOF_BITMAP_SET(ret, value);
7058#else /* EBCDIC */
ffbc6a93 7059 for (value = 0; value < 256; value++) {
3a3c4447
JH
7060 if (!isASCII(value))
7061 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7062 }
1ba5c669 7063#endif /* EBCDIC */
73b437c8 7064 }
c49a72a9
NC
7065 yesno = '!';
7066 what = "ASCII";
73b437c8 7067 break;
aaa51d5e
JF
7068 case ANYOF_BLANK:
7069 if (LOC)
7070 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7071 else {
7072 for (value = 0; value < 256; value++)
7073 if (isBLANK(value))
7074 ANYOF_BITMAP_SET(ret, value);
7075 }
c49a72a9
NC
7076 yesno = '+';
7077 what = "Blank";
aaa51d5e
JF
7078 break;
7079 case ANYOF_NBLANK:
7080 if (LOC)
7081 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7082 else {
7083 for (value = 0; value < 256; value++)
7084 if (!isBLANK(value))
7085 ANYOF_BITMAP_SET(ret, value);
7086 }
c49a72a9
NC
7087 yesno = '!';
7088 what = "Blank";
aaa51d5e 7089 break;
73b437c8
JH
7090 case ANYOF_CNTRL:
7091 if (LOC)
936ed897 7092 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7093 else {
7094 for (value = 0; value < 256; value++)
7095 if (isCNTRL(value))
936ed897 7096 ANYOF_BITMAP_SET(ret, value);
73b437c8 7097 }
c49a72a9
NC
7098 yesno = '+';
7099 what = "Cntrl";
73b437c8
JH
7100 break;
7101 case ANYOF_NCNTRL:
7102 if (LOC)
936ed897 7103 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7104 else {
7105 for (value = 0; value < 256; value++)
7106 if (!isCNTRL(value))
936ed897 7107 ANYOF_BITMAP_SET(ret, value);
73b437c8 7108 }
c49a72a9
NC
7109 yesno = '!';
7110 what = "Cntrl";
ffc61ed2
JH
7111 break;
7112 case ANYOF_DIGIT:
7113 if (LOC)
7114 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7115 else {
7116 /* consecutive digits assumed */
7117 for (value = '0'; value <= '9'; value++)
7118 ANYOF_BITMAP_SET(ret, value);
7119 }
c49a72a9
NC
7120 yesno = '+';
7121 what = "Digit";
ffc61ed2
JH
7122 break;
7123 case ANYOF_NDIGIT:
7124 if (LOC)
7125 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7126 else {
7127 /* consecutive digits assumed */
7128 for (value = 0; value < '0'; value++)
7129 ANYOF_BITMAP_SET(ret, value);
7130 for (value = '9' + 1; value < 256; value++)
7131 ANYOF_BITMAP_SET(ret, value);
7132 }
c49a72a9
NC
7133 yesno = '!';
7134 what = "Digit";
73b437c8
JH
7135 break;
7136 case ANYOF_GRAPH:
7137 if (LOC)
936ed897 7138 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7139 else {
7140 for (value = 0; value < 256; value++)
7141 if (isGRAPH(value))
936ed897 7142 ANYOF_BITMAP_SET(ret, value);
73b437c8 7143 }
c49a72a9
NC
7144 yesno = '+';
7145 what = "Graph";
73b437c8
JH
7146 break;
7147 case ANYOF_NGRAPH:
7148 if (LOC)
936ed897 7149 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7150 else {
7151 for (value = 0; value < 256; value++)
7152 if (!isGRAPH(value))
936ed897 7153 ANYOF_BITMAP_SET(ret, value);
73b437c8 7154 }
c49a72a9
NC
7155 yesno = '!';
7156 what = "Graph";
73b437c8
JH
7157 break;
7158 case ANYOF_LOWER:
7159 if (LOC)
936ed897 7160 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7161 else {
7162 for (value = 0; value < 256; value++)
7163 if (isLOWER(value))
936ed897 7164 ANYOF_BITMAP_SET(ret, value);
73b437c8 7165 }
c49a72a9
NC
7166 yesno = '+';
7167 what = "Lower";
73b437c8
JH
7168 break;
7169 case ANYOF_NLOWER:
7170 if (LOC)
936ed897 7171 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7172 else {
7173 for (value = 0; value < 256; value++)
7174 if (!isLOWER(value))
936ed897 7175 ANYOF_BITMAP_SET(ret, value);
73b437c8 7176 }
c49a72a9
NC
7177 yesno = '!';
7178 what = "Lower";
73b437c8
JH
7179 break;
7180 case ANYOF_PRINT:
7181 if (LOC)
936ed897 7182 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7183 else {
7184 for (value = 0; value < 256; value++)
7185 if (isPRINT(value))
936ed897 7186 ANYOF_BITMAP_SET(ret, value);
73b437c8 7187 }
c49a72a9
NC
7188 yesno = '+';
7189 what = "Print";
73b437c8
JH
7190 break;
7191 case ANYOF_NPRINT:
7192 if (LOC)
936ed897 7193 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7194 else {
7195 for (value = 0; value < 256; value++)
7196 if (!isPRINT(value))
936ed897 7197 ANYOF_BITMAP_SET(ret, value);
73b437c8 7198 }
c49a72a9
NC
7199 yesno = '!';
7200 what = "Print";
73b437c8 7201 break;
aaa51d5e
JF
7202 case ANYOF_PSXSPC:
7203 if (LOC)
7204 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7205 else {
7206 for (value = 0; value < 256; value++)
7207 if (isPSXSPC(value))
7208 ANYOF_BITMAP_SET(ret, value);
7209 }
c49a72a9
NC
7210 yesno = '+';
7211 what = "Space";
aaa51d5e
JF
7212 break;
7213 case ANYOF_NPSXSPC:
7214 if (LOC)
7215 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7216 else {
7217 for (value = 0; value < 256; value++)
7218 if (!isPSXSPC(value))
7219 ANYOF_BITMAP_SET(ret, value);
7220 }
c49a72a9
NC
7221 yesno = '!';
7222 what = "Space";
aaa51d5e 7223 break;
73b437c8
JH
7224 case ANYOF_PUNCT:
7225 if (LOC)
936ed897 7226 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7227 else {
7228 for (value = 0; value < 256; value++)
7229 if (isPUNCT(value))
936ed897 7230 ANYOF_BITMAP_SET(ret, value);
73b437c8 7231 }
c49a72a9
NC
7232 yesno = '+';
7233 what = "Punct";
73b437c8
JH
7234 break;
7235 case ANYOF_NPUNCT:
7236 if (LOC)
936ed897 7237 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7238 else {
7239 for (value = 0; value < 256; value++)
7240 if (!isPUNCT(value))
936ed897 7241 ANYOF_BITMAP_SET(ret, value);
73b437c8 7242 }
c49a72a9
NC
7243 yesno = '!';
7244 what = "Punct";
ffc61ed2
JH
7245 break;
7246 case ANYOF_SPACE:
7247 if (LOC)
7248 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7249 else {
7250 for (value = 0; value < 256; value++)
7251 if (isSPACE(value))
7252 ANYOF_BITMAP_SET(ret, value);
7253 }
c49a72a9
NC
7254 yesno = '+';
7255 what = "SpacePerl";
ffc61ed2
JH
7256 break;
7257 case ANYOF_NSPACE:
7258 if (LOC)
7259 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7260 else {
7261 for (value = 0; value < 256; value++)
7262 if (!isSPACE(value))
7263 ANYOF_BITMAP_SET(ret, value);
7264 }
c49a72a9
NC
7265 yesno = '!';
7266 what = "SpacePerl";
73b437c8
JH
7267 break;
7268 case ANYOF_UPPER:
7269 if (LOC)
936ed897 7270 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7271 else {
7272 for (value = 0; value < 256; value++)
7273 if (isUPPER(value))
936ed897 7274 ANYOF_BITMAP_SET(ret, value);
73b437c8 7275 }
c49a72a9
NC
7276 yesno = '+';
7277 what = "Upper";
73b437c8
JH
7278 break;
7279 case ANYOF_NUPPER:
7280 if (LOC)
936ed897 7281 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7282 else {
7283 for (value = 0; value < 256; value++)
7284 if (!isUPPER(value))
936ed897 7285 ANYOF_BITMAP_SET(ret, value);
73b437c8 7286 }
c49a72a9
NC
7287 yesno = '!';
7288 what = "Upper";
73b437c8
JH
7289 break;
7290 case ANYOF_XDIGIT:
7291 if (LOC)
936ed897 7292 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7293 else {
7294 for (value = 0; value < 256; value++)
7295 if (isXDIGIT(value))
936ed897 7296 ANYOF_BITMAP_SET(ret, value);
73b437c8 7297 }
c49a72a9
NC
7298 yesno = '+';
7299 what = "XDigit";
73b437c8
JH
7300 break;
7301 case ANYOF_NXDIGIT:
7302 if (LOC)
936ed897 7303 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7304 else {
7305 for (value = 0; value < 256; value++)
7306 if (!isXDIGIT(value))
936ed897 7307 ANYOF_BITMAP_SET(ret, value);
73b437c8 7308 }
c49a72a9
NC
7309 yesno = '!';
7310 what = "XDigit";
73b437c8 7311 break;
f81125e2
JP
7312 case ANYOF_MAX:
7313 /* this is to handle \p and \P */
7314 break;
73b437c8 7315 default:
b45f050a 7316 vFAIL("Invalid [::] class");
73b437c8 7317 break;
b8c5462f 7318 }
c49a72a9
NC
7319 if (what) {
7320 /* Strings such as "+utf8::isWord\n" */
7321 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7322 }
b8c5462f 7323 if (LOC)
936ed897 7324 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7325 continue;
a0d0e21e 7326 }
ffc61ed2
JH
7327 } /* end of namedclass \blah */
7328
a0d0e21e 7329 if (range) {
eb160463 7330 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7331 const int w = RExC_parse - rangebegin;
7332 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7333 range = 0; /* not a valid range */
73b437c8 7334 }
a0d0e21e
LW
7335 }
7336 else {
3568d838 7337 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7338 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7339 RExC_parse[1] != ']') {
7340 RExC_parse++;
ffc61ed2
JH
7341
7342 /* a bad range like \w-, [:word:]- ? */
7343 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7344 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7345 const int w =
afd78fd5
JH
7346 RExC_parse >= rangebegin ?
7347 RExC_parse - rangebegin : 0;
830247a4 7348 vWARN4(RExC_parse,
b45f050a 7349 "False [] range \"%*.*s\"",
097eb12c 7350 w, w, rangebegin);
afd78fd5 7351 }
73b437c8 7352 if (!SIZE_ONLY)
936ed897 7353 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7354 } else
ffc61ed2
JH
7355 range = 1; /* yeah, it's a range! */
7356 continue; /* but do it the next time */
a0d0e21e 7357 }
a687059c 7358 }
ffc61ed2 7359
93733859 7360 /* now is the next time */
07be1b83 7361 /*stored += (value - prevvalue + 1);*/
ae5c130c 7362 if (!SIZE_ONLY) {
3568d838 7363 if (prevvalue < 256) {
1df70142 7364 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7365 IV i;
3568d838 7366#ifdef EBCDIC
1b2d223b
JH
7367 /* In EBCDIC [\x89-\x91] should include
7368 * the \x8e but [i-j] should not. */
7369 if (literal_endpoint == 2 &&
7370 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7371 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7372 {
3568d838
JH
7373 if (isLOWER(prevvalue)) {
7374 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7375 if (isLOWER(i))
7376 ANYOF_BITMAP_SET(ret, i);
7377 } else {
3568d838 7378 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7379 if (isUPPER(i))
7380 ANYOF_BITMAP_SET(ret, i);
7381 }
8ada0baa 7382 }
ffc61ed2 7383 else
8ada0baa 7384#endif
07be1b83
YO
7385 for (i = prevvalue; i <= ceilvalue; i++) {
7386 if (!ANYOF_BITMAP_TEST(ret,i)) {
7387 stored++;
7388 ANYOF_BITMAP_SET(ret, i);
7389 }
7390 }
3568d838 7391 }
a5961de5 7392 if (value > 255 || UTF) {
1df70142
AL
7393 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7394 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7395 stored+=2; /* can't optimize this class */
ffc61ed2 7396 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7397 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7398 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7399 prevnatvalue, natvalue);
7400 }
7401 else if (prevnatvalue == natvalue) {
7402 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7403 if (FOLD) {
89ebb4a3 7404 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7405 STRLEN foldlen;
1df70142 7406 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7407
e294cc5d
JH
7408#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7409 if (RExC_precomp[0] == ':' &&
7410 RExC_precomp[1] == '[' &&
7411 (f == 0xDF || f == 0x92)) {
7412 f = NATIVE_TO_UNI(f);
7413 }
7414#endif
c840d2a2
JH
7415 /* If folding and foldable and a single
7416 * character, insert also the folded version
7417 * to the charclass. */
9e55ce06 7418 if (f != value) {
e294cc5d
JH
7419#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7420 if ((RExC_precomp[0] == ':' &&
7421 RExC_precomp[1] == '[' &&
7422 (f == 0xA2 &&
7423 (value == 0xFB05 || value == 0xFB06))) ?
7424 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7425 foldlen == (STRLEN)UNISKIP(f) )
7426#else
eb160463 7427 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7428#endif
9e55ce06
JH
7429 Perl_sv_catpvf(aTHX_ listsv,
7430 "%04"UVxf"\n", f);
7431 else {
7432 /* Any multicharacter foldings
7433 * require the following transform:
7434 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7435 * where E folds into "pq" and F folds
7436 * into "rst", all other characters
7437 * fold to single characters. We save
7438 * away these multicharacter foldings,
7439 * to be later saved as part of the
7440 * additional "s" data. */
7441 SV *sv;
7442
7443 if (!unicode_alternate)
7444 unicode_alternate = newAV();
7445 sv = newSVpvn((char*)foldbuf, foldlen);
7446 SvUTF8_on(sv);
7447 av_push(unicode_alternate, sv);
7448 }
7449 }
254ba52a 7450
60a8b682
JH
7451 /* If folding and the value is one of the Greek
7452 * sigmas insert a few more sigmas to make the
7453 * folding rules of the sigmas to work right.
7454 * Note that not all the possible combinations
7455 * are handled here: some of them are handled
9e55ce06
JH
7456 * by the standard folding rules, and some of
7457 * them (literal or EXACTF cases) are handled
7458 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7459 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7460 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7461 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7462 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7463 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7464 }
7465 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7466 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7467 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7468 }
7469 }
ffc61ed2 7470 }
1b2d223b
JH
7471#ifdef EBCDIC
7472 literal_endpoint = 0;
7473#endif
8ada0baa 7474 }
ffc61ed2
JH
7475
7476 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7477 }
ffc61ed2 7478
936ed897 7479 if (need_class) {
4f66b38d 7480 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7481 if (SIZE_ONLY)
830247a4 7482 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7483 else
830247a4 7484 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7485 }
ffc61ed2 7486
7f6f358c
YO
7487
7488 if (SIZE_ONLY)
7489 return ret;
7490 /****** !SIZE_ONLY AFTER HERE *********/
7491
7492 if( stored == 1 && value < 256
7493 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7494 ) {
7495 /* optimize single char class to an EXACT node
7496 but *only* when its not a UTF/high char */
07be1b83
YO
7497 const char * cur_parse= RExC_parse;
7498 RExC_emit = (regnode *)orig_emit;
7499 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7500 ret = reg_node(pRExC_state,
7501 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7502 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7503 *STRING(ret)= (char)value;
7504 STR_LEN(ret)= 1;
7505 RExC_emit += STR_SZ(1);
7506 return ret;
7507 }
ae5c130c 7508 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7509 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7510 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7511 ) {
a0ed51b3 7512 for (value = 0; value < 256; ++value) {
936ed897 7513 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7514 UV fold = PL_fold[value];
ffc61ed2
JH
7515
7516 if (fold != value)
7517 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7518 }
7519 }
936ed897 7520 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7521 }
ffc61ed2 7522
ae5c130c 7523 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7524 if (optimize_invert &&
ffc61ed2
JH
7525 /* If the only flag is inversion. */
7526 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7527 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7528 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7529 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7530 }
7f6f358c 7531 {
097eb12c 7532 AV * const av = newAV();
ffc61ed2 7533 SV *rv;
9e55ce06 7534 /* The 0th element stores the character class description
6a0407ee 7535 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7536 * to initialize the appropriate swash (which gets stored in
7537 * the 1st element), and also useful for dumping the regnode.
7538 * The 2nd element stores the multicharacter foldings,
6a0407ee 7539 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7540 av_store(av, 0, listsv);
7541 av_store(av, 1, NULL);
9e55ce06 7542 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7543 rv = newRV_noinc((SV*)av);
19860706 7544 n = add_data(pRExC_state, 1, "s");
830247a4 7545 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7546 ARG_SET(ret, n);
a0ed51b3 7547 }
a0ed51b3
LW
7548 return ret;
7549}
7550
76e3520e 7551STATIC char*
830247a4 7552S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7553{
097eb12c 7554 char* const retval = RExC_parse++;
a0d0e21e 7555
4633a7c4 7556 for (;;) {
830247a4
IZ
7557 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7558 RExC_parse[2] == '#') {
e994fd66
AE
7559 while (*RExC_parse != ')') {
7560 if (RExC_parse == RExC_end)
7561 FAIL("Sequence (?#... not terminated");
830247a4 7562 RExC_parse++;
e994fd66 7563 }
830247a4 7564 RExC_parse++;
4633a7c4
LW
7565 continue;
7566 }
e2509266 7567 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7568 if (isSPACE(*RExC_parse)) {
7569 RExC_parse++;
748a9306
LW
7570 continue;
7571 }
830247a4 7572 else if (*RExC_parse == '#') {
e994fd66
AE
7573 while (RExC_parse < RExC_end)
7574 if (*RExC_parse++ == '\n') break;
748a9306
LW
7575 continue;
7576 }
748a9306 7577 }
4633a7c4 7578 return retval;
a0d0e21e 7579 }
a687059c
LW
7580}
7581
7582/*
c277df42 7583- reg_node - emit a node
a0d0e21e 7584*/
76e3520e 7585STATIC regnode * /* Location. */
830247a4 7586S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7587{
97aff369 7588 dVAR;
c277df42 7589 register regnode *ptr;
504618e9 7590 regnode * const ret = RExC_emit;
07be1b83 7591 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7592
c277df42 7593 if (SIZE_ONLY) {
830247a4
IZ
7594 SIZE_ALIGN(RExC_size);
7595 RExC_size += 1;
a0d0e21e
LW
7596 return(ret);
7597 }
e2e6a0f1
YO
7598#ifdef DEBUGGING
7599 if (OP(RExC_emit) == 255)
7600 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7601 reg_name[op], OP(RExC_emit));
7602#endif
c277df42 7603 NODE_ALIGN_FILL(ret);
a0d0e21e 7604 ptr = ret;
c277df42 7605 FILL_ADVANCE_NODE(ptr, op);
fac92740 7606 if (RExC_offsets) { /* MJD */
07be1b83 7607 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7608 "reg_node", __LINE__,
7609 reg_name[op],
07be1b83
YO
7610 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7611 ? "Overwriting end of array!\n" : "OK",
7612 (UV)(RExC_emit - RExC_emit_start),
7613 (UV)(RExC_parse - RExC_start),
7614 (UV)RExC_offsets[0]));
ccb2c380 7615 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7616 }
07be1b83 7617
830247a4 7618 RExC_emit = ptr;
a0d0e21e 7619 return(ret);
a687059c
LW
7620}
7621
7622/*
a0d0e21e
LW
7623- reganode - emit a node with an argument
7624*/
76e3520e 7625STATIC regnode * /* Location. */
830247a4 7626S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7627{
97aff369 7628 dVAR;
c277df42 7629 register regnode *ptr;
504618e9 7630 regnode * const ret = RExC_emit;
07be1b83 7631 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7632
c277df42 7633 if (SIZE_ONLY) {
830247a4
IZ
7634 SIZE_ALIGN(RExC_size);
7635 RExC_size += 2;
6bda09f9
YO
7636 /*
7637 We can't do this:
7638
7639 assert(2==regarglen[op]+1);
7640
7641 Anything larger than this has to allocate the extra amount.
7642 If we changed this to be:
7643
7644 RExC_size += (1 + regarglen[op]);
7645
7646 then it wouldn't matter. Its not clear what side effect
7647 might come from that so its not done so far.
7648 -- dmq
7649 */
a0d0e21e
LW
7650 return(ret);
7651 }
e2e6a0f1
YO
7652#ifdef DEBUGGING
7653 if (OP(RExC_emit) == 255)
7654 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7655#endif
c277df42 7656 NODE_ALIGN_FILL(ret);
a0d0e21e 7657 ptr = ret;
c277df42 7658 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7659 if (RExC_offsets) { /* MJD */
07be1b83 7660 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7661 "reganode",
ccb2c380
MP
7662 __LINE__,
7663 reg_name[op],
07be1b83 7664 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7665 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7666 (UV)(RExC_emit - RExC_emit_start),
7667 (UV)(RExC_parse - RExC_start),
7668 (UV)RExC_offsets[0]));
ccb2c380 7669 Set_Cur_Node_Offset;
fac92740
MJD
7670 }
7671
830247a4 7672 RExC_emit = ptr;
a0d0e21e 7673 return(ret);
fe14fcc3
LW
7674}
7675
7676/*
cd439c50 7677- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7678*/
71207a34
AL
7679STATIC STRLEN
7680S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7681{
97aff369 7682 dVAR;
71207a34 7683 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7684}
7685
7686/*
a0d0e21e
LW
7687- reginsert - insert an operator in front of already-emitted operand
7688*
7689* Means relocating the operand.
7690*/
76e3520e 7691STATIC void
6bda09f9 7692S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7693{
97aff369 7694 dVAR;
c277df42
IZ
7695 register regnode *src;
7696 register regnode *dst;
7697 register regnode *place;
504618e9 7698 const int offset = regarglen[(U8)op];
6bda09f9 7699 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7700 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7701/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7702 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7703 if (SIZE_ONLY) {
6bda09f9 7704 RExC_size += size;
a0d0e21e
LW
7705 return;
7706 }
a687059c 7707
830247a4 7708 src = RExC_emit;
6bda09f9 7709 RExC_emit += size;
830247a4 7710 dst = RExC_emit;
40d049e4 7711 if (RExC_open_parens) {
6bda09f9 7712 int paren;
296c5fee 7713 DEBUG_PARSE_FMT("inst"," - %"IVdf,RExC_npar);
6bda09f9 7714 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7715 if ( RExC_open_parens[paren] >= opnd ) {
7716 DEBUG_PARSE_FMT("open"," - %d",size);
7717 RExC_open_parens[paren] += size;
7718 } else {
7719 DEBUG_PARSE_FMT("open"," - %s","ok");
7720 }
7721 if ( RExC_close_parens[paren] >= opnd ) {
7722 DEBUG_PARSE_FMT("close"," - %d",size);
7723 RExC_close_parens[paren] += size;
7724 } else {
7725 DEBUG_PARSE_FMT("close"," - %s","ok");
7726 }
7727 }
6bda09f9 7728 }
40d049e4 7729
fac92740 7730 while (src > opnd) {
c277df42 7731 StructCopy(--src, --dst, regnode);
fac92740 7732 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7733 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7734 "reg_insert",
ccb2c380
MP
7735 __LINE__,
7736 reg_name[op],
07be1b83
YO
7737 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7738 ? "Overwriting end of array!\n" : "OK",
7739 (UV)(src - RExC_emit_start),
7740 (UV)(dst - RExC_emit_start),
7741 (UV)RExC_offsets[0]));
ccb2c380
MP
7742 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7743 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7744 }
7745 }
7746
a0d0e21e
LW
7747
7748 place = opnd; /* Op node, where operand used to be. */
fac92740 7749 if (RExC_offsets) { /* MJD */
07be1b83 7750 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7751 "reginsert",
ccb2c380
MP
7752 __LINE__,
7753 reg_name[op],
07be1b83 7754 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7755 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7756 (UV)(place - RExC_emit_start),
7757 (UV)(RExC_parse - RExC_start),
786e8c11 7758 (UV)RExC_offsets[0]));
ccb2c380 7759 Set_Node_Offset(place, RExC_parse);
45948336 7760 Set_Node_Length(place, 1);
fac92740 7761 }
c277df42
IZ
7762 src = NEXTOPER(place);
7763 FILL_ADVANCE_NODE(place, op);
7764 Zero(src, offset, regnode);
a687059c
LW
7765}
7766
7767/*
c277df42 7768- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7769- SEE ALSO: regtail_study
a0d0e21e 7770*/
097eb12c 7771/* TODO: All three parms should be const */
76e3520e 7772STATIC void
3dab1dad 7773S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7774{
97aff369 7775 dVAR;
c277df42 7776 register regnode *scan;
72f13be8 7777 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7778#ifndef DEBUGGING
7779 PERL_UNUSED_ARG(depth);
7780#endif
a0d0e21e 7781
c277df42 7782 if (SIZE_ONLY)
a0d0e21e
LW
7783 return;
7784
7785 /* Find last node. */
7786 scan = p;
7787 for (;;) {
504618e9 7788 regnode * const temp = regnext(scan);
3dab1dad
YO
7789 DEBUG_PARSE_r({
7790 SV * const mysv=sv_newmortal();
7791 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7792 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7793 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7794 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7795 (temp == NULL ? "->" : ""),
7796 (temp == NULL ? reg_name[OP(val)] : "")
7797 );
3dab1dad
YO
7798 });
7799 if (temp == NULL)
7800 break;
7801 scan = temp;
7802 }
7803
7804 if (reg_off_by_arg[OP(scan)]) {
7805 ARG_SET(scan, val - scan);
7806 }
7807 else {
7808 NEXT_OFF(scan) = val - scan;
7809 }
7810}
7811
07be1b83 7812#ifdef DEBUGGING
3dab1dad
YO
7813/*
7814- regtail_study - set the next-pointer at the end of a node chain of p to val.
7815- Look for optimizable sequences at the same time.
7816- currently only looks for EXACT chains.
07be1b83
YO
7817
7818This is expermental code. The idea is to use this routine to perform
7819in place optimizations on branches and groups as they are constructed,
7820with the long term intention of removing optimization from study_chunk so
7821that it is purely analytical.
7822
7823Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7824to control which is which.
7825
3dab1dad
YO
7826*/
7827/* TODO: All four parms should be const */
07be1b83 7828
3dab1dad
YO
7829STATIC U8
7830S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7831{
7832 dVAR;
7833 register regnode *scan;
07be1b83
YO
7834 U8 exact = PSEUDO;
7835#ifdef EXPERIMENTAL_INPLACESCAN
7836 I32 min = 0;
7837#endif
7838
3dab1dad
YO
7839 GET_RE_DEBUG_FLAGS_DECL;
7840
07be1b83 7841
3dab1dad
YO
7842 if (SIZE_ONLY)
7843 return exact;
7844
7845 /* Find last node. */
7846
7847 scan = p;
7848 for (;;) {
7849 regnode * const temp = regnext(scan);
07be1b83
YO
7850#ifdef EXPERIMENTAL_INPLACESCAN
7851 if (PL_regkind[OP(scan)] == EXACT)
7852 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7853 return EXACT;
7854#endif
3dab1dad
YO
7855 if ( exact ) {
7856 switch (OP(scan)) {
7857 case EXACT:
7858 case EXACTF:
7859 case EXACTFL:
7860 if( exact == PSEUDO )
7861 exact= OP(scan);
07be1b83
YO
7862 else if ( exact != OP(scan) )
7863 exact= 0;
3dab1dad
YO
7864 case NOTHING:
7865 break;
7866 default:
7867 exact= 0;
7868 }
7869 }
7870 DEBUG_PARSE_r({
7871 SV * const mysv=sv_newmortal();
7872 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7873 regprop(RExC_rx, mysv, scan);
eaf3ca90 7874 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7875 SvPV_nolen_const(mysv),
eaf3ca90
YO
7876 REG_NODE_NUM(scan),
7877 reg_name[exact]);
3dab1dad 7878 });
a0d0e21e
LW
7879 if (temp == NULL)
7880 break;
7881 scan = temp;
7882 }
07be1b83
YO
7883 DEBUG_PARSE_r({
7884 SV * const mysv_val=sv_newmortal();
7885 DEBUG_PARSE_MSG("");
7886 regprop(RExC_rx, mysv_val, val);
7887 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7888 SvPV_nolen_const(mysv_val),
7889 REG_NODE_NUM(val),
7890 val - scan
7891 );
7892 });
c277df42
IZ
7893 if (reg_off_by_arg[OP(scan)]) {
7894 ARG_SET(scan, val - scan);
a0ed51b3
LW
7895 }
7896 else {
c277df42
IZ
7897 NEXT_OFF(scan) = val - scan;
7898 }
3dab1dad
YO
7899
7900 return exact;
a687059c 7901}
07be1b83 7902#endif
a687059c
LW
7903
7904/*
a687059c
LW
7905 - regcurly - a little FSA that accepts {\d+,?\d*}
7906 */
79072805 7907STATIC I32
5f66b61c 7908S_regcurly(register const char *s)
a687059c
LW
7909{
7910 if (*s++ != '{')
7911 return FALSE;
f0fcb552 7912 if (!isDIGIT(*s))
a687059c 7913 return FALSE;
f0fcb552 7914 while (isDIGIT(*s))
a687059c
LW
7915 s++;
7916 if (*s == ',')
7917 s++;
f0fcb552 7918 while (isDIGIT(*s))
a687059c
LW
7919 s++;
7920 if (*s != '}')
7921 return FALSE;
7922 return TRUE;
7923}
7924
a687059c
LW
7925
7926/*
fd181c75 7927 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7928 */
7929void
097eb12c 7930Perl_regdump(pTHX_ const regexp *r)
a687059c 7931{
35ff7856 7932#ifdef DEBUGGING
97aff369 7933 dVAR;
c445ea15 7934 SV * const sv = sv_newmortal();
ab3bbdeb 7935 SV *dsv= sv_newmortal();
a687059c 7936
786e8c11 7937 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7938
7939 /* Header fields of interest. */
ab3bbdeb
YO
7940 if (r->anchored_substr) {
7941 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7942 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7943 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7944 "anchored %s%s at %"IVdf" ",
7945 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7946 (IV)r->anchored_offset);
ab3bbdeb
YO
7947 } else if (r->anchored_utf8) {
7948 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7949 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7950 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7951 "anchored utf8 %s%s at %"IVdf" ",
7952 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7953 (IV)r->anchored_offset);
ab3bbdeb
YO
7954 }
7955 if (r->float_substr) {
7956 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7957 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7958 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7959 "floating %s%s at %"IVdf"..%"UVuf" ",
7960 s, RE_SV_TAIL(r->float_substr),
7b0972df 7961 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7962 } else if (r->float_utf8) {
7963 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7964 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7965 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7966 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7967 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7968 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7969 }
33b8afdf 7970 if (r->check_substr || r->check_utf8)
b81d288d 7971 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7972 (const char *)
7973 (r->check_substr == r->float_substr
7974 && r->check_utf8 == r->float_utf8
7975 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7976 if (r->reganch & ROPT_NOSCAN)
7977 PerlIO_printf(Perl_debug_log, " noscan");
7978 if (r->reganch & ROPT_CHECK_ALL)
7979 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7980 if (r->check_substr || r->check_utf8)
c277df42
IZ
7981 PerlIO_printf(Perl_debug_log, ") ");
7982
46fc3d4c 7983 if (r->regstclass) {
32fc9b6a 7984 regprop(r, sv, r->regstclass);
1de06328 7985 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7986 }
774d564b 7987 if (r->reganch & ROPT_ANCH) {
7988 PerlIO_printf(Perl_debug_log, "anchored");
7989 if (r->reganch & ROPT_ANCH_BOL)
7990 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7991 if (r->reganch & ROPT_ANCH_MBOL)
7992 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7993 if (r->reganch & ROPT_ANCH_SBOL)
7994 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7995 if (r->reganch & ROPT_ANCH_GPOS)
7996 PerlIO_printf(Perl_debug_log, "(GPOS)");
7997 PerlIO_putc(Perl_debug_log, ' ');
7998 }
c277df42
IZ
7999 if (r->reganch & ROPT_GPOS_SEEN)
8000 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8001 if (r->reganch & ROPT_SKIP)
760ac839 8002 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8003 if (r->reganch & ROPT_IMPLICIT)
760ac839 8004 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8005 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8006 if (r->reganch & ROPT_EVAL_SEEN)
8007 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8008 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8009#else
96a5add6 8010 PERL_UNUSED_CONTEXT;
65e66c80 8011 PERL_UNUSED_ARG(r);
17c3b450 8012#endif /* DEBUGGING */
a687059c
LW
8013}
8014
8015/*
a0d0e21e
LW
8016- regprop - printable representation of opcode
8017*/
46fc3d4c 8018void
32fc9b6a 8019Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8020{
35ff7856 8021#ifdef DEBUGGING
97aff369 8022 dVAR;
9b155405 8023 register int k;
1de06328 8024 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8025
54dc92de 8026 sv_setpvn(sv, "", 0);
03363afd 8027 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8028 /* It would be nice to FAIL() here, but this may be called from
8029 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8030 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8031 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8032
3dab1dad 8033 k = PL_regkind[OP(o)];
9b155405 8034
2a782b5b 8035 if (k == EXACT) {
396482e1 8036 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8037 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8038 * is a crude hack but it may be the best for now since
8039 * we have no flag "this EXACTish node was UTF-8"
8040 * --jhi */
8041 const char * const s =
ddc5bc0f 8042 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8043 PL_colors[0], PL_colors[1],
8044 PERL_PV_ESCAPE_UNI_DETECT |
8045 PERL_PV_PRETTY_ELIPSES |
8046 PERL_PV_PRETTY_LTGT
8047 );
8048 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8049 } else if (k == TRIE) {
3dab1dad 8050 /* print the details of the trie in dumpuntil instead, as
4f639d21 8051 * prog->data isn't available here */
1de06328
YO
8052 const char op = OP(o);
8053 const I32 n = ARG(o);
8054 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8055 (reg_ac_data *)prog->data->data[n] :
8056 NULL;
8057 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8058 (reg_trie_data*)prog->data->data[n] :
8059 ac->trie;
8060
8061 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8062 DEBUG_TRIE_COMPILE_r(
8063 Perl_sv_catpvf(aTHX_ sv,
8064 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8065 (UV)trie->startstate,
1e2e3d02 8066 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8067 (UV)trie->wordcount,
8068 (UV)trie->minlen,
8069 (UV)trie->maxlen,
8070 (UV)TRIE_CHARCOUNT(trie),
8071 (UV)trie->uniquecharcount
8072 )
8073 );
8074 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8075 int i;
8076 int rangestart = -1;
f46cb337 8077 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8078 Perl_sv_catpvf(aTHX_ sv, "[");
8079 for (i = 0; i <= 256; i++) {
8080 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8081 if (rangestart == -1)
8082 rangestart = i;
8083 } else if (rangestart != -1) {
8084 if (i <= rangestart + 3)
8085 for (; rangestart < i; rangestart++)
8086 put_byte(sv, rangestart);
8087 else {
8088 put_byte(sv, rangestart);
8089 sv_catpvs(sv, "-");
8090 put_byte(sv, i - 1);
8091 }
8092 rangestart = -1;
8093 }
8094 }
8095 Perl_sv_catpvf(aTHX_ sv, "]");
8096 }
8097
a3621e74 8098 } else if (k == CURLY) {
cb434fcc 8099 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8100 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8101 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8102 }
2c2d71f5
JH
8103 else if (k == WHILEM && o->flags) /* Ordinal/of */
8104 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8105 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8106 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8107 else if (k == GOSUB)
6bda09f9 8108 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8109 else if (k == VERB) {
8110 if (!o->flags)
8111 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8112 (SV*)prog->data->data[ ARG( o ) ]);
8113 } else if (k == LOGICAL)
04ebc1ab 8114 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8115 else if (k == ANYOF) {
8116 int i, rangestart = -1;
2d03de9c 8117 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8118
8119 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8120 static const char * const anyofs[] = {
653099ff
GS
8121 "\\w",
8122 "\\W",
8123 "\\s",
8124 "\\S",
8125 "\\d",
8126 "\\D",
8127 "[:alnum:]",
8128 "[:^alnum:]",
8129 "[:alpha:]",
8130 "[:^alpha:]",
8131 "[:ascii:]",
8132 "[:^ascii:]",
8133 "[:ctrl:]",
8134 "[:^ctrl:]",
8135 "[:graph:]",
8136 "[:^graph:]",
8137 "[:lower:]",
8138 "[:^lower:]",
8139 "[:print:]",
8140 "[:^print:]",
8141 "[:punct:]",
8142 "[:^punct:]",
8143 "[:upper:]",
aaa51d5e 8144 "[:^upper:]",
653099ff 8145 "[:xdigit:]",
aaa51d5e
JF
8146 "[:^xdigit:]",
8147 "[:space:]",
8148 "[:^space:]",
8149 "[:blank:]",
8150 "[:^blank:]"
653099ff
GS
8151 };
8152
19860706 8153 if (flags & ANYOF_LOCALE)
396482e1 8154 sv_catpvs(sv, "{loc}");
19860706 8155 if (flags & ANYOF_FOLD)
396482e1 8156 sv_catpvs(sv, "{i}");
653099ff 8157 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8158 if (flags & ANYOF_INVERT)
396482e1 8159 sv_catpvs(sv, "^");
ffc61ed2
JH
8160 for (i = 0; i <= 256; i++) {
8161 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8162 if (rangestart == -1)
8163 rangestart = i;
8164 } else if (rangestart != -1) {
8165 if (i <= rangestart + 3)
8166 for (; rangestart < i; rangestart++)
653099ff 8167 put_byte(sv, rangestart);
ffc61ed2
JH
8168 else {
8169 put_byte(sv, rangestart);
396482e1 8170 sv_catpvs(sv, "-");
ffc61ed2 8171 put_byte(sv, i - 1);
653099ff 8172 }
ffc61ed2 8173 rangestart = -1;
653099ff 8174 }
847a199f 8175 }
ffc61ed2
JH
8176
8177 if (o->flags & ANYOF_CLASS)
bb7a0f54 8178 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8179 if (ANYOF_CLASS_TEST(o,i))
8180 sv_catpv(sv, anyofs[i]);
8181
8182 if (flags & ANYOF_UNICODE)
396482e1 8183 sv_catpvs(sv, "{unicode}");
1aa99e6b 8184 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8185 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8186
8187 {
8188 SV *lv;
32fc9b6a 8189 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8190
ffc61ed2
JH
8191 if (lv) {
8192 if (sw) {
89ebb4a3 8193 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8194
ffc61ed2 8195 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8196 uvchr_to_utf8(s, i);
ffc61ed2 8197
3568d838 8198 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8199 if (rangestart == -1)
8200 rangestart = i;
8201 } else if (rangestart != -1) {
ffc61ed2
JH
8202 if (i <= rangestart + 3)
8203 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8204 const U8 * const e = uvchr_to_utf8(s,rangestart);
8205 U8 *p;
8206 for(p = s; p < e; p++)
ffc61ed2
JH
8207 put_byte(sv, *p);
8208 }
8209 else {
2d03de9c
AL
8210 const U8 *e = uvchr_to_utf8(s,rangestart);
8211 U8 *p;
8212 for (p = s; p < e; p++)
ffc61ed2 8213 put_byte(sv, *p);
396482e1 8214 sv_catpvs(sv, "-");
2d03de9c
AL
8215 e = uvchr_to_utf8(s, i-1);
8216 for (p = s; p < e; p++)
1df70142 8217 put_byte(sv, *p);
ffc61ed2
JH
8218 }
8219 rangestart = -1;
8220 }
19860706 8221 }
ffc61ed2 8222
396482e1 8223 sv_catpvs(sv, "..."); /* et cetera */
19860706 8224 }
fde631ed 8225
ffc61ed2 8226 {
2e0de35c 8227 char *s = savesvpv(lv);
c445ea15 8228 char * const origs = s;
b81d288d 8229
3dab1dad
YO
8230 while (*s && *s != '\n')
8231 s++;
b81d288d 8232
ffc61ed2 8233 if (*s == '\n') {
2d03de9c 8234 const char * const t = ++s;
ffc61ed2
JH
8235
8236 while (*s) {
8237 if (*s == '\n')
8238 *s = ' ';
8239 s++;
8240 }
8241 if (s[-1] == ' ')
8242 s[-1] = 0;
8243
8244 sv_catpv(sv, t);
fde631ed 8245 }
b81d288d 8246
ffc61ed2 8247 Safefree(origs);
fde631ed
JH
8248 }
8249 }
653099ff 8250 }
ffc61ed2 8251
653099ff
GS
8252 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8253 }
9b155405 8254 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8255 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8256#else
96a5add6 8257 PERL_UNUSED_CONTEXT;
65e66c80
SP
8258 PERL_UNUSED_ARG(sv);
8259 PERL_UNUSED_ARG(o);
f9049ba1 8260 PERL_UNUSED_ARG(prog);
17c3b450 8261#endif /* DEBUGGING */
35ff7856 8262}
a687059c 8263
cad2e5aa
JH
8264SV *
8265Perl_re_intuit_string(pTHX_ regexp *prog)
8266{ /* Assume that RE_INTUIT is set */
97aff369 8267 dVAR;
a3621e74 8268 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8269 PERL_UNUSED_CONTEXT;
8270
a3621e74 8271 DEBUG_COMPILE_r(
cfd0369c 8272 {
2d03de9c 8273 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8274 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8275
8276 if (!PL_colorset) reginitcolors();
8277 PerlIO_printf(Perl_debug_log,
a0288114 8278 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8279 PL_colors[4],
8280 prog->check_substr ? "" : "utf8 ",
8281 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8282 s,
8283 PL_colors[1],
8284 (strlen(s) > 60 ? "..." : ""));
8285 } );
8286
33b8afdf 8287 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8288}
8289
84da74a7
YO
8290/*
8291 pregfree - free a regexp
8292
8293 See regdupe below if you change anything here.
8294*/
8295
2b69d0c2 8296void
864dbfa3 8297Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8298{
27da23d5 8299 dVAR;
0df25f3d 8300
fc32ee4a 8301 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8302
7821416a
IZ
8303 if (!r || (--r->refcnt > 0))
8304 return;
ab3bbdeb 8305 DEBUG_COMPILE_r({
0df25f3d
YO
8306 if (!PL_colorset)
8307 reginitcolors();
ab3bbdeb
YO
8308 if (RX_DEBUG(r)){
8309 SV *dsv= sv_newmortal();
8310 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8311 dsv, r->precomp, r->prelen, 60);
8312 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8313 PL_colors[4],PL_colors[5],s);
8314 }
9e55ce06 8315 });
cad2e5aa 8316
43c5f42d
NC
8317 /* gcov results gave these as non-null 100% of the time, so there's no
8318 optimisation in checking them before calling Safefree */
8319 Safefree(r->precomp);
8320 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8321 RX_MATCH_COPY_FREE(r);
f8c7b90f 8322#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8323 if (r->saved_copy)
8324 SvREFCNT_dec(r->saved_copy);
8325#endif
a193d654
GS
8326 if (r->substrs) {
8327 if (r->anchored_substr)
8328 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8329 if (r->anchored_utf8)
8330 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8331 if (r->float_substr)
8332 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8333 if (r->float_utf8)
8334 SvREFCNT_dec(r->float_utf8);
2779dcf1 8335 Safefree(r->substrs);
a193d654 8336 }
81714fb9
YO
8337 if (r->paren_names)
8338 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8339 if (r->data) {
8340 int n = r->data->count;
f3548bdc
DM
8341 PAD* new_comppad = NULL;
8342 PAD* old_comppad;
4026c95a 8343 PADOFFSET refcnt;
dfad63ad 8344
c277df42 8345 while (--n >= 0) {
261faec3 8346 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8347 switch (r->data->what[n]) {
8348 case 's':
81714fb9 8349 case 'S':
c277df42
IZ
8350 SvREFCNT_dec((SV*)r->data->data[n]);
8351 break;
653099ff
GS
8352 case 'f':
8353 Safefree(r->data->data[n]);
8354 break;
dfad63ad
HS
8355 case 'p':
8356 new_comppad = (AV*)r->data->data[n];
8357 break;
c277df42 8358 case 'o':
dfad63ad 8359 if (new_comppad == NULL)
cea2e8a9 8360 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8361 PAD_SAVE_LOCAL(old_comppad,
8362 /* Watch out for global destruction's random ordering. */
c445ea15 8363 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8364 );
b34c0dd4 8365 OP_REFCNT_LOCK;
4026c95a
SH
8366 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8367 OP_REFCNT_UNLOCK;
8368 if (!refcnt)
9b978d73 8369 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8370
f3548bdc 8371 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8372 SvREFCNT_dec((SV*)new_comppad);
8373 new_comppad = NULL;
c277df42
IZ
8374 break;
8375 case 'n':
9e55ce06 8376 break;
07be1b83 8377 case 'T':
be8e71aa
YO
8378 { /* Aho Corasick add-on structure for a trie node.
8379 Used in stclass optimization only */
07be1b83
YO
8380 U32 refcount;
8381 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8382 OP_REFCNT_LOCK;
8383 refcount = --aho->refcount;
8384 OP_REFCNT_UNLOCK;
8385 if ( !refcount ) {
8386 Safefree(aho->states);
8387 Safefree(aho->fail);
8388 aho->trie=NULL; /* not necessary to free this as it is
8389 handled by the 't' case */
8390 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8391 Safefree(r->regstclass);
07be1b83
YO
8392 }
8393 }
8394 break;
a3621e74 8395 case 't':
07be1b83 8396 {
be8e71aa 8397 /* trie structure. */
07be1b83
YO
8398 U32 refcount;
8399 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8400 OP_REFCNT_LOCK;
8401 refcount = --trie->refcount;
8402 OP_REFCNT_UNLOCK;
8403 if ( !refcount ) {
8404 Safefree(trie->charmap);
8405 if (trie->widecharmap)
8406 SvREFCNT_dec((SV*)trie->widecharmap);
8407 Safefree(trie->states);
8408 Safefree(trie->trans);
8409 if (trie->bitmap)
8410 Safefree(trie->bitmap);
8411 if (trie->wordlen)
8412 Safefree(trie->wordlen);
786e8c11
YO
8413 if (trie->jump)
8414 Safefree(trie->jump);
8415 if (trie->nextword)
8416 Safefree(trie->nextword);
a3621e74 8417#ifdef DEBUGGING
be8e71aa
YO
8418 if (RX_DEBUG(r)) {
8419 if (trie->words)
8420 SvREFCNT_dec((SV*)trie->words);
8421 if (trie->revcharmap)
8422 SvREFCNT_dec((SV*)trie->revcharmap);
8423 }
a3621e74 8424#endif
07be1b83 8425 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8426 }
07be1b83
YO
8427 }
8428 break;
c277df42 8429 default:
830247a4 8430 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8431 }
8432 }
8433 Safefree(r->data->what);
8434 Safefree(r->data);
a0d0e21e
LW
8435 }
8436 Safefree(r->startp);
8437 Safefree(r->endp);
8438 Safefree(r);
a687059c 8439}
c277df42 8440
84da74a7
YO
8441#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8442#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8443#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8444#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8445
8446/*
8447 regdupe - duplicate a regexp.
8448
8449 This routine is called by sv.c's re_dup and is expected to clone a
8450 given regexp structure. It is a no-op when not under USE_ITHREADS.
8451 (Originally this *was* re_dup() for change history see sv.c)
8452
8453 See pregfree() above if you change anything here.
8454*/
a3c0e9ca 8455#if defined(USE_ITHREADS)
84da74a7
YO
8456regexp *
8457Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8458{
84da74a7
YO
8459 dVAR;
8460 REGEXP *ret;
8461 int i, len, npar;
8462 struct reg_substr_datum *s;
8463
8464 if (!r)
8465 return (REGEXP *)NULL;
8466
8467 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8468 return ret;
8469
8470 len = r->offsets[0];
8471 npar = r->nparens+1;
8472
8473 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8474 Copy(r->program, ret->program, len+1, regnode);
8475
8476 Newx(ret->startp, npar, I32);
8477 Copy(r->startp, ret->startp, npar, I32);
8478 Newx(ret->endp, npar, I32);
8479 Copy(r->startp, ret->startp, npar, I32);
8480
8481 Newx(ret->substrs, 1, struct reg_substr_data);
8482 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8483 s->min_offset = r->substrs->data[i].min_offset;
8484 s->max_offset = r->substrs->data[i].max_offset;
8485 s->end_shift = r->substrs->data[i].end_shift;
8486 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8487 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8488 }
8489
8490 ret->regstclass = NULL;
8491 if (r->data) {
8492 struct reg_data *d;
8493 const int count = r->data->count;
8494 int i;
8495
8496 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8497 char, struct reg_data);
8498 Newx(d->what, count, U8);
8499
8500 d->count = count;
8501 for (i = 0; i < count; i++) {
8502 d->what[i] = r->data->what[i];
8503 switch (d->what[i]) {
e2e6a0f1 8504 /* legal options are one of: sSfpont
84da74a7
YO
8505 see also regcomp.h and pregfree() */
8506 case 's':
81714fb9 8507 case 'S':
84da74a7
YO
8508 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8509 break;
8510 case 'p':
8511 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8512 break;
8513 case 'f':
8514 /* This is cheating. */
8515 Newx(d->data[i], 1, struct regnode_charclass_class);
8516 StructCopy(r->data->data[i], d->data[i],
8517 struct regnode_charclass_class);
8518 ret->regstclass = (regnode*)d->data[i];
8519 break;
8520 case 'o':
8521 /* Compiled op trees are readonly, and can thus be
8522 shared without duplication. */
8523 OP_REFCNT_LOCK;
8524 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8525 OP_REFCNT_UNLOCK;
8526 break;
8527 case 'n':
8528 d->data[i] = r->data->data[i];
8529 break;
8530 case 't':
8531 d->data[i] = r->data->data[i];
8532 OP_REFCNT_LOCK;
8533 ((reg_trie_data*)d->data[i])->refcount++;
8534 OP_REFCNT_UNLOCK;
8535 break;
8536 case 'T':
8537 d->data[i] = r->data->data[i];
8538 OP_REFCNT_LOCK;
8539 ((reg_ac_data*)d->data[i])->refcount++;
8540 OP_REFCNT_UNLOCK;
8541 /* Trie stclasses are readonly and can thus be shared
8542 * without duplication. We free the stclass in pregfree
8543 * when the corresponding reg_ac_data struct is freed.
8544 */
8545 ret->regstclass= r->regstclass;
8546 break;
8547 default:
8548 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8549 }
8550 }
8551
8552 ret->data = d;
8553 }
8554 else
8555 ret->data = NULL;
8556
8557 Newx(ret->offsets, 2*len+1, U32);
8558 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8559
8560 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8561 ret->refcnt = r->refcnt;
8562 ret->minlen = r->minlen;
8563 ret->prelen = r->prelen;
8564 ret->nparens = r->nparens;
8565 ret->lastparen = r->lastparen;
8566 ret->lastcloseparen = r->lastcloseparen;
8567 ret->reganch = r->reganch;
8568
8569 ret->sublen = r->sublen;
8570
f9f4320a 8571 ret->engine = r->engine;
81714fb9
YO
8572
8573 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8574
84da74a7
YO
8575 if (RX_MATCH_COPIED(ret))
8576 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8577 else
8578 ret->subbeg = NULL;
8579#ifdef PERL_OLD_COPY_ON_WRITE
8580 ret->saved_copy = NULL;
8581#endif
8582
8583 ptr_table_store(PL_ptr_table, r, ret);
8584 return ret;
84da74a7 8585}
a3c0e9ca 8586#endif
84da74a7 8587
76234dfb 8588#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8589/*
8590 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8591 */
8592regnode *
864dbfa3 8593Perl_regnext(pTHX_ register regnode *p)
c277df42 8594{
97aff369 8595 dVAR;
c277df42
IZ
8596 register I32 offset;
8597
3280af22 8598 if (p == &PL_regdummy)
c277df42
IZ
8599 return(NULL);
8600
8601 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8602 if (offset == 0)
8603 return(NULL);
8604
c277df42 8605 return(p+offset);
c277df42 8606}
76234dfb 8607#endif
c277df42 8608
01f988be 8609STATIC void
cea2e8a9 8610S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8611{
8612 va_list args;
8613 STRLEN l1 = strlen(pat1);
8614 STRLEN l2 = strlen(pat2);
8615 char buf[512];
06bf62c7 8616 SV *msv;
73d840c0 8617 const char *message;
c277df42
IZ
8618
8619 if (l1 > 510)
8620 l1 = 510;
8621 if (l1 + l2 > 510)
8622 l2 = 510 - l1;
8623 Copy(pat1, buf, l1 , char);
8624 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8625 buf[l1 + l2] = '\n';
8626 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8627#ifdef I_STDARG
8628 /* ANSI variant takes additional second argument */
c277df42 8629 va_start(args, pat2);
8736538c
AS
8630#else
8631 va_start(args);
8632#endif
5a844595 8633 msv = vmess(buf, &args);
c277df42 8634 va_end(args);
cfd0369c 8635 message = SvPV_const(msv,l1);
c277df42
IZ
8636 if (l1 > 512)
8637 l1 = 512;
8638 Copy(message, buf, l1 , char);
197cf9b9 8639 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8640 Perl_croak(aTHX_ "%s", buf);
c277df42 8641}
a0ed51b3
LW
8642
8643/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8644
76234dfb 8645#ifndef PERL_IN_XSUB_RE
a0ed51b3 8646void
864dbfa3 8647Perl_save_re_context(pTHX)
b81d288d 8648{
97aff369 8649 dVAR;
1ade1aa1
NC
8650
8651 struct re_save_state *state;
8652
8653 SAVEVPTR(PL_curcop);
8654 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8655
8656 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8657 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8658 SSPUSHINT(SAVEt_RE_STATE);
8659
46ab3289 8660 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8661
a0ed51b3 8662 PL_reg_start_tmp = 0;
a0ed51b3 8663 PL_reg_start_tmpl = 0;
c445ea15 8664 PL_reg_oldsaved = NULL;
a5db57d6 8665 PL_reg_oldsavedlen = 0;
a5db57d6 8666 PL_reg_maxiter = 0;
a5db57d6 8667 PL_reg_leftiter = 0;
c445ea15 8668 PL_reg_poscache = NULL;
a5db57d6 8669 PL_reg_poscache_size = 0;
1ade1aa1
NC
8670#ifdef PERL_OLD_COPY_ON_WRITE
8671 PL_nrs = NULL;
8672#endif
ada6e8a9 8673
c445ea15
AL
8674 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8675 if (PL_curpm) {
8676 const REGEXP * const rx = PM_GETRE(PL_curpm);
8677 if (rx) {
1df70142 8678 U32 i;
ada6e8a9 8679 for (i = 1; i <= rx->nparens; i++) {
1df70142 8680 char digits[TYPE_CHARS(long)];
d9fad198 8681 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8682 GV *const *const gvp
8683 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8684
b37c2d43
AL
8685 if (gvp) {
8686 GV * const gv = *gvp;
8687 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8688 save_scalar(gv);
49f27e4b 8689 }
ada6e8a9
AMS
8690 }
8691 }
8692 }
a0ed51b3 8693}
76234dfb 8694#endif
51371543 8695
51371543 8696static void
acfe0abc 8697clear_re(pTHX_ void *r)
51371543 8698{
97aff369 8699 dVAR;
51371543
GS
8700 ReREFCNT_dec((regexp *)r);
8701}
ffbc6a93 8702
a28509cc
AL
8703#ifdef DEBUGGING
8704
8705STATIC void
8706S_put_byte(pTHX_ SV *sv, int c)
8707{
8708 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8709 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8710 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8711 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8712 else
8713 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8714}
8715
786e8c11 8716
3dab1dad
YO
8717#define CLEAR_OPTSTART \
8718 if (optstart) STMT_START { \
07be1b83 8719 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8720 optstart=NULL; \
8721 } STMT_END
8722
786e8c11 8723#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8724
b5a2f8d8
NC
8725STATIC const regnode *
8726S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8727 const regnode *last, const regnode *plast,
8728 SV* sv, I32 indent, U32 depth)
a28509cc 8729{
97aff369 8730 dVAR;
786e8c11 8731 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8732 register const regnode *next;
3dab1dad
YO
8733 const regnode *optstart= NULL;
8734 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8735
786e8c11
YO
8736#ifdef DEBUG_DUMPUNTIL
8737 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8738 last ? last-start : 0,plast ? plast-start : 0);
8739#endif
8740
8741 if (plast && plast < last)
8742 last= plast;
8743
8744 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8745 /* While that wasn't END last time... */
8746
8747 NODE_ALIGN(node);
8748 op = OP(node);
8749 if (op == CLOSE)
786e8c11 8750 indent--;
b5a2f8d8 8751 next = regnext((regnode *)node);
07be1b83 8752
a28509cc 8753 /* Where, what. */
8e11feef 8754 if (OP(node) == OPTIMIZED) {
e68ec53f 8755 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8756 optstart = node;
3dab1dad 8757 else
8e11feef 8758 goto after_print;
3dab1dad
YO
8759 } else
8760 CLEAR_OPTSTART;
07be1b83 8761
32fc9b6a 8762 regprop(r, sv, node);
a28509cc 8763 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8764 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8765
8766 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8767 if (next == NULL) /* Next ptr. */
8768 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8769 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8770 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8771 else
8772 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8773
1de06328 8774 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8775 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8776 }
8777
a28509cc
AL
8778 after_print:
8779 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8780 assert(next);
8781 {
8782 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8783 ? regnext((regnode *)next)
8784 : next);
be8e71aa
YO
8785 if (last && nnode > last)
8786 nnode = last;
786e8c11 8787 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8788 }
a28509cc
AL
8789 }
8790 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8791 assert(next);
786e8c11 8792 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8793 }
8794 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 8795 const regnode *this_trie = node;
1de06328 8796 const char op = OP(node);
a28509cc 8797 const I32 n = ARG(node);
1de06328
YO
8798 const reg_ac_data * const ac = op>=AHOCORASICK ?
8799 (reg_ac_data *)r->data->data[n] :
8800 NULL;
8801 const reg_trie_data * const trie = op<AHOCORASICK ?
8802 (reg_trie_data*)r->data->data[n] :
8803 ac->trie;
786e8c11 8804 const regnode *nextbranch= NULL;
a28509cc 8805 I32 word_idx;
1de06328 8806 sv_setpvn(sv, "", 0);
786e8c11 8807 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8808 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8809
8810 PerlIO_printf(Perl_debug_log, "%*s%s ",
8811 (int)(2*(indent+3)), "",
8812 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8813 PL_colors[0], PL_colors[1],
8814 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8815 PERL_PV_PRETTY_ELIPSES |
7f69552c 8816 PERL_PV_PRETTY_LTGT
786e8c11
YO
8817 )
8818 : "???"
8819 );
8820 if (trie->jump) {
40d049e4 8821 U16 dist= trie->jump[word_idx+1];
7f69552c 8822 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 8823 (dist ? this_trie + dist : next) - start);
786e8c11
YO
8824 if (dist) {
8825 if (!nextbranch)
24b23f37 8826 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
8827 DUMPUNTIL(this_trie + dist, nextbranch);
8828 }
786e8c11
YO
8829 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8830 nextbranch= regnext((regnode *)nextbranch);
8831 } else {
8832 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8833 }
786e8c11
YO
8834 }
8835 if (last && next > last)
8836 node= last;
8837 else
8838 node= next;
a28509cc 8839 }
786e8c11
YO
8840 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8841 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8842 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8843 }
8844 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8845 assert(next);
786e8c11 8846 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8847 }
8848 else if ( op == PLUS || op == STAR) {
786e8c11 8849 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8850 }
8851 else if (op == ANYOF) {
8852 /* arglen 1 + class block */
8853 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8854 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8855 node = NEXTOPER(node);
8856 }
8857 else if (PL_regkind[(U8)op] == EXACT) {
8858 /* Literal string, where present. */
8859 node += NODE_SZ_STR(node) - 1;
8860 node = NEXTOPER(node);
8861 }
8862 else {
8863 node = NEXTOPER(node);
8864 node += regarglen[(U8)op];
8865 }
8866 if (op == CURLYX || op == OPEN)
786e8c11 8867 indent++;
a28509cc 8868 else if (op == WHILEM)
786e8c11 8869 indent--;
a28509cc 8870 }
3dab1dad 8871 CLEAR_OPTSTART;
786e8c11
YO
8872#ifdef DEBUG_DUMPUNTIL
8873 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8874#endif
1de06328 8875 return node;
a28509cc
AL
8876}
8877
8878#endif /* DEBUGGING */
8879
241d1a3b
NC
8880/*
8881 * Local variables:
8882 * c-indentation-style: bsd
8883 * c-basic-offset: 4
8884 * indent-tabs-mode: t
8885 * End:
8886 *
37442d52
RGS
8887 * ex: set ts=8 sts=4 sw=4 noet:
8888 */