This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Reinstate warning 'Useless use of "re" pragma'
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
e2e6a0f1 117 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
118 I32 extralen;
119 I32 seen_zerolen;
120 I32 seen_evals;
40d049e4
YO
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
1aa99e6b 124 I32 utf8;
6bda09f9 125 HV *charnames; /* cache of named sequences */
81714fb9 126 HV *paren_names; /* Paren names */
40d049e4
YO
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
3dab1dad 133#ifdef DEBUGGING
be8e71aa 134 const char *lastparse;
3dab1dad
YO
135 I32 lastnum;
136#define RExC_lastparse (pRExC_state->lastparse)
137#define RExC_lastnum (pRExC_state->lastnum)
138#endif
830247a4
IZ
139} RExC_state_t;
140
e2509266 141#define RExC_flags (pRExC_state->flags)
830247a4
IZ
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
fac92740 144#define RExC_start (pRExC_state->start)
830247a4
IZ
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 149#define RExC_emit (pRExC_state->emit)
fac92740 150#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
e2e6a0f1 156#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
157#define RExC_extralen (pRExC_state->extralen)
158#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 160#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 161#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
162#define RExC_open_parens (pRExC_state->open_parens)
163#define RExC_close_parens (pRExC_state->close_parens)
164#define RExC_opend (pRExC_state->opend)
81714fb9 165#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
166#define RExC_recurse (pRExC_state->recurse)
167#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 168
a687059c
LW
169#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
a687059c 172
35c8bce7
LW
173#ifdef SPSTART
174#undef SPSTART /* dratted cpp namespace... */
175#endif
a687059c
LW
176/*
177 * Flags to be passed up and down.
178 */
a687059c 179#define WORST 0 /* Worst case. */
821b33a5 180#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
181#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182#define SPSTART 0x4 /* Starts with * or +. */
183#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 184
3dab1dad
YO
185#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
186
07be1b83
YO
187/* whether trie related optimizations are enabled */
188#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189#define TRIE_STUDY_OPT
786e8c11 190#define FULL_TRIE_STUDY
07be1b83
YO
191#define TRIE_STCLASS
192#endif
1de06328
YO
193
194
40d049e4
YO
195
196#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197#define PBITVAL(paren) (1 << ((paren) & 7))
198#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
201
202
1de06328
YO
203/* About scan_data_t.
204
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
211 in the pattern:
212
213 /FOO[xX]A.*B[xX]BAR/
214
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
218
219 The strings can be composites, for instance
220
221 /(f)(o)(o)/
222
223 will result in a composite fixed substring 'foo'.
224
225 For each string some basic information is maintained:
226
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
233 found.
234
235 - max_offset
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
239
240 - minlenp
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
244 involved. Consider
245
246 /(?=FOO).*F/
247
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
258
259 - lookbehind
260
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
271 associated string.
272
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
275
276*/
2c2d71f5
JH
277
278typedef struct scan_data_t {
1de06328
YO
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
2c2d71f5
JH
281 I32 pos_min;
282 I32 pos_delta;
283 SV *last_found;
1de06328 284 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
285 I32 last_start_min;
286 I32 last_start_max;
1de06328
YO
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
297 I32 flags;
298 I32 whilem_c;
cb434fcc 299 I32 *last_closep;
653099ff 300 struct regnode_charclass_class *start_class;
2c2d71f5
JH
301} scan_data_t;
302
a687059c 303/*
e50aee73 304 * Forward declarations for pregcomp()'s friends.
a687059c 305 */
a0d0e21e 306
27da23d5 307static const scan_data_t zero_scan_data =
1de06328 308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
309
310#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
311#define SF_BEFORE_SEOL 0x0001
312#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
313#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
315
09b7f37c
CB
316#ifdef NO_UNARY_PLUS
317# define SF_FIX_SHIFT_EOL (0+2)
318# define SF_FL_SHIFT_EOL (0+4)
319#else
320# define SF_FIX_SHIFT_EOL (+2)
321# define SF_FL_SHIFT_EOL (+4)
322#endif
c277df42
IZ
323
324#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
326
327#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
329#define SF_IS_INF 0x0040
330#define SF_HAS_PAR 0x0080
331#define SF_IN_PAR 0x0100
332#define SF_HAS_EVAL 0x0200
333#define SCF_DO_SUBSTR 0x0400
653099ff
GS
334#define SCF_DO_STCLASS_AND 0x0800
335#define SCF_DO_STCLASS_OR 0x1000
336#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 337#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 338
786e8c11 339#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 340#define SCF_SEEN_ACCEPT 0x8000
07be1b83 341
eb160463 342#define UTF (RExC_utf8 != 0)
e2509266
JH
343#define LOC ((RExC_flags & PMf_LOCALE) != 0)
344#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 345
ffc61ed2 346#define OOB_UNICODE 12345678
93733859 347#define OOB_NAMEDCLASS -1
b8c5462f 348
a0ed51b3
LW
349#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
351
8615cb43 352
b45f050a
JF
353/* length of regex to show in messages that don't mark a position within */
354#define RegexLengthToShowInErrorMessages 127
355
356/*
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
360 */
7253e4e3
RK
361#define MARKER1 "<-- HERE" /* marker as it appears in the description */
362#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 363
7253e4e3 364#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
365
366/*
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
369 * "...".
370 */
ccb2c380 371#define FAIL(msg) STMT_START { \
bfed75c6 372 const char *ellipses = ""; \
ccb2c380
MP
373 IV len = RExC_end - RExC_precomp; \
374 \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
380 ellipses = "..."; \
381 } \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
384} STMT_END
8615cb43 385
b45f050a 386/*
b45f050a
JF
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
388 */
ccb2c380 389#define Simple_vFAIL(m) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
397 */
ccb2c380
MP
398#define vFAIL(m) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL(m); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Like Simple_vFAIL(), but accepts two arguments.
406 */
ccb2c380 407#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 408 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
415 */
ccb2c380
MP
416#define vFAIL2(m,a1) STMT_START { \
417 if (!SIZE_ONLY) \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
420} STMT_END
b45f050a
JF
421
422
423/*
424 * Like Simple_vFAIL(), but accepts three arguments.
425 */
ccb2c380 426#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
434 */
ccb2c380
MP
435#define vFAIL3(m,a1,a2) STMT_START { \
436 if (!SIZE_ONLY) \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts four arguments.
443 */
ccb2c380 444#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a 449
ccb2c380 450#define vWARN(loc,m) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARNdep(loc,m) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
462
463
464#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
9d1d55b5 487
8615cb43 488
cd439c50 489/* Allow for side effects in s */
ccb2c380
MP
490#define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
492} STMT_END
cd439c50 493
fac92740
MJD
494/* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
07be1b83 498 * Position is 1 indexed.
fac92740
MJD
499 */
500
ccb2c380
MP
501#define Set_Node_Offset_To_R(node,byte) STMT_START { \
502 if (! SIZE_ONLY) { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 504 __LINE__, (node), (int)(byte))); \
ccb2c380 505 if((node) < 0) { \
551405c4 506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
507 } else { \
508 RExC_offsets[2*(node)-1] = (byte); \
509 } \
510 } \
511} STMT_END
512
513#define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
516
517#define Set_Node_Length_To_R(node,len) STMT_START { \
518 if (! SIZE_ONLY) { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 520 __LINE__, (int)(node), (int)(len))); \
ccb2c380 521 if((node) < 0) { \
551405c4 522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
523 } else { \
524 RExC_offsets[2*(node)] = (len); \
525 } \
526 } \
527} STMT_END
528
529#define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532#define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
534
535/* Get offsets and lengths */
536#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
538
07be1b83
YO
539#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
542} STMT_END
543
544
545#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546#define EXPERIMENTAL_INPLACESCAN
547#endif
548
1de06328 549#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 550DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
560 ); \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
578 ); \
579 PerlIO_printf(Perl_debug_log,"\n"); \
580});
581
acfe0abc 582static void clear_re(pTHX_ void *r);
4327152a 583
653099ff 584/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 585 Update the longest found anchored substring and the longest found
653099ff
GS
586 floating substrings if needed. */
587
4327152a 588STATIC void
1de06328 589S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 590{
e1ec3a88
AL
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 593 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 594
c277df42 595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 596 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
b81d288d 600 data->flags
c277df42
IZ
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
602 else
603 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
a0ed51b3
LW
606 }
607 else {
c277df42 608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
609 data->offset_float_max = (l
610 ? data->last_start_max
c277df42 611 : data->pos_min + data->pos_delta);
9051bda5
HS
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
c277df42 614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
c277df42
IZ
621 }
622 }
623 SvCUR_set(data->last_found, 0);
0eda9292 624 {
a28509cc 625 SV * const sv = data->last_found;
097eb12c
AL
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
628 if (mg)
629 mg->mg_len = 0;
630 }
0eda9292 631 }
c277df42
IZ
632 data->last_end = -1;
633 data->flags &= ~SF_BEFORE_EOL;
1de06328 634 DEBUG_STUDYDATA(data,0);
c277df42
IZ
635}
636
653099ff
GS
637/* Can match anything (initialization) */
638STATIC void
097eb12c 639S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 640{
653099ff 641 ANYOF_CLASS_ZERO(cl);
f8bef550 642 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
644 if (LOC)
645 cl->flags |= ANYOF_LOCALE;
646}
647
648/* Can match anything (initialization) */
649STATIC int
5f66b61c 650S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
651{
652 int value;
653
aaa51d5e 654 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
656 return 1;
1aa99e6b
IH
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
658 return 0;
10edeb5d 659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 660 return 0;
653099ff
GS
661 return 1;
662}
663
664/* Can match anything (initialization) */
665STATIC void
097eb12c 666S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 667{
8ecf7187 668 Zero(cl, 1, struct regnode_charclass_class);
653099ff 669 cl->type = ANYOF;
830247a4 670 cl_anything(pRExC_state, cl);
653099ff
GS
671}
672
673STATIC void
097eb12c 674S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 675{
8ecf7187 676 Zero(cl, 1, struct regnode_charclass_class);
653099ff 677 cl->type = ANYOF;
830247a4 678 cl_anything(pRExC_state, cl);
653099ff
GS
679 if (LOC)
680 cl->flags |= ANYOF_LOCALE;
681}
682
683/* 'And' a given class with another one. Can create false positives */
684/* We assume that cl is not inverted */
685STATIC void
5f66b61c 686S_cl_and(struct regnode_charclass_class *cl,
a28509cc 687 const struct regnode_charclass_class *and_with)
653099ff 688{
40d049e4
YO
689
690 assert(and_with->type == ANYOF);
653099ff
GS
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
696 int i;
697
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
701 else
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
1aa99e6b 707
14ebb1a2
JH
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
713 }
14ebb1a2
JH
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 716 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 719 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
720}
721
722/* 'OR' a given class with another one. Can create false positives */
723/* We assume that cl is not inverted */
724STATIC void
097eb12c 725S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 726{
653099ff
GS
727 if (or_with->flags & ANYOF_INVERT) {
728 /* We do not use
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
736 */
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
740 int i;
741
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
745 else {
830247a4 746 cl_anything(pRExC_state, cl);
653099ff
GS
747 }
748 } else {
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 751 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
752 || (cl->flags & ANYOF_FOLD)) ) {
753 int i;
754
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
762 }
763 }
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 765 cl_anything(pRExC_state, cl);
653099ff
GS
766 }
767 }
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
770
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
775 }
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
779 }
653099ff
GS
780}
781
a3621e74
YO
782#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
786
3dab1dad
YO
787
788#ifdef DEBUGGING
07be1b83 789/*
3dab1dad
YO
790 dump_trie(trie)
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
793
794 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
798
3dab1dad
YO
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
07be1b83
YO
801
802*/
3dab1dad
YO
803
804/*
805 dump_trie(trie)
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
808*/
809
810STATIC void
811S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
812{
813 U32 state;
ab3bbdeb
YO
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
816 GET_RE_DEBUG_FLAGS_DECL;
817
ab3bbdeb 818
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
822
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 825 if ( tmp ) {
ab3bbdeb
YO
826 PerlIO_printf( Perl_debug_log, "%*s",
827 colwidth,
ddc5bc0f 828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
832 )
833 );
3dab1dad
YO
834 }
835 }
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
838
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
841 PerlIO_printf( Perl_debug_log, "\n");
842
1e2e3d02 843 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 844 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
845
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
847
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
850 } else {
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
852 }
853
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
855
856 if ( base ) {
857 U32 ofs = 0;
858
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
862 ofs++;
863
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
865
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
870 {
ab3bbdeb
YO
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
872 colwidth,
3dab1dad
YO
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
874 } else {
ab3bbdeb 875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
876 }
877 }
878
879 PerlIO_printf( Perl_debug_log, "]");
880
881 }
882 PerlIO_printf( Perl_debug_log, "\n" );
883 }
884}
885/*
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
891*/
892STATIC void
893S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
894{
895 U32 state;
ab3bbdeb
YO
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
ab3bbdeb
YO
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
3dab1dad
YO
903
904 for( state=1 ; state < next_alloc ; state ++ ) {
905 U16 charid;
906
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
911 } else {
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
914 );
915 }
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
918 if ( tmp ) {
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
920 colwidth,
ddc5bc0f 921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
925 ) ,
1e2e3d02
YO
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
928 );
929 if (!(charid % 10))
664e119d
RGS
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
1e2e3d02 932 }
ab3bbdeb
YO
933 }
934 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
935 }
936}
937
938/*
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
944*/
945STATIC void
946S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
947{
948 U32 state;
949 U16 charid;
ab3bbdeb
YO
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
952 GET_RE_DEBUG_FLAGS_DECL;
953
954 /*
955 print out the table precompression so that we can do a visual check
956 that they are identical.
957 */
958
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 963 if ( tmp ) {
ab3bbdeb
YO
964 PerlIO_printf( Perl_debug_log, "%*s",
965 colwidth,
ddc5bc0f 966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
970 )
971 );
3dab1dad
YO
972 }
973 }
974
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
976
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
979 }
980
981 PerlIO_printf( Perl_debug_log, "\n" );
982
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
984
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
988
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
991 if (v)
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
993 else
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
995 }
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
998 } else {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 }
1002 }
07be1b83 1003}
3dab1dad
YO
1004
1005#endif
1006
786e8c11
YO
1007/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
3dab1dad 1017
786e8c11 1018Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1019
786e8c11
YO
1020A trie is an N'ary tree where the branches are determined by digital
1021decomposition of the key. IE, at the root node you look up the 1st character and
1022follow that branch repeat until you find the end of the branches. Nodes can be
1023marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1024
786e8c11 1025 /he|she|his|hers/
72f13be8 1026
786e8c11
YO
1027would convert into the following structure. Numbers represent states, letters
1028following numbers represent valid transitions on the letter from that state, if
1029the number is in square brackets it represents an accepting state, otherwise it
1030will be in parenthesis.
07be1b83 1031
786e8c11
YO
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1033 | |
1034 | (2)
1035 | |
1036 (1) +-i->(6)-+-s->[7]
1037 |
1038 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1039
786e8c11
YO
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1041
1042This shows that when matching against the string 'hers' we will begin at state 1
1043read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046single traverse. We store a mapping from accepting to state to which word was
1047matched, and then when we have multiple possibilities we try to complete the
1048rest of the regex in the order in which they occured in the alternation.
1049
1050The only prior NFA like behaviour that would be changed by the TRIE support is
1051the silent ignoring of duplicate alternations which are of the form:
1052
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1054
1055Thus EVAL blocks follwing a trie may be called a different number of times with
1056and without the optimisation. With the optimisations dupes will be silently
1057ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058the following demonstrates:
1059
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1061
1062which prints out 'word' three times, but
1063
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1065
1066which doesnt print it out at all. This is due to other optimisations kicking in.
1067
1068Example of what happens on a structural level:
1069
1070The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1071
1072 1: CURLYM[1] {1,32767}(18)
1073 5: BRANCH(8)
1074 6: EXACT <ac>(16)
1075 8: BRANCH(11)
1076 9: EXACT <ad>(16)
1077 11: BRANCH(14)
1078 12: EXACT <ab>(16)
1079 16: SUCCEED(0)
1080 17: NOTHING(18)
1081 18: END(0)
1082
1083This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084and should turn into:
1085
1086 1: CURLYM[1] {1,32767}(18)
1087 5: TRIE(16)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1089 <ac>
1090 <ad>
1091 <ab>
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096Cases where tail != last would be like /(?foo|bar)baz/:
1097
1098 1: BRANCH(4)
1099 2: EXACT <foo>(8)
1100 4: BRANCH(7)
1101 5: EXACT <bar>(8)
1102 7: TAIL(8)
1103 8: EXACT <baz>(10)
1104 10: END(0)
1105
1106which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107and would end up looking like:
1108
1109 1: TRIE(8)
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1111 <foo>
1112 <bar>
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1118
1119is the recommended Unicode-aware way of saying
1120
1121 *(d++) = uv;
1122*/
1123
1e2e3d02 1124#define TRIE_STORE_REVCHAR \
786e8c11 1125 STMT_START { \
1e2e3d02 1126 SV *tmp = newSVpvs(""); \
786e8c11 1127 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1130 } STMT_END
1131
1132#define TRIE_READ_CHAR STMT_START { \
1133 wordlen++; \
1134 if ( UTF ) { \
1135 if ( folder ) { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1138 foldlen -= len; \
1139 scan += len; \
1140 len = 0; \
1141 } else { \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1146 } \
1147 } else { \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1149 } \
1150 } else { \
1151 uvc = (U32)*uc; \
1152 len = 1; \
1153 } \
1154} STMT_END
1155
1156
1157
1158#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1162 } \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1166} STMT_END
07be1b83 1167
786e8c11
YO
1168#define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1173} STMT_END
07be1b83 1174
786e8c11
YO
1175#define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1178 \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1181 DEBUG_r({ \
1182 /* store the word for dumping */ \
1183 SV* tmp; \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1186 else \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1190 }); \
1191 \
1192 curword++; \
1193 \
1194 if ( noper_next < tail ) { \
1195 if (!trie->jump) \
1196 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1197 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1198 if (!jumper) \
1199 jumper = noper_next; \
1200 if (!nextbranch) \
1201 nextbranch= regnext(cur); \
1202 } \
1203 \
1204 if ( dupe ) { \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1214 } else { \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1217 } \
1218} STMT_END
07be1b83 1219
3dab1dad 1220
786e8c11
YO
1221#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1228 )
3dab1dad 1229
786e8c11
YO
1230#define MADE_TRIE 1
1231#define MADE_JUMP_TRIE 2
1232#define MADE_EXACT_TRIE 4
3dab1dad 1233
a3621e74 1234STATIC I32
786e8c11 1235S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1236{
27da23d5 1237 dVAR;
a3621e74
YO
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1240 regnode *cur;
9f7f3913 1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1242 STRLEN len = 0;
1243 UV uvc = 0;
1244 U16 curword = 0;
1245 U32 next_alloc = 0;
786e8c11
YO
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
7f69552c 1248 regnode *convert = NULL;
a3621e74 1249 /* we just use folder as a flag in utf8 */
e1ec3a88 1250 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1251 ? PL_fold
1252 : ( flags == EXACTFL
1253 ? PL_fold_locale
1254 : NULL
1255 )
1256 );
1257
e1ec3a88 1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1259 SV *re_trie_maxbuff;
3dab1dad
YO
1260#ifndef DEBUGGING
1261 /* these are only used during construction but are useful during
8e11feef 1262 * debugging so we store them in the struct when debugging.
8e11feef 1263 */
3dab1dad 1264 STRLEN trie_charcount=0;
3dab1dad
YO
1265 AV *trie_revcharmap;
1266#endif
a3621e74 1267 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1268#ifndef DEBUGGING
1269 PERL_UNUSED_ARG(depth);
1270#endif
a3621e74 1271
a02a5408 1272 Newxz( trie, 1, reg_trie_data );
a3621e74 1273 trie->refcount = 1;
3dab1dad 1274 trie->startstate = 1;
786e8c11 1275 trie->wordcount = word_count;
a3621e74 1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1277 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1280 DEBUG_r({
1281 trie->words = newAV();
a3621e74 1282 });
3dab1dad 1283 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1284
0111c4fd 1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1286 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1288 }
3dab1dad
YO
1289 DEBUG_OPTIMISE_r({
1290 PerlIO_printf( Perl_debug_log,
786e8c11 1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1295 (int)depth);
3dab1dad 1296 });
7f69552c
YO
1297
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1301 convert = first;
1302 } else {
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1305 }
1306
a3621e74
YO
1307 /* -- First loop and Setup --
1308
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1312 have unique chars.
1313
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1317 coded index.
1318
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1327
1328 */
1329
a3621e74 1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1331 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1332 const U8 *uc = (U8*)STRING( noper );
a28509cc 1333 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1334 STRLEN foldlen = 0;
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1336 const U8 *scan = (U8*)NULL;
07be1b83 1337 U32 wordlen = 0; /* required init */
3dab1dad 1338 STRLEN chars=0;
a3621e74 1339
3dab1dad
YO
1340 if (OP(noper) == NOTHING) {
1341 trie->minlen= 0;
1342 continue;
1343 }
1344 if (trie->bitmap) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1347 }
a3621e74 1348 for ( ; uc < e ; uc += len ) {
3dab1dad 1349 TRIE_CHARCOUNT(trie)++;
a3621e74 1350 TRIE_READ_CHAR;
3dab1dad 1351 chars++;
a3621e74
YO
1352 if ( uvc < 256 ) {
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1355 if ( folder )
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1357 TRIE_STORE_REVCHAR;
a3621e74
YO
1358 }
1359 } else {
1360 SV** svpp;
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1363
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1365
1366 if ( !svpp )
e4584336 1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1368
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1371 TRIE_STORE_REVCHAR;
a3621e74
YO
1372 }
1373 }
1374 }
3dab1dad
YO
1375 if( cur == first ) {
1376 trie->minlen=chars;
1377 trie->maxlen=chars;
1378 } else if (chars < trie->minlen) {
1379 trie->minlen=chars;
1380 } else if (chars > trie->maxlen) {
1381 trie->maxlen=chars;
1382 }
1383
a3621e74
YO
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
85c3142d 1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1391 );
786e8c11 1392 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1393
1394 /*
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1399 conservative but potentially much slower representation using an array
1400 of lists.
1401
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1408
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1412
1413 */
1414
1415
3dab1dad 1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1417 /*
1418 Second Pass -- Array Of Lists Representation
1419
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1423
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
a3621e74
YO
1427 */
1428
a3621e74
YO
1429 STRLEN transcount = 1;
1430
1e2e3d02
YO
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1434
3dab1dad 1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1436 TRIE_LIST_NEW(1);
1437 next_alloc = 2;
1438
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1440
c445ea15
AL
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
07be1b83 1448 U32 wordlen = 0; /* required init */
c445ea15
AL
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1450
3dab1dad 1451 if (OP(noper) != NOTHING) {
786e8c11 1452 for ( ; uc < e ; uc += len ) {
c445ea15 1453
786e8c11 1454 TRIE_READ_CHAR;
c445ea15 1455
786e8c11
YO
1456 if ( uvc < 256 ) {
1457 charid = trie->charmap[ uvc ];
c445ea15 1458 } else {
786e8c11
YO
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1460 if ( !svpp ) {
1461 charid = 0;
1462 } else {
1463 charid=(U16)SvIV( *svpp );
1464 }
c445ea15 1465 }
786e8c11
YO
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1467 if ( charid ) {
a3621e74 1468
786e8c11
YO
1469 U16 check;
1470 U32 newstate = 0;
a3621e74 1471
786e8c11
YO
1472 charid--;
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
c445ea15 1475 }
786e8c11
YO
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1479 break;
1480 }
1481 }
1482 if ( ! newstate ) {
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1485 transcount++;
1486 }
1487 state = newstate;
1488 } else {
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1490 }
a28509cc 1491 }
c445ea15 1492 }
3dab1dad 1493 TRIE_HANDLE_WORD(state);
a3621e74
YO
1494
1495 } /* end second pass */
1496
1e2e3d02
YO
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
a3621e74
YO
1499 Renew( trie->states, next_alloc, reg_trie_state );
1500
3dab1dad
YO
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1504 );
a3621e74 1505
a02a5408 1506 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1507 {
1508 U32 state;
a3621e74
YO
1509 U32 tp = 0;
1510 U32 zp = 0;
1511
1512
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1514 U32 base=0;
1515
1516 /*
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1519 );
1520 */
1521
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1524 U16 maxid=minid;
a28509cc 1525 U16 idx;
a3621e74
YO
1526
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1530 minid=forid;
1531 } else if ( forid > maxid ) {
1532 maxid=forid;
1533 }
a3621e74
YO
1534 }
1535 if ( transcount < tp + maxid - minid + 1) {
1536 transcount *= 2;
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1539 }
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1542 U32 set = 0;
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1548 set = 1;
1549 break;
1550 }
1551 }
1552 if ( !set ) {
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1555 tp++;
1556 zp = tp;
1557 }
1558 } else {
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1563 }
1564 tp += ( maxid - minid + 1 );
1565 }
1566 Safefree(trie->states[ state ].trans.list);
1567 }
1568 /*
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1571 );
1572 */
1573 trie->states[ state ].trans.base=base;
1574 }
cc601c31 1575 trie->lasttrans = tp + 1;
a3621e74
YO
1576 }
1577 } else {
1578 /*
1579 Second Pass -- Flat Table Representation.
1580
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1585
1586 We then construct the trie using only the .next slots of the entry
1587 structs.
1588
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1592
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1594 transition.
1595
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1601
1602 A B A B
1603 1. 2 4 1. 3 7
1604 2. 0 3 3. 0 5
1605 3. 0 0 5. 0 0
1606 4. 0 0 7. 0 0
1607
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1611
1612 */
1e2e3d02
YO
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
3dab1dad
YO
1616
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1618 reg_trie_trans );
3dab1dad 1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1620 next_alloc = trie->uniquecharcount + 1;
1621
3dab1dad 1622
a3621e74
YO
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1624
c445ea15 1625 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1628
1629 U32 state = 1; /* required init */
1630
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1634
1635 STRLEN foldlen = 0; /* required init */
07be1b83 1636 U32 wordlen = 0; /* required init */
a3621e74
YO
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1638
3dab1dad 1639 if ( OP(noper) != NOTHING ) {
786e8c11 1640 for ( ; uc < e ; uc += len ) {
a3621e74 1641
786e8c11 1642 TRIE_READ_CHAR;
a3621e74 1643
786e8c11
YO
1644 if ( uvc < 256 ) {
1645 charid = trie->charmap[ uvc ];
1646 } else {
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1649 }
786e8c11
YO
1650 if ( charid ) {
1651 charid--;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1656 }
1657 state = trie->trans[ state + charid ].next;
1658 } else {
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1660 }
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1662 }
a3621e74 1663 }
3dab1dad
YO
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1666
1667 } /* end second pass */
1668
3dab1dad
YO
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1672 );
a3621e74 1673
a3621e74
YO
1674 {
1675 /*
1676 * Inplace compress the table.*
1677
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1681
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1684
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1687
1688 - If .base is 0 there are no valid transitions from that node.
1689
1690 - If .base is nonzero then charid is added to it to find an entry in
1691 the trans array.
1692
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1698 valid.
1699
786e8c11 1700 XXX - wrong maybe?
a3621e74
YO
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1706 0.
1707
1708 - We set 'pos' to indicate the first entry of the second node.
1709
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1716 compression.
1717
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1720
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
786e8c11 1726 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1727
1728 - Once compressed we can Renew/realloc the structures to release the
1729 excess space.
1730
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1733
1734 demq
1735 */
a3b680e6 1736 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1737 U32 state, charid;
a3621e74 1738 U32 pos = 0, zp=0;
1e2e3d02 1739 trie->statecount = laststate;
a3621e74
YO
1740
1741 for ( state = 1 ; state < laststate ; state++ ) {
1742 U8 flag = 0;
a28509cc
AL
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1746 trie->trans[ stateidx ].check = 0;
1747
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1751 if (o_used == 1) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1754 break;
1755 }
1756 }
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1761 break;
1762 }
1763 used--;
1764 }
1765 if ( !flag ) {
1766 flag = 1;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1768 }
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1771 pos++;
1772 }
1773 }
1774 }
cc601c31 1775 trie->lasttrans = pos + 1;
1e2e3d02 1776 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1777 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1778 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1782 (IV)next_alloc,
1783 (IV)pos,
a3621e74
YO
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1785 );
1786
1787 } /* end table compress */
1788 }
1e2e3d02
YO
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1794 );
cc601c31
YO
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1797
3dab1dad
YO
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1801 );
07be1b83 1802
3dab1dad 1803 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1804 U8 nodetype =(U8)(flags & 0xFF);
1805 char *str=NULL;
786e8c11 1806
07be1b83 1807#ifdef DEBUGGING
e62cc96a 1808 regnode *optimize = NULL;
b57a0404
JH
1809 U32 mjd_offset = 0;
1810 U32 mjd_nodelen = 0;
07be1b83 1811#endif
a3621e74 1812 /*
3dab1dad
YO
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
a3621e74 1819 */
3dab1dad 1820 /* Find the node we are going to overwrite */
7f69552c 1821 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1822 /* branch sub-chain */
3dab1dad 1823 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1824 DEBUG_r({
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1827 });
7f69552c
YO
1828 /* whole branch chain */
1829 } else {
1830 DEBUG_r({
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1834 });
07be1b83 1835 }
7f69552c 1836
07be1b83
YO
1837 DEBUG_OPTIMISE_r(
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
786e8c11 1840 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1841 );
a3621e74 1842
3dab1dad
YO
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
786e8c11 1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1847 U32 state;
1e2e3d02 1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1849 U32 ofs = 0;
8e11feef
RGS
1850 I32 idx = -1;
1851 U32 count = 0;
1852 const U32 base = trie->states[ state ].trans.base;
a3621e74 1853
3dab1dad 1854 if ( trie->states[state].wordnum )
8e11feef 1855 count = 1;
a3621e74 1856
8e11feef 1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1861 {
3dab1dad 1862 if ( ++count > 1 ) {
8e11feef 1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1865 if ( state == 1 ) break;
3dab1dad
YO
1866 if ( count == 2 ) {
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1868 DEBUG_OPTIMISE_r(
8e11feef
RGS
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
786e8c11 1872 (UV)state));
be8e71aa
YO
1873 if (idx >= 0) {
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1876
3dab1dad 1877 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1878 if ( folder )
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1880 DEBUG_OPTIMISE_r(
07be1b83 1881 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1882 );
8e11feef
RGS
1883 }
1884 }
1885 TRIE_BITMAP_SET(trie,*ch);
1886 if ( folder )
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1889 }
1890 idx = ofs;
1891 }
3dab1dad
YO
1892 }
1893 if ( count == 1 ) {
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1895 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1896 DEBUG_OPTIMISE_r(
8e11feef
RGS
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)state, (UV)idx, ch)
3dab1dad
YO
1901 );
1902 if ( state==1 ) {
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1905 STR_LEN(convert)=0;
1906 }
1907 *str++=*ch;
1908 STR_LEN(convert)++;
a3621e74 1909
8e11feef 1910 } else {
f9049ba1 1911#ifdef DEBUGGING
8e11feef
RGS
1912 if (state>1)
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1914#endif
8e11feef
RGS
1915 break;
1916 }
1917 }
3dab1dad 1918 if (str) {
8e11feef 1919 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1921 trie->startstate = state;
07be1b83
YO
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1924 DEBUG_r({
1925 regnode *fix = convert;
1926 mjd_nodelen++;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1930 }
1931 });
8e11feef
RGS
1932 if (trie->maxlen) {
1933 convert = n;
1934 } else {
3dab1dad 1935 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1936 DEBUG_r(optimize= n);
3dab1dad
YO
1937 }
1938 }
1939 }
a5ca303d
YO
1940 if (!jumper)
1941 jumper = last;
3dab1dad 1942 if ( trie->maxlen ) {
8e11feef
RGS
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
786e8c11
YO
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1948 if (trie->jump)
7f69552c 1949 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1950
786e8c11
YO
1951 /* XXXX */
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1954 {
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1958 trie->bitmap= NULL;
1959 } else
1960 OP( convert ) = TRIE;
a3621e74 1961
3dab1dad
YO
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
a5ca303d
YO
1964 DEBUG_r({
1965 optimize = convert
1966 + NODE_STEP_REGNODE
1967 + regarglen[ OP( convert ) ];
1968 });
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
3dab1dad 1971 }
a3621e74 1972 /* needed for dumping*/
e62cc96a 1973 DEBUG_r(if (optimize) {
07be1b83 1974 regnode *opt = convert;
e62cc96a 1975 while ( ++opt < optimize) {
07be1b83
YO
1976 Set_Node_Offset_Length(opt,0,0);
1977 }
786e8c11
YO
1978 /*
1979 Try to clean up some of the debris left after the
1980 optimisation.
a3621e74 1981 */
786e8c11 1982 while( optimize < jumper ) {
07be1b83 1983 mjd_nodelen += Node_Length((optimize));
a3621e74 1984 OP( optimize ) = OPTIMIZED;
07be1b83 1985 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1986 optimize++;
1987 }
07be1b83 1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1989 });
1990 } /* end node insert */
07be1b83 1991#ifndef DEBUGGING
6e8b4190 1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1993#endif
786e8c11
YO
1994 return trie->jump
1995 ? MADE_JUMP_TRIE
1996 : trie->startstate>1
1997 ? MADE_EXACT_TRIE
1998 : MADE_TRIE;
1999}
2000
2001STATIC void
2002S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2003{
2004/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2005
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2008 ISBN 0-201-10088-6
2009
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2015 Consider
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2020 */
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2023 U32 *q;
2024 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2025 const U32 numstates = trie->statecount;
786e8c11
YO
2026 const U32 ubound = trie->lasttrans + ucharcount;
2027 U32 q_read = 0;
2028 U32 q_write = 0;
2029 U32 charid;
2030 U32 base = trie->states[ 1 ].trans.base;
2031 U32 *fail;
2032 reg_ac_data *aho;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2035#ifndef DEBUGGING
2036 PERL_UNUSED_ARG(depth);
2037#endif
2038
2039
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2043 aho->trie=trie;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2045 numstates * sizeof(reg_trie_state));
786e8c11
YO
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2048 aho->refcount = 1;
2049 fail = aho->fail;
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2053
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2056 if ( newstate ) {
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2060 }
2061 }
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2065
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2068 if (ch_state) {
2069 U32 fail_state = cur;
2070 U32 fail_base;
2071 do {
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2075
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2079 {
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2081 }
2082 q[ q_write++ % numstates] = ch_state;
2083 }
2084 }
2085 }
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2091 */
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2094 PerlIO_printf(Perl_debug_log,
2095 "%*sStclass Failtable (%"UVuf" states): 0",
2096 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2097 );
786e8c11
YO
2098 for( q_read=1; q_read<numstates; q_read++ ) {
2099 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2100 }
2101 PerlIO_printf(Perl_debug_log, "\n");
2102 });
2103 Safefree(q);
2104 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2105}
2106
786e8c11 2107
a3621e74 2108/*
5d1c421c
JH
2109 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2110 * These need to be revisited when a newer toolchain becomes available.
2111 */
2112#if defined(__sparc64__) && defined(__GNUC__)
2113# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2114# undef SPARC64_GCC_WORKAROUND
2115# define SPARC64_GCC_WORKAROUND 1
2116# endif
2117#endif
2118
07be1b83
YO
2119#define DEBUG_PEEP(str,scan,depth) \
2120 DEBUG_OPTIMISE_r({ \
2121 SV * const mysv=sv_newmortal(); \
2122 regnode *Next = regnext(scan); \
2123 regprop(RExC_rx, mysv, scan); \
7f69552c 2124 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2125 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2126 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2127 });
2128
1de06328
YO
2129
2130
2131
2132
07be1b83
YO
2133#define JOIN_EXACT(scan,min,flags) \
2134 if (PL_regkind[OP(scan)] == EXACT) \
2135 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2136
be8e71aa 2137STATIC U32
07be1b83
YO
2138S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2139 /* Merge several consecutive EXACTish nodes into one. */
2140 regnode *n = regnext(scan);
2141 U32 stringok = 1;
2142 regnode *next = scan + NODE_SZ_STR(scan);
2143 U32 merged = 0;
2144 U32 stopnow = 0;
2145#ifdef DEBUGGING
2146 regnode *stop = scan;
72f13be8 2147 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2148#else
d47053eb
RGS
2149 PERL_UNUSED_ARG(depth);
2150#endif
2151#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2152 PERL_UNUSED_ARG(flags);
2153 PERL_UNUSED_ARG(val);
07be1b83 2154#endif
07be1b83
YO
2155 DEBUG_PEEP("join",scan,depth);
2156
2157 /* Skip NOTHING, merge EXACT*. */
2158 while (n &&
2159 ( PL_regkind[OP(n)] == NOTHING ||
2160 (stringok && (OP(n) == OP(scan))))
2161 && NEXT_OFF(n)
2162 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2163
2164 if (OP(n) == TAIL || n > next)
2165 stringok = 0;
2166 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2167 DEBUG_PEEP("skip:",n,depth);
2168 NEXT_OFF(scan) += NEXT_OFF(n);
2169 next = n + NODE_STEP_REGNODE;
2170#ifdef DEBUGGING
2171 if (stringok)
2172 stop = n;
2173#endif
2174 n = regnext(n);
2175 }
2176 else if (stringok) {
786e8c11 2177 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2178 regnode * const nnext = regnext(n);
2179
2180 DEBUG_PEEP("merg",n,depth);
2181
2182 merged++;
2183 if (oldl + STR_LEN(n) > U8_MAX)
2184 break;
2185 NEXT_OFF(scan) += NEXT_OFF(n);
2186 STR_LEN(scan) += STR_LEN(n);
2187 next = n + NODE_SZ_STR(n);
2188 /* Now we can overwrite *n : */
2189 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2190#ifdef DEBUGGING
2191 stop = next - 1;
2192#endif
2193 n = nnext;
2194 if (stopnow) break;
2195 }
2196
d47053eb
RGS
2197#ifdef EXPERIMENTAL_INPLACESCAN
2198 if (flags && !NEXT_OFF(n)) {
2199 DEBUG_PEEP("atch", val, depth);
2200 if (reg_off_by_arg[OP(n)]) {
2201 ARG_SET(n, val - n);
2202 }
2203 else {
2204 NEXT_OFF(n) = val - n;
2205 }
2206 stopnow = 1;
2207 }
07be1b83
YO
2208#endif
2209 }
2210
2211 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2212 /*
2213 Two problematic code points in Unicode casefolding of EXACT nodes:
2214
2215 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2216 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2217
2218 which casefold to
2219
2220 Unicode UTF-8
2221
2222 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2223 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2224
2225 This means that in case-insensitive matching (or "loose matching",
2226 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2227 length of the above casefolded versions) can match a target string
2228 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2229 This would rather mess up the minimum length computation.
2230
2231 What we'll do is to look for the tail four bytes, and then peek
2232 at the preceding two bytes to see whether we need to decrease
2233 the minimum length by four (six minus two).
2234
2235 Thanks to the design of UTF-8, there cannot be false matches:
2236 A sequence of valid UTF-8 bytes cannot be a subsequence of
2237 another valid sequence of UTF-8 bytes.
2238
2239 */
2240 char * const s0 = STRING(scan), *s, *t;
2241 char * const s1 = s0 + STR_LEN(scan) - 1;
2242 char * const s2 = s1 - 4;
e294cc5d
JH
2243#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2244 const char t0[] = "\xaf\x49\xaf\x42";
2245#else
07be1b83 2246 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2247#endif
07be1b83
YO
2248 const char * const t1 = t0 + 3;
2249
2250 for (s = s0 + 2;
2251 s < s2 && (t = ninstr(s, s1, t0, t1));
2252 s = t + 4) {
e294cc5d
JH
2253#ifdef EBCDIC
2254 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2255 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2256#else
07be1b83
YO
2257 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2258 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2259#endif
07be1b83
YO
2260 *min -= 4;
2261 }
2262 }
2263
2264#ifdef DEBUGGING
2265 /* Allow dumping */
2266 n = scan + NODE_SZ_STR(scan);
2267 while (n <= stop) {
2268 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2269 OP(n) = OPTIMIZED;
2270 NEXT_OFF(n) = 0;
2271 }
2272 n++;
2273 }
2274#endif
2275 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2276 return stopnow;
2277}
2278
653099ff
GS
2279/* REx optimizer. Converts nodes into quickier variants "in place".
2280 Finds fixed substrings. */
2281
a0288114 2282/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2283 to the position after last scanned or to NULL. */
2284
40d049e4
YO
2285#define INIT_AND_WITHP \
2286 assert(!and_withp); \
2287 Newx(and_withp,1,struct regnode_charclass_class); \
2288 SAVEFREEPV(and_withp)
07be1b83 2289
76e3520e 2290STATIC I32
40d049e4 2291S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2292 I32 *minlenp, I32 *deltap,
40d049e4
YO
2293 regnode *last,
2294 scan_data_t *data,
2295 I32 stopparen,
2296 U8* recursed,
2297 struct regnode_charclass_class *and_withp,
2298 U32 flags, U32 depth)
c277df42
IZ
2299 /* scanp: Start here (read-write). */
2300 /* deltap: Write maxlen-minlen here. */
2301 /* last: Stop before this one. */
40d049e4
YO
2302 /* data: string data about the pattern */
2303 /* stopparen: treat close N as END */
2304 /* recursed: which subroutines have we recursed into */
2305 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2306{
97aff369 2307 dVAR;
c277df42
IZ
2308 I32 min = 0, pars = 0, code;
2309 regnode *scan = *scanp, *next;
2310 I32 delta = 0;
2311 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2312 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2313 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2314 scan_data_t data_fake;
a3621e74 2315 SV *re_trie_maxbuff = NULL;
786e8c11 2316 regnode *first_non_open = scan;
e2e6a0f1 2317 I32 stopmin = I32_MAX;
a3621e74 2318 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2319#ifdef DEBUGGING
40d049e4 2320 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2321#endif
40d049e4 2322
786e8c11 2323 if ( depth == 0 ) {
40d049e4 2324 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2325 first_non_open=regnext(first_non_open);
2326 }
2327
b81d288d 2328
c277df42
IZ
2329 while (scan && OP(scan) != END && scan < last) {
2330 /* Peephole optimizer: */
1de06328 2331 DEBUG_STUDYDATA(data,depth);
07be1b83 2332 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2333 JOIN_EXACT(scan,&min,0);
a3621e74 2334
653099ff
GS
2335 /* Follow the next-chain of the current node and optimize
2336 away all the NOTHINGs from it. */
c277df42 2337 if (OP(scan) != CURLYX) {
a3b680e6 2338 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2339 ? I32_MAX
2340 /* I32 may be smaller than U16 on CRAYs! */
2341 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2342 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2343 int noff;
2344 regnode *n = scan;
b81d288d 2345
c277df42
IZ
2346 /* Skip NOTHING and LONGJMP. */
2347 while ((n = regnext(n))
3dab1dad 2348 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2349 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2350 && off + noff < max)
2351 off += noff;
2352 if (reg_off_by_arg[OP(scan)])
2353 ARG(scan) = off;
b81d288d 2354 else
c277df42
IZ
2355 NEXT_OFF(scan) = off;
2356 }
a3621e74 2357
07be1b83 2358
3dab1dad 2359
653099ff
GS
2360 /* The principal pseudo-switch. Cannot be a switch, since we
2361 look into several different things. */
b81d288d 2362 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2363 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2364 next = regnext(scan);
2365 code = OP(scan);
a3621e74 2366 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2367
2368 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2369 /* NOTE - There is similar code to this block below for handling
2370 TRIE nodes on a re-study. If you change stuff here check there
2371 too. */
c277df42 2372 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2373 struct regnode_charclass_class accum;
d4c19fe8 2374 regnode * const startbranch=scan;
c277df42 2375
653099ff 2376 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2377 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2378 if (flags & SCF_DO_STCLASS)
830247a4 2379 cl_init_zero(pRExC_state, &accum);
a3621e74 2380
c277df42 2381 while (OP(scan) == code) {
830247a4 2382 I32 deltanext, minnext, f = 0, fake;
653099ff 2383 struct regnode_charclass_class this_class;
c277df42
IZ
2384
2385 num++;
2386 data_fake.flags = 0;
40d049e4 2387 if (data) {
2c2d71f5 2388 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2389 data_fake.last_closep = data->last_closep;
2390 }
2391 else
2392 data_fake.last_closep = &fake;
c277df42
IZ
2393 next = regnext(scan);
2394 scan = NEXTOPER(scan);
2395 if (code != BRANCH)
2396 scan = NEXTOPER(scan);
653099ff 2397 if (flags & SCF_DO_STCLASS) {
830247a4 2398 cl_init(pRExC_state, &this_class);
653099ff
GS
2399 data_fake.start_class = &this_class;
2400 f = SCF_DO_STCLASS_AND;
b81d288d 2401 }
e1901655
IZ
2402 if (flags & SCF_WHILEM_VISITED_POS)
2403 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2404
653099ff 2405 /* we suppose the run is continuous, last=next...*/
1de06328 2406 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2407 next, &data_fake,
2408 stopparen, recursed, NULL, f,depth+1);
b81d288d 2409 if (min1 > minnext)
c277df42
IZ
2410 min1 = minnext;
2411 if (max1 < minnext + deltanext)
2412 max1 = minnext + deltanext;
2413 if (deltanext == I32_MAX)
aca2d497 2414 is_inf = is_inf_internal = 1;
c277df42
IZ
2415 scan = next;
2416 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2417 pars++;
e2e6a0f1
YO
2418 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2419 if ( stopmin > minnext)
2420 stopmin = min + min1;
2421 flags &= ~SCF_DO_SUBSTR;
2422 if (data)
2423 data->flags |= SCF_SEEN_ACCEPT;
2424 }
3dab1dad
YO
2425 if (data) {
2426 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2427 data->flags |= SF_HAS_EVAL;
2c2d71f5 2428 data->whilem_c = data_fake.whilem_c;
3dab1dad 2429 }
653099ff 2430 if (flags & SCF_DO_STCLASS)
830247a4 2431 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2432 if (code == SUSPEND)
c277df42
IZ
2433 break;
2434 }
2435 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2436 min1 = 0;
2437 if (flags & SCF_DO_SUBSTR) {
2438 data->pos_min += min1;
2439 data->pos_delta += max1 - min1;
2440 if (max1 != min1 || is_inf)
2441 data->longest = &(data->longest_float);
2442 }
2443 min += min1;
2444 delta += max1 - min1;
653099ff 2445 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2446 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2447 if (min1) {
40d049e4 2448 cl_and(data->start_class, and_withp);
653099ff
GS
2449 flags &= ~SCF_DO_STCLASS;
2450 }
2451 }
2452 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2453 if (min1) {
2454 cl_and(data->start_class, &accum);
653099ff 2455 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2456 }
2457 else {
b81d288d 2458 /* Switch to OR mode: cache the old value of
de0c8cb8 2459 * data->start_class */
40d049e4
YO
2460 INIT_AND_WITHP;
2461 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2462 struct regnode_charclass_class);
2463 flags &= ~SCF_DO_STCLASS_AND;
2464 StructCopy(&accum, data->start_class,
2465 struct regnode_charclass_class);
2466 flags |= SCF_DO_STCLASS_OR;
2467 data->start_class->flags |= ANYOF_EOS;
2468 }
653099ff 2469 }
a3621e74 2470
786e8c11 2471 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2472 /* demq.
2473
2474 Assuming this was/is a branch we are dealing with: 'scan' now
2475 points at the item that follows the branch sequence, whatever
2476 it is. We now start at the beginning of the sequence and look
2477 for subsequences of
2478
786e8c11
YO
2479 BRANCH->EXACT=>x1
2480 BRANCH->EXACT=>x2
2481 tail
a3621e74
YO
2482
2483 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2484
2485 If we can find such a subseqence we need to turn the first
2486 element into a trie and then add the subsequent branch exact
2487 strings to the trie.
2488
2489 We have two cases
2490
786e8c11 2491 1. patterns where the whole set of branch can be converted.
a3621e74 2492
786e8c11 2493 2. patterns where only a subset can be converted.
a3621e74
YO
2494
2495 In case 1 we can replace the whole set with a single regop
2496 for the trie. In case 2 we need to keep the start and end
2497 branchs so
2498
2499 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2500 becomes BRANCH TRIE; BRANCH X;
2501
786e8c11
YO
2502 There is an additional case, that being where there is a
2503 common prefix, which gets split out into an EXACT like node
2504 preceding the TRIE node.
2505
2506 If x(1..n)==tail then we can do a simple trie, if not we make
2507 a "jump" trie, such that when we match the appropriate word
2508 we "jump" to the appopriate tail node. Essentailly we turn
2509 a nested if into a case structure of sorts.
a3621e74
YO
2510
2511 */
786e8c11 2512
3dab1dad 2513 int made=0;
0111c4fd
RGS
2514 if (!re_trie_maxbuff) {
2515 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2516 if (!SvIOK(re_trie_maxbuff))
2517 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2518 }
786e8c11 2519 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2520 regnode *cur;
2521 regnode *first = (regnode *)NULL;
2522 regnode *last = (regnode *)NULL;
2523 regnode *tail = scan;
2524 U8 optype = 0;
2525 U32 count=0;
2526
2527#ifdef DEBUGGING
c445ea15 2528 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2529#endif
2530 /* var tail is used because there may be a TAIL
2531 regop in the way. Ie, the exacts will point to the
2532 thing following the TAIL, but the last branch will
2533 point at the TAIL. So we advance tail. If we
2534 have nested (?:) we may have to move through several
2535 tails.
2536 */
2537
2538 while ( OP( tail ) == TAIL ) {
2539 /* this is the TAIL generated by (?:) */
2540 tail = regnext( tail );
2541 }
2542
3dab1dad 2543
a3621e74 2544 DEBUG_OPTIMISE_r({
32fc9b6a 2545 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2546 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2547 (int)depth * 2 + 2, "",
2548 "Looking for TRIE'able sequences. Tail node is: ",
2549 SvPV_nolen_const( mysv )
a3621e74
YO
2550 );
2551 });
3dab1dad 2552
a3621e74
YO
2553 /*
2554
2555 step through the branches, cur represents each
2556 branch, noper is the first thing to be matched
2557 as part of that branch and noper_next is the
2558 regnext() of that node. if noper is an EXACT
2559 and noper_next is the same as scan (our current
2560 position in the regex) then the EXACT branch is
2561 a possible optimization target. Once we have
2562 two or more consequetive such branches we can
2563 create a trie of the EXACT's contents and stich
2564 it in place. If the sequence represents all of
2565 the branches we eliminate the whole thing and
2566 replace it with a single TRIE. If it is a
2567 subsequence then we need to stitch it in. This
2568 means the first branch has to remain, and needs
2569 to be repointed at the item on the branch chain
2570 following the last branch optimized. This could
2571 be either a BRANCH, in which case the
2572 subsequence is internal, or it could be the
2573 item following the branch sequence in which
2574 case the subsequence is at the end.
2575
2576 */
2577
2578 /* dont use tail as the end marker for this traverse */
2579 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2580 regnode * const noper = NEXTOPER( cur );
be981c67 2581#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2582 regnode * const noper_next = regnext( noper );
be981c67 2583#endif
a3621e74 2584
a3621e74 2585 DEBUG_OPTIMISE_r({
32fc9b6a 2586 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2587 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2588 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2589
32fc9b6a 2590 regprop(RExC_rx, mysv, noper);
a3621e74 2591 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2592 SvPV_nolen_const(mysv));
a3621e74
YO
2593
2594 if ( noper_next ) {
32fc9b6a 2595 regprop(RExC_rx, mysv, noper_next );
a3621e74 2596 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2597 SvPV_nolen_const(mysv));
a3621e74 2598 }
3dab1dad
YO
2599 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2600 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2601 });
3dab1dad
YO
2602 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2603 : PL_regkind[ OP( noper ) ] == EXACT )
2604 || OP(noper) == NOTHING )
786e8c11
YO
2605#ifdef NOJUMPTRIE
2606 && noper_next == tail
2607#endif
2608 && count < U16_MAX)
a3621e74
YO
2609 {
2610 count++;
3dab1dad
YO
2611 if ( !first || optype == NOTHING ) {
2612 if (!first) first = cur;
a3621e74
YO
2613 optype = OP( noper );
2614 } else {
a3621e74 2615 last = cur;
a3621e74
YO
2616 }
2617 } else {
2618 if ( last ) {
786e8c11
YO
2619 make_trie( pRExC_state,
2620 startbranch, first, cur, tail, count,
2621 optype, depth+1 );
a3621e74 2622 }
3dab1dad 2623 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2624#ifdef NOJUMPTRIE
2625 && noper_next == tail
2626#endif
2627 ){
a3621e74
YO
2628 count = 1;
2629 first = cur;
2630 optype = OP( noper );
2631 } else {
2632 count = 0;
2633 first = NULL;
2634 optype = 0;
2635 }
2636 last = NULL;
2637 }
2638 }
2639 DEBUG_OPTIMISE_r({
32fc9b6a 2640 regprop(RExC_rx, mysv, cur);
a3621e74 2641 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2642 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2643 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2644
2645 });
2646 if ( last ) {
786e8c11 2647 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2648#ifdef TRIE_STUDY_OPT
786e8c11
YO
2649 if ( ((made == MADE_EXACT_TRIE &&
2650 startbranch == first)
2651 || ( first_non_open == first )) &&
2652 depth==0 )
2653 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2654#endif
07be1b83 2655 }
a3621e74 2656 }
3dab1dad
YO
2657
2658 } /* do trie */
786e8c11 2659
a0ed51b3 2660 }
a3621e74 2661 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2662 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2663 } else /* single branch is optimized. */
c277df42
IZ
2664 scan = NEXTOPER(scan);
2665 continue;
a0ed51b3
LW
2666 }
2667 else if (OP(scan) == EXACT) {
cd439c50 2668 I32 l = STR_LEN(scan);
c445ea15 2669 UV uc;
a0ed51b3 2670 if (UTF) {
a3b680e6 2671 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2672 l = utf8_length(s, s + l);
9041c2e3 2673 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2674 } else {
2675 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2676 }
2677 min += l;
c277df42 2678 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2679 /* The code below prefers earlier match for fixed
2680 offset, later match for variable offset. */
2681 if (data->last_end == -1) { /* Update the start info. */
2682 data->last_start_min = data->pos_min;
2683 data->last_start_max = is_inf
b81d288d 2684 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2685 }
cd439c50 2686 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2687 if (UTF)
2688 SvUTF8_on(data->last_found);
0eda9292 2689 {
9a957fbc 2690 SV * const sv = data->last_found;
a28509cc 2691 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2692 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2693 if (mg && mg->mg_len >= 0)
5e43f467
JH
2694 mg->mg_len += utf8_length((U8*)STRING(scan),
2695 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2696 }
c277df42
IZ
2697 data->last_end = data->pos_min + l;
2698 data->pos_min += l; /* As in the first entry. */
2699 data->flags &= ~SF_BEFORE_EOL;
2700 }
653099ff
GS
2701 if (flags & SCF_DO_STCLASS_AND) {
2702 /* Check whether it is compatible with what we know already! */
2703 int compat = 1;
2704
1aa99e6b 2705 if (uc >= 0x100 ||
516a5887 2706 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2707 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2708 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2709 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2710 )
653099ff
GS
2711 compat = 0;
2712 ANYOF_CLASS_ZERO(data->start_class);
2713 ANYOF_BITMAP_ZERO(data->start_class);
2714 if (compat)
1aa99e6b 2715 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2716 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2717 if (uc < 0x100)
2718 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2719 }
2720 else if (flags & SCF_DO_STCLASS_OR) {
2721 /* false positive possible if the class is case-folded */
1aa99e6b 2722 if (uc < 0x100)
9b877dbb
IH
2723 ANYOF_BITMAP_SET(data->start_class, uc);
2724 else
2725 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2726 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2727 cl_and(data->start_class, and_withp);
653099ff
GS
2728 }
2729 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2730 }
3dab1dad 2731 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2732 I32 l = STR_LEN(scan);
1aa99e6b 2733 UV uc = *((U8*)STRING(scan));
653099ff
GS
2734
2735 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2736 if (flags & SCF_DO_SUBSTR) {
2737 assert(data);
1de06328 2738 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2739 }
a0ed51b3 2740 if (UTF) {
6136c704 2741 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2742 l = utf8_length(s, s + l);
9041c2e3 2743 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2744 }
2745 min += l;
ecaa9b9c 2746 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2747 data->pos_min += l;
653099ff
GS
2748 if (flags & SCF_DO_STCLASS_AND) {
2749 /* Check whether it is compatible with what we know already! */
2750 int compat = 1;
2751
1aa99e6b 2752 if (uc >= 0x100 ||
516a5887 2753 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2754 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2755 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2756 compat = 0;
2757 ANYOF_CLASS_ZERO(data->start_class);
2758 ANYOF_BITMAP_ZERO(data->start_class);
2759 if (compat) {
1aa99e6b 2760 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2761 data->start_class->flags &= ~ANYOF_EOS;
2762 data->start_class->flags |= ANYOF_FOLD;
2763 if (OP(scan) == EXACTFL)
2764 data->start_class->flags |= ANYOF_LOCALE;
2765 }
2766 }
2767 else if (flags & SCF_DO_STCLASS_OR) {
2768 if (data->start_class->flags & ANYOF_FOLD) {
2769 /* false positive possible if the class is case-folded.
2770 Assume that the locale settings are the same... */
1aa99e6b
IH
2771 if (uc < 0x100)
2772 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2773 data->start_class->flags &= ~ANYOF_EOS;
2774 }
40d049e4 2775 cl_and(data->start_class, and_withp);
653099ff
GS
2776 }
2777 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2778 }
bfed75c6 2779 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2780 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2781 I32 f = flags, pos_before = 0;
d4c19fe8 2782 regnode * const oscan = scan;
653099ff
GS
2783 struct regnode_charclass_class this_class;
2784 struct regnode_charclass_class *oclass = NULL;
727f22e3 2785 I32 next_is_eval = 0;
653099ff 2786
3dab1dad 2787 switch (PL_regkind[OP(scan)]) {
653099ff 2788 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2789 scan = NEXTOPER(scan);
2790 goto finish;
2791 case PLUS:
653099ff 2792 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2793 next = NEXTOPER(scan);
653099ff 2794 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2795 mincount = 1;
2796 maxcount = REG_INFTY;
c277df42
IZ
2797 next = regnext(scan);
2798 scan = NEXTOPER(scan);
2799 goto do_curly;
2800 }
2801 }
2802 if (flags & SCF_DO_SUBSTR)
2803 data->pos_min++;
2804 min++;
2805 /* Fall through. */
2806 case STAR:
653099ff
GS
2807 if (flags & SCF_DO_STCLASS) {
2808 mincount = 0;
b81d288d 2809 maxcount = REG_INFTY;
653099ff
GS
2810 next = regnext(scan);
2811 scan = NEXTOPER(scan);
2812 goto do_curly;
2813 }
b81d288d 2814 is_inf = is_inf_internal = 1;
c277df42
IZ
2815 scan = regnext(scan);
2816 if (flags & SCF_DO_SUBSTR) {
1de06328 2817 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2818 data->longest = &(data->longest_float);
2819 }
2820 goto optimize_curly_tail;
2821 case CURLY:
40d049e4
YO
2822 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2823 && (scan->flags == stopparen))
2824 {
2825 mincount = 1;
2826 maxcount = 1;
2827 } else {
2828 mincount = ARG1(scan);
2829 maxcount = ARG2(scan);
2830 }
c277df42 2831 next = regnext(scan);
cb434fcc
IZ
2832 if (OP(scan) == CURLYX) {
2833 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2834 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2835 }
c277df42 2836 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2837 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2838 do_curly:
2839 if (flags & SCF_DO_SUBSTR) {
1de06328 2840 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2841 pos_before = data->pos_min;
2842 }
2843 if (data) {
2844 fl = data->flags;
2845 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2846 if (is_inf)
2847 data->flags |= SF_IS_INF;
2848 }
653099ff 2849 if (flags & SCF_DO_STCLASS) {
830247a4 2850 cl_init(pRExC_state, &this_class);
653099ff
GS
2851 oclass = data->start_class;
2852 data->start_class = &this_class;
2853 f |= SCF_DO_STCLASS_AND;
2854 f &= ~SCF_DO_STCLASS_OR;
2855 }
e1901655
IZ
2856 /* These are the cases when once a subexpression
2857 fails at a particular position, it cannot succeed
2858 even after backtracking at the enclosing scope.
b81d288d 2859
e1901655
IZ
2860 XXXX what if minimal match and we are at the
2861 initial run of {n,m}? */
2862 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2863 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2864
c277df42 2865 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2866 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2867 last, data, stopparen, recursed, NULL,
a3621e74
YO
2868 (mincount == 0
2869 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2870
2871 if (flags & SCF_DO_STCLASS)
2872 data->start_class = oclass;
2873 if (mincount == 0 || minnext == 0) {
2874 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2875 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2876 }
2877 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2878 /* Switch to OR mode: cache the old value of
653099ff 2879 * data->start_class */
40d049e4
YO
2880 INIT_AND_WITHP;
2881 StructCopy(data->start_class, and_withp,
653099ff
GS
2882 struct regnode_charclass_class);
2883 flags &= ~SCF_DO_STCLASS_AND;
2884 StructCopy(&this_class, data->start_class,
2885 struct regnode_charclass_class);
2886 flags |= SCF_DO_STCLASS_OR;
2887 data->start_class->flags |= ANYOF_EOS;
2888 }
2889 } else { /* Non-zero len */
2890 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2891 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2892 cl_and(data->start_class, and_withp);
653099ff
GS
2893 }
2894 else if (flags & SCF_DO_STCLASS_AND)
2895 cl_and(data->start_class, &this_class);
2896 flags &= ~SCF_DO_STCLASS;
2897 }
c277df42
IZ
2898 if (!scan) /* It was not CURLYX, but CURLY. */
2899 scan = next;
041457d9
DM
2900 if ( /* ? quantifier ok, except for (?{ ... }) */
2901 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2902 && (minnext == 0) && (deltanext == 0)
99799961 2903 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2904 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2905 && ckWARN(WARN_REGEXP))
b45f050a 2906 {
830247a4 2907 vWARN(RExC_parse,
b45f050a
JF
2908 "Quantifier unexpected on zero-length expression");
2909 }
2910
c277df42 2911 min += minnext * mincount;
b81d288d 2912 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2913 && (minnext + deltanext) > 0)
2914 || deltanext == I32_MAX);
aca2d497 2915 is_inf |= is_inf_internal;
c277df42
IZ
2916 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2917
2918 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2919 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2920 && data->flags & SF_IN_PAR
2921 && !(data->flags & SF_HAS_EVAL)
2922 && !deltanext && minnext == 1 ) {
2923 /* Try to optimize to CURLYN. */
2924 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2925 regnode * const nxt1 = nxt;
497b47a8
JH
2926#ifdef DEBUGGING
2927 regnode *nxt2;
2928#endif
c277df42
IZ
2929
2930 /* Skip open. */
2931 nxt = regnext(nxt);
bfed75c6 2932 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2933 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2934 && STR_LEN(nxt) == 1))
c277df42 2935 goto nogo;
497b47a8 2936#ifdef DEBUGGING
c277df42 2937 nxt2 = nxt;
497b47a8 2938#endif
c277df42 2939 nxt = regnext(nxt);
b81d288d 2940 if (OP(nxt) != CLOSE)
c277df42 2941 goto nogo;
40d049e4
YO
2942 if (RExC_open_parens) {
2943 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2944 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2945 }
c277df42 2946 /* Now we know that nxt2 is the only contents: */
eb160463 2947 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2948 OP(oscan) = CURLYN;
2949 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2950
c277df42
IZ
2951#ifdef DEBUGGING
2952 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2953 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2954 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2955 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2956 OP(nxt + 1) = OPTIMIZED; /* was count. */
2957 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2958#endif
c277df42 2959 }
c277df42
IZ
2960 nogo:
2961
2962 /* Try optimization CURLYX => CURLYM. */
b81d288d 2963 if ( OP(oscan) == CURLYX && data
c277df42 2964 && !(data->flags & SF_HAS_PAR)
c277df42 2965 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2966 && !deltanext /* atom is fixed width */
2967 && minnext != 0 /* CURLYM can't handle zero width */
2968 ) {
c277df42
IZ
2969 /* XXXX How to optimize if data == 0? */
2970 /* Optimize to a simpler form. */
2971 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2972 regnode *nxt2;
2973
2974 OP(oscan) = CURLYM;
2975 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2976 && (OP(nxt2) != WHILEM))
c277df42
IZ
2977 nxt = nxt2;
2978 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2979 /* Need to optimize away parenths. */
2980 if (data->flags & SF_IN_PAR) {
2981 /* Set the parenth number. */
2982 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2983
b81d288d 2984 if (OP(nxt) != CLOSE)
b45f050a 2985 FAIL("Panic opt close");
eb160463 2986 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2987 if (RExC_open_parens) {
2988 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2989 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2990 }
c277df42
IZ
2991 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2992 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2993
c277df42
IZ
2994#ifdef DEBUGGING
2995 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2996 OP(nxt + 1) = OPTIMIZED; /* was count. */
2997 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2998 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2999#endif
c277df42
IZ
3000#if 0
3001 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3002 regnode *nnxt = regnext(nxt1);
b81d288d 3003
c277df42
IZ
3004 if (nnxt == nxt) {
3005 if (reg_off_by_arg[OP(nxt1)])
3006 ARG_SET(nxt1, nxt2 - nxt1);
3007 else if (nxt2 - nxt1 < U16_MAX)
3008 NEXT_OFF(nxt1) = nxt2 - nxt1;
3009 else
3010 OP(nxt) = NOTHING; /* Cannot beautify */
3011 }
3012 nxt1 = nnxt;
3013 }
3014#endif
3015 /* Optimize again: */
1de06328 3016 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3017 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3018 }
3019 else
c277df42 3020 oscan->flags = 0;
c277df42 3021 }
e1901655
IZ
3022 else if ((OP(oscan) == CURLYX)
3023 && (flags & SCF_WHILEM_VISITED_POS)
3024 /* See the comment on a similar expression above.
3025 However, this time it not a subexpression
3026 we care about, but the expression itself. */
3027 && (maxcount == REG_INFTY)
3028 && data && ++data->whilem_c < 16) {
3029 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3030 /* Find WHILEM (as in regexec.c) */
3031 regnode *nxt = oscan + NEXT_OFF(oscan);
3032
3033 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3034 nxt += ARG(nxt);
eb160463
GS
3035 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3036 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3037 }
b81d288d 3038 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3039 pars++;
3040 if (flags & SCF_DO_SUBSTR) {
c445ea15 3041 SV *last_str = NULL;
c277df42
IZ
3042 int counted = mincount != 0;
3043
3044 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3045#if defined(SPARC64_GCC_WORKAROUND)
3046 I32 b = 0;
3047 STRLEN l = 0;
cfd0369c 3048 const char *s = NULL;
5d1c421c
JH
3049 I32 old = 0;
3050
3051 if (pos_before >= data->last_start_min)
3052 b = pos_before;
3053 else
3054 b = data->last_start_min;
3055
3056 l = 0;
cfd0369c 3057 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3058 old = b - data->last_start_min;
3059
3060#else
b81d288d 3061 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3062 ? pos_before : data->last_start_min;
3063 STRLEN l;
d4c19fe8 3064 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3065 I32 old = b - data->last_start_min;
5d1c421c 3066#endif
a0ed51b3
LW
3067
3068 if (UTF)
3069 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3070
a0ed51b3 3071 l -= old;
c277df42 3072 /* Get the added string: */
79cb57f6 3073 last_str = newSVpvn(s + old, l);
0e933229
IH
3074 if (UTF)
3075 SvUTF8_on(last_str);
c277df42
IZ
3076 if (deltanext == 0 && pos_before == b) {
3077 /* What was added is a constant string */
3078 if (mincount > 1) {
3079 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3080 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3081 SvPVX_const(last_str), l, mincount - 1);
b162af07 3082 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3083 /* Add additional parts. */
b81d288d 3084 SvCUR_set(data->last_found,
c277df42
IZ
3085 SvCUR(data->last_found) - l);
3086 sv_catsv(data->last_found, last_str);
0eda9292
JH
3087 {
3088 SV * sv = data->last_found;
3089 MAGIC *mg =
3090 SvUTF8(sv) && SvMAGICAL(sv) ?
3091 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3092 if (mg && mg->mg_len >= 0)
3093 mg->mg_len += CHR_SVLEN(last_str);
3094 }
c277df42
IZ
3095 data->last_end += l * (mincount - 1);
3096 }
2a8d9689
HS
3097 } else {
3098 /* start offset must point into the last copy */
3099 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3100 data->last_start_max += is_inf ? I32_MAX
3101 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3102 }
3103 }
3104 /* It is counted once already... */
3105 data->pos_min += minnext * (mincount - counted);
3106 data->pos_delta += - counted * deltanext +
3107 (minnext + deltanext) * maxcount - minnext * mincount;
3108 if (mincount != maxcount) {
653099ff
GS
3109 /* Cannot extend fixed substrings found inside
3110 the group. */
1de06328 3111 scan_commit(pRExC_state,data,minlenp);
c277df42 3112 if (mincount && last_str) {
d4c19fe8
AL
3113 SV * const sv = data->last_found;
3114 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3115 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3116
3117 if (mg)
3118 mg->mg_len = -1;
3119 sv_setsv(sv, last_str);
c277df42 3120 data->last_end = data->pos_min;
b81d288d 3121 data->last_start_min =
a0ed51b3 3122 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3123 data->last_start_max = is_inf
3124 ? I32_MAX
c277df42 3125 : data->pos_min + data->pos_delta
a0ed51b3 3126 - CHR_SVLEN(last_str);
c277df42
IZ
3127 }
3128 data->longest = &(data->longest_float);
3129 }
aca2d497 3130 SvREFCNT_dec(last_str);
c277df42 3131 }
405ff068 3132 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3133 data->flags |= SF_HAS_EVAL;
3134 optimize_curly_tail:
c277df42 3135 if (OP(oscan) != CURLYX) {
3dab1dad 3136 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3137 && NEXT_OFF(next))
3138 NEXT_OFF(oscan) += NEXT_OFF(next);
3139 }
c277df42 3140 continue;
653099ff 3141 default: /* REF and CLUMP only? */
c277df42 3142 if (flags & SCF_DO_SUBSTR) {
1de06328 3143 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3144 data->longest = &(data->longest_float);
3145 }
aca2d497 3146 is_inf = is_inf_internal = 1;
653099ff 3147 if (flags & SCF_DO_STCLASS_OR)
830247a4 3148 cl_anything(pRExC_state, data->start_class);
653099ff 3149 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3150 break;
3151 }
a0ed51b3 3152 }
bfed75c6 3153 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3154 int value = 0;
653099ff 3155
c277df42 3156 if (flags & SCF_DO_SUBSTR) {
1de06328 3157 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3158 data->pos_min++;
3159 }
3160 min++;
653099ff
GS
3161 if (flags & SCF_DO_STCLASS) {
3162 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3163
3164 /* Some of the logic below assumes that switching
3165 locale on will only add false positives. */
3dab1dad 3166 switch (PL_regkind[OP(scan)]) {
653099ff 3167 case SANY:
653099ff
GS
3168 default:
3169 do_default:
3170 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3171 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3172 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3173 break;
3174 case REG_ANY:
3175 if (OP(scan) == SANY)
3176 goto do_default;
3177 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3178 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3179 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3180 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3181 }
3182 if (flags & SCF_DO_STCLASS_AND || !value)
3183 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3184 break;
3185 case ANYOF:
3186 if (flags & SCF_DO_STCLASS_AND)
3187 cl_and(data->start_class,
3188 (struct regnode_charclass_class*)scan);
3189 else
830247a4 3190 cl_or(pRExC_state, data->start_class,
653099ff
GS
3191 (struct regnode_charclass_class*)scan);
3192 break;
3193 case ALNUM:
3194 if (flags & SCF_DO_STCLASS_AND) {
3195 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3196 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3197 for (value = 0; value < 256; value++)
3198 if (!isALNUM(value))
3199 ANYOF_BITMAP_CLEAR(data->start_class, value);
3200 }
3201 }
3202 else {
3203 if (data->start_class->flags & ANYOF_LOCALE)
3204 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3205 else {
3206 for (value = 0; value < 256; value++)
3207 if (isALNUM(value))
b81d288d 3208 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3209 }
3210 }
3211 break;
3212 case ALNUML:
3213 if (flags & SCF_DO_STCLASS_AND) {
3214 if (data->start_class->flags & ANYOF_LOCALE)
3215 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3216 }
3217 else {
3218 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3219 data->start_class->flags |= ANYOF_LOCALE;
3220 }
3221 break;
3222 case NALNUM:
3223 if (flags & SCF_DO_STCLASS_AND) {
3224 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3225 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3226 for (value = 0; value < 256; value++)
3227 if (isALNUM(value))
3228 ANYOF_BITMAP_CLEAR(data->start_class, value);
3229 }
3230 }
3231 else {
3232 if (data->start_class->flags & ANYOF_LOCALE)
3233 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3234 else {
3235 for (value = 0; value < 256; value++)
3236 if (!isALNUM(value))
b81d288d 3237 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3238 }
3239 }
3240 break;
3241 case NALNUML:
3242 if (flags & SCF_DO_STCLASS_AND) {
3243 if (data->start_class->flags & ANYOF_LOCALE)
3244 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3245 }
3246 else {
3247 data->start_class->flags |= ANYOF_LOCALE;
3248 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3249 }
3250 break;
3251 case SPACE:
3252 if (flags & SCF_DO_STCLASS_AND) {
3253 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3254 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3255 for (value = 0; value < 256; value++)
3256 if (!isSPACE(value))
3257 ANYOF_BITMAP_CLEAR(data->start_class, value);
3258 }
3259 }
3260 else {
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3263 else {
3264 for (value = 0; value < 256; value++)
3265 if (isSPACE(value))
b81d288d 3266 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3267 }
3268 }
3269 break;
3270 case SPACEL:
3271 if (flags & SCF_DO_STCLASS_AND) {
3272 if (data->start_class->flags & ANYOF_LOCALE)
3273 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3274 }
3275 else {
3276 data->start_class->flags |= ANYOF_LOCALE;
3277 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3278 }
3279 break;
3280 case NSPACE:
3281 if (flags & SCF_DO_STCLASS_AND) {
3282 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3283 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3284 for (value = 0; value < 256; value++)
3285 if (isSPACE(value))
3286 ANYOF_BITMAP_CLEAR(data->start_class, value);
3287 }
3288 }
3289 else {
3290 if (data->start_class->flags & ANYOF_LOCALE)
3291 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3292 else {
3293 for (value = 0; value < 256; value++)
3294 if (!isSPACE(value))
b81d288d 3295 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3296 }
3297 }
3298 break;
3299 case NSPACEL:
3300 if (flags & SCF_DO_STCLASS_AND) {
3301 if (data->start_class->flags & ANYOF_LOCALE) {
3302 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3303 for (value = 0; value < 256; value++)
3304 if (!isSPACE(value))
3305 ANYOF_BITMAP_CLEAR(data->start_class, value);
3306 }
3307 }
3308 else {
3309 data->start_class->flags |= ANYOF_LOCALE;
3310 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3311 }
3312 break;
3313 case DIGIT:
3314 if (flags & SCF_DO_STCLASS_AND) {
3315 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3316 for (value = 0; value < 256; value++)
3317 if (!isDIGIT(value))
3318 ANYOF_BITMAP_CLEAR(data->start_class, value);
3319 }
3320 else {
3321 if (data->start_class->flags & ANYOF_LOCALE)
3322 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3323 else {
3324 for (value = 0; value < 256; value++)
3325 if (isDIGIT(value))
b81d288d 3326 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3327 }
3328 }
3329 break;
3330 case NDIGIT:
3331 if (flags & SCF_DO_STCLASS_AND) {
3332 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3333 for (value = 0; value < 256; value++)
3334 if (isDIGIT(value))
3335 ANYOF_BITMAP_CLEAR(data->start_class, value);
3336 }
3337 else {
3338 if (data->start_class->flags & ANYOF_LOCALE)
3339 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3340 else {
3341 for (value = 0; value < 256; value++)
3342 if (!isDIGIT(value))
b81d288d 3343 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3344 }
3345 }
3346 break;
3347 }
3348 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3349 cl_and(data->start_class, and_withp);
653099ff
GS
3350 flags &= ~SCF_DO_STCLASS;
3351 }
a0ed51b3 3352 }
3dab1dad 3353 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3354 data->flags |= (OP(scan) == MEOL
3355 ? SF_BEFORE_MEOL
3356 : SF_BEFORE_SEOL);
a0ed51b3 3357 }
3dab1dad 3358 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3359 /* Lookbehind, or need to calculate parens/evals/stclass: */
3360 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3361 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3362 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3363 || OP(scan) == UNLESSM )
3364 {
3365 /* Negative Lookahead/lookbehind
3366 In this case we can't do fixed string optimisation.
3367 */
3368
3369 I32 deltanext, minnext, fake = 0;
3370 regnode *nscan;
3371 struct regnode_charclass_class intrnl;
3372 int f = 0;
3373
3374 data_fake.flags = 0;
3375 if (data) {
3376 data_fake.whilem_c = data->whilem_c;
3377 data_fake.last_closep = data->last_closep;
a0ed51b3 3378 }
1de06328
YO
3379 else
3380 data_fake.last_closep = &fake;
3381 if ( flags & SCF_DO_STCLASS && !scan->flags
3382 && OP(scan) == IFMATCH ) { /* Lookahead */
3383 cl_init(pRExC_state, &intrnl);
3384 data_fake.start_class = &intrnl;
3385 f |= SCF_DO_STCLASS_AND;
c277df42 3386 }
1de06328
YO
3387 if (flags & SCF_WHILEM_VISITED_POS)
3388 f |= SCF_WHILEM_VISITED_POS;
3389 next = regnext(scan);
3390 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3391 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3392 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3393 if (scan->flags) {
3394 if (deltanext) {
3395 vFAIL("Variable length lookbehind not implemented");
3396 }
3397 else if (minnext > (I32)U8_MAX) {
3398 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3399 }
3400 scan->flags = (U8)minnext;
3401 }
3402 if (data) {
3403 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3404 pars++;
3405 if (data_fake.flags & SF_HAS_EVAL)
3406 data->flags |= SF_HAS_EVAL;
3407 data->whilem_c = data_fake.whilem_c;
3408 }
3409 if (f & SCF_DO_STCLASS_AND) {
3410 const int was = (data->start_class->flags & ANYOF_EOS);
3411
3412 cl_and(data->start_class, &intrnl);
3413 if (was)
3414 data->start_class->flags |= ANYOF_EOS;
3415 }
be8e71aa 3416 }
1de06328
YO
3417#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3418 else {
3419 /* Positive Lookahead/lookbehind
3420 In this case we can do fixed string optimisation,
3421 but we must be careful about it. Note in the case of
3422 lookbehind the positions will be offset by the minimum
3423 length of the pattern, something we won't know about
3424 until after the recurse.
3425 */
3426 I32 deltanext, fake = 0;
3427 regnode *nscan;
3428 struct regnode_charclass_class intrnl;
3429 int f = 0;
3430 /* We use SAVEFREEPV so that when the full compile
3431 is finished perl will clean up the allocated
3432 minlens when its all done. This was we don't
3433 have to worry about freeing them when we know
3434 they wont be used, which would be a pain.
3435 */
3436 I32 *minnextp;
3437 Newx( minnextp, 1, I32 );
3438 SAVEFREEPV(minnextp);
3439
3440 if (data) {
3441 StructCopy(data, &data_fake, scan_data_t);
3442 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3443 f |= SCF_DO_SUBSTR;
3444 if (scan->flags)
3445 scan_commit(pRExC_state, &data_fake,minlenp);
3446 data_fake.last_found=newSVsv(data->last_found);
3447 }
3448 }
3449 else
3450 data_fake.last_closep = &fake;
3451 data_fake.flags = 0;
3452 if (is_inf)
3453 data_fake.flags |= SF_IS_INF;
3454 if ( flags & SCF_DO_STCLASS && !scan->flags
3455 && OP(scan) == IFMATCH ) { /* Lookahead */
3456 cl_init(pRExC_state, &intrnl);
3457 data_fake.start_class = &intrnl;
3458 f |= SCF_DO_STCLASS_AND;
3459 }
3460 if (flags & SCF_WHILEM_VISITED_POS)
3461 f |= SCF_WHILEM_VISITED_POS;
3462 next = regnext(scan);
3463 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3464
3465 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3466 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3467 if (scan->flags) {
3468 if (deltanext) {
3469 vFAIL("Variable length lookbehind not implemented");
3470 }
3471 else if (*minnextp > (I32)U8_MAX) {
3472 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3473 }
3474 scan->flags = (U8)*minnextp;
3475 }
40d049e4 3476
1de06328 3477 *minnextp += min;
40d049e4 3478
1de06328
YO
3479 if (f & SCF_DO_STCLASS_AND) {
3480 const int was = (data->start_class->flags & ANYOF_EOS);
3481
3482 cl_and(data->start_class, &intrnl);
3483 if (was)
3484 data->start_class->flags |= ANYOF_EOS;
40d049e4 3485 }
1de06328
YO
3486 if (data) {
3487 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3488 pars++;
3489 if (data_fake.flags & SF_HAS_EVAL)
3490 data->flags |= SF_HAS_EVAL;
3491 data->whilem_c = data_fake.whilem_c;
3492 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3493 if (RExC_rx->minlen<*minnextp)
3494 RExC_rx->minlen=*minnextp;
3495 scan_commit(pRExC_state, &data_fake, minnextp);
3496 SvREFCNT_dec(data_fake.last_found);
3497
3498 if ( data_fake.minlen_fixed != minlenp )
3499 {
3500 data->offset_fixed= data_fake.offset_fixed;
3501 data->minlen_fixed= data_fake.minlen_fixed;
3502 data->lookbehind_fixed+= scan->flags;
3503 }
3504 if ( data_fake.minlen_float != minlenp )
3505 {
3506 data->minlen_float= data_fake.minlen_float;
3507 data->offset_float_min=data_fake.offset_float_min;
3508 data->offset_float_max=data_fake.offset_float_max;
3509 data->lookbehind_float+= scan->flags;
3510 }
3511 }
3512 }
3513
653099ff 3514
653099ff 3515 }
1de06328 3516#endif
a0ed51b3
LW
3517 }
3518 else if (OP(scan) == OPEN) {
40d049e4
YO
3519 if (stopparen != (I32)ARG(scan))
3520 pars++;
a0ed51b3 3521 }
cb434fcc 3522 else if (OP(scan) == CLOSE) {
40d049e4
YO
3523 if (stopparen == (I32)ARG(scan)) {
3524 break;
3525 }
eb160463 3526 if ((I32)ARG(scan) == is_par) {
cb434fcc 3527 next = regnext(scan);
c277df42 3528
cb434fcc
IZ
3529 if ( next && (OP(next) != WHILEM) && next < last)
3530 is_par = 0; /* Disable optimization */
3531 }
3532 if (data)
3533 *(data->last_closep) = ARG(scan);
a0ed51b3 3534 }
1a147d38 3535 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3536 /* set the pointer */
3537 I32 paren;
3538 regnode *start;
3539 regnode *end;
1a147d38 3540 if (OP(scan) == GOSUB) {
40d049e4
YO
3541 paren = ARG(scan);
3542 RExC_recurse[ARG2L(scan)] = scan;
3543 start = RExC_open_parens[paren-1];
3544 end = RExC_close_parens[paren-1];
3545 } else {
3546 paren = 0;
3547 start = RExC_rx->program + 1;
3548 end = RExC_opend;
3549 }
3550 assert(start);
3551 assert(end);
3552 if (!recursed) {
3553 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3554 SAVEFREEPV(recursed);
3555 }
3556 if (!PAREN_TEST(recursed,paren+1)) {
3557 I32 deltanext = 0;
3558 PAREN_SET(recursed,paren+1);
3559
3560 DEBUG_PEEP("goto",start,depth);
3561 min += study_chunk(
3562 pRExC_state,
3563 &start,
3564 minlenp,
3565 &deltanext,
3566 end+1,
3567 data,
3568 paren,
3569 recursed,
3570 and_withp,
3571 flags,depth+1);
3572 delta+=deltanext;
3573 if (deltanext == I32_MAX) {
3574 is_inf = is_inf_internal = 1;
3575 delta=deltanext;
3576 }
3577 DEBUG_PEEP("rtrn",end,depth);
3578 PAREN_UNSET(recursed,paren+1);
3579 } else {
3580 if (flags & SCF_DO_SUBSTR) {
3581 scan_commit(pRExC_state,data,minlenp);
3582 data->longest = &(data->longest_float);
3583 }
3584 is_inf = is_inf_internal = 1;
3585 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3586 cl_anything(pRExC_state, data->start_class);
3587 flags &= ~SCF_DO_STCLASS;
3588 }
3589 }
a0ed51b3 3590 else if (OP(scan) == EVAL) {
c277df42
IZ
3591 if (data)
3592 data->flags |= SF_HAS_EVAL;
3593 }
e2e6a0f1 3594 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3595 if (flags & SCF_DO_SUBSTR) {
3596 scan_commit(pRExC_state,data,minlenp);
3597 flags &= ~SCF_DO_SUBSTR;
3598 }
e2e6a0f1
YO
3599 if (data && OP(scan)==ACCEPT) {
3600 data->flags |= SCF_SEEN_ACCEPT;
3601 if (stopmin > min)
3602 stopmin = min;
3603 }
40d049e4
YO
3604 }
3605 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3606 {
0f5d15d6 3607 if (flags & SCF_DO_SUBSTR) {
1de06328 3608 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3609 data->longest = &(data->longest_float);
3610 }
3611 is_inf = is_inf_internal = 1;
653099ff 3612 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3613 cl_anything(pRExC_state, data->start_class);
96776eda 3614 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3615 }
786e8c11 3616#ifdef TRIE_STUDY_OPT
40d049e4 3617#ifdef FULL_TRIE_STUDY
786e8c11
YO
3618 else if (PL_regkind[OP(scan)] == TRIE) {
3619 /* NOTE - There is similar code to this block above for handling
40d049e4 3620 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3621 check there too. */
7f69552c 3622 regnode *trie_node= scan;
786e8c11
YO
3623 regnode *tail= regnext(scan);
3624 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3625 I32 max1 = 0, min1 = I32_MAX;
3626 struct regnode_charclass_class accum;
3627
3628 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3629 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3630 if (flags & SCF_DO_STCLASS)
3631 cl_init_zero(pRExC_state, &accum);
3632
3633 if (!trie->jump) {
3634 min1= trie->minlen;
3635 max1= trie->maxlen;
3636 } else {
3637 const regnode *nextbranch= NULL;
3638 U32 word;
3639
3640 for ( word=1 ; word <= trie->wordcount ; word++)
3641 {
3642 I32 deltanext=0, minnext=0, f = 0, fake;
3643 struct regnode_charclass_class this_class;
3644
3645 data_fake.flags = 0;
3646 if (data) {
3647 data_fake.whilem_c = data->whilem_c;
3648 data_fake.last_closep = data->last_closep;
3649 }
3650 else
3651 data_fake.last_closep = &fake;
3652
3653 if (flags & SCF_DO_STCLASS) {
3654 cl_init(pRExC_state, &this_class);
3655 data_fake.start_class = &this_class;
3656 f = SCF_DO_STCLASS_AND;
3657 }
3658 if (flags & SCF_WHILEM_VISITED_POS)
3659 f |= SCF_WHILEM_VISITED_POS;
3660
3661 if (trie->jump[word]) {
3662 if (!nextbranch)
7f69552c
YO
3663 nextbranch = trie_node + trie->jump[0];
3664 scan= trie_node + trie->jump[word];
786e8c11
YO
3665 /* We go from the jump point to the branch that follows
3666 it. Note this means we need the vestigal unused branches
3667 even though they arent otherwise used.
3668 */
40d049e4
YO
3669 minnext = study_chunk(pRExC_state, &scan, minlenp,
3670 &deltanext, (regnode *)nextbranch, &data_fake,
3671 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3672 }
3673 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3674 nextbranch= regnext((regnode*)nextbranch);
3675
3676 if (min1 > (I32)(minnext + trie->minlen))
3677 min1 = minnext + trie->minlen;
3678 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3679 max1 = minnext + deltanext + trie->maxlen;
3680 if (deltanext == I32_MAX)
3681 is_inf = is_inf_internal = 1;
3682
3683 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3684 pars++;
e2e6a0f1
YO
3685 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3686 if ( stopmin > min + min1)
3687 stopmin = min + min1;
3688 flags &= ~SCF_DO_SUBSTR;
3689 if (data)
3690 data->flags |= SCF_SEEN_ACCEPT;
3691 }
786e8c11
YO
3692 if (data) {
3693 if (data_fake.flags & SF_HAS_EVAL)
3694 data->flags |= SF_HAS_EVAL;
3695 data->whilem_c = data_fake.whilem_c;
3696 }
3697 if (flags & SCF_DO_STCLASS)
3698 cl_or(pRExC_state, &accum, &this_class);
3699 }
3700 }
3701 if (flags & SCF_DO_SUBSTR) {
3702 data->pos_min += min1;
3703 data->pos_delta += max1 - min1;
3704 if (max1 != min1 || is_inf)
3705 data->longest = &(data->longest_float);
3706 }
3707 min += min1;
3708 delta += max1 - min1;
3709 if (flags & SCF_DO_STCLASS_OR) {
3710 cl_or(pRExC_state, data->start_class, &accum);
3711 if (min1) {
40d049e4 3712 cl_and(data->start_class, and_withp);
786e8c11
YO
3713 flags &= ~SCF_DO_STCLASS;
3714 }
3715 }
3716 else if (flags & SCF_DO_STCLASS_AND) {
3717 if (min1) {
3718 cl_and(data->start_class, &accum);
3719 flags &= ~SCF_DO_STCLASS;
3720 }
3721 else {
3722 /* Switch to OR mode: cache the old value of
3723 * data->start_class */
40d049e4
YO
3724 INIT_AND_WITHP;
3725 StructCopy(data->start_class, and_withp,
786e8c11
YO
3726 struct regnode_charclass_class);
3727 flags &= ~SCF_DO_STCLASS_AND;
3728 StructCopy(&accum, data->start_class,
3729 struct regnode_charclass_class);
3730 flags |= SCF_DO_STCLASS_OR;
3731 data->start_class->flags |= ANYOF_EOS;
3732 }
3733 }
3734 scan= tail;
3735 continue;
3736 }
3737#else
3738 else if (PL_regkind[OP(scan)] == TRIE) {
3739 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3740 U8*bang=NULL;
3741
3742 min += trie->minlen;
3743 delta += (trie->maxlen - trie->minlen);
3744 flags &= ~SCF_DO_STCLASS; /* xxx */
3745 if (flags & SCF_DO_SUBSTR) {
1de06328 3746 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3747 data->pos_min += trie->minlen;
3748 data->pos_delta += (trie->maxlen - trie->minlen);
3749 if (trie->maxlen != trie->minlen)
3750 data->longest = &(data->longest_float);
3751 }
3752 if (trie->jump) /* no more substrings -- for now /grr*/
3753 flags &= ~SCF_DO_SUBSTR;
3754 }
3755#endif /* old or new */
3756#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3757 /* Else: zero-length, ignore. */
3758 scan = regnext(scan);
3759 }
3760
3761 finish:
3762 *scanp = scan;
aca2d497 3763 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3764 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3765 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3766 if (is_par > (I32)U8_MAX)
c277df42
IZ
3767 is_par = 0;
3768 if (is_par && pars==1 && data) {
3769 data->flags |= SF_IN_PAR;
3770 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3771 }
3772 else if (pars && data) {
c277df42
IZ
3773 data->flags |= SF_HAS_PAR;
3774 data->flags &= ~SF_IN_PAR;
3775 }
653099ff 3776 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3777 cl_and(data->start_class, and_withp);
786e8c11
YO
3778 if (flags & SCF_TRIE_RESTUDY)
3779 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3780
3781 DEBUG_STUDYDATA(data,depth);
3782
e2e6a0f1 3783 return min < stopmin ? min : stopmin;
c277df42
IZ
3784}
3785
76e3520e 3786STATIC I32
5f66b61c 3787S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3788{
830247a4 3789 if (RExC_rx->data) {
2eb97020 3790 const U32 count = RExC_rx->data->count;
b81d288d 3791 Renewc(RExC_rx->data,
2eb97020 3792 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3793 char, struct reg_data);
2eb97020 3794 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3795 RExC_rx->data->count += n;
a0ed51b3
LW
3796 }
3797 else {
a02a5408 3798 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3799 char, struct reg_data);
a02a5408 3800 Newx(RExC_rx->data->what, n, U8);
830247a4 3801 RExC_rx->data->count = n;
c277df42 3802 }
830247a4
IZ
3803 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3804 return RExC_rx->data->count - n;
c277df42
IZ
3805}
3806
76234dfb 3807#ifndef PERL_IN_XSUB_RE
d88dccdf 3808void
864dbfa3 3809Perl_reginitcolors(pTHX)
d88dccdf 3810{
97aff369 3811 dVAR;
1df70142 3812 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3813 if (s) {
1df70142
AL
3814 char *t = savepv(s);
3815 int i = 0;
3816 PL_colors[0] = t;
d88dccdf 3817 while (++i < 6) {
1df70142
AL
3818 t = strchr(t, '\t');
3819 if (t) {
3820 *t = '\0';
3821 PL_colors[i] = ++t;
d88dccdf
IZ
3822 }
3823 else
1df70142 3824 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3825 }
3826 } else {
1df70142 3827 int i = 0;
b81d288d 3828 while (i < 6)
06b5626a 3829 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3830 }
3831 PL_colorset = 1;
3832}
76234dfb 3833#endif
8615cb43 3834
07be1b83 3835
786e8c11
YO
3836#ifdef TRIE_STUDY_OPT
3837#define CHECK_RESTUDY_GOTO \
3838 if ( \
3839 (data.flags & SCF_TRIE_RESTUDY) \
3840 && ! restudied++ \
3841 ) goto reStudy
3842#else
3843#define CHECK_RESTUDY_GOTO
3844#endif
f9f4320a 3845
a687059c 3846/*
e50aee73 3847 - pregcomp - compile a regular expression into internal code
a687059c
LW
3848 *
3849 * We can't allocate space until we know how big the compiled form will be,
3850 * but we can't compile it (and thus know how big it is) until we've got a
3851 * place to put the code. So we cheat: we compile it twice, once with code
3852 * generation turned off and size counting turned on, and once "for real".
3853 * This also means that we don't allocate space until we are sure that the
3854 * thing really will compile successfully, and we never have to move the
3855 * code and thus invalidate pointers into it. (Note that it has to be in
3856 * one piece because free() must be able to free it all.) [NB: not true in perl]
3857 *
3858 * Beware that the optimization-preparation code in here knows about some
3859 * of the structure of the compiled regexp. [I'll say.]
3860 */
b9b4dddf
YO
3861
3862
3863
f9f4320a 3864#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3865#define RE_ENGINE_PTR &PL_core_reg_engine
3866#else
f9f4320a
YO
3867extern const struct regexp_engine my_reg_engine;
3868#define RE_ENGINE_PTR &my_reg_engine
3869#endif
b9b4dddf
YO
3870/* these make a few things look better, to avoid indentation */
3871#define BEGIN_BLOCK {
f9f4320a
YO
3872#define END_BLOCK }
3873
a687059c 3874regexp *
864dbfa3 3875Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3876{
97aff369 3877 dVAR;
f9f4320a
YO
3878 GET_RE_DEBUG_FLAGS_DECL;
3879 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3880#ifndef PERL_IN_XSUB_RE
3881 BEGIN_BLOCK
f9f4320a
YO
3882 /* Dispatch a request to compile a regexp to correct
3883 regexp engine. */
3884 HV * const table = GvHV(PL_hintgv);
3885 if (table) {
3886 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3887 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3888 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3889 DEBUG_COMPILE_r({
8d8756e7 3890 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3891 SvIV(*ptr));
3892 });
f2f78491 3893 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3894 }
b9b4dddf
YO
3895 }
3896 END_BLOCK
3897#endif
3898 BEGIN_BLOCK
a0d0e21e 3899 register regexp *r;
c277df42 3900 regnode *scan;
c277df42 3901 regnode *first;
a0d0e21e 3902 I32 flags;
a0d0e21e
LW
3903 I32 minlen = 0;
3904 I32 sawplus = 0;
3905 I32 sawopen = 0;
2c2d71f5 3906 scan_data_t data;
830247a4 3907 RExC_state_t RExC_state;
be8e71aa 3908 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3909#ifdef TRIE_STUDY_OPT
3910 int restudied= 0;
3911 RExC_state_t copyRExC_state;
3912#endif
a0d0e21e 3913 if (exp == NULL)
c277df42 3914 FAIL("NULL regexp argument");
a0d0e21e 3915
a5961de5 3916 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3917
5cfc7842 3918 RExC_precomp = exp;
a3621e74 3919 DEBUG_COMPILE_r({
ab3bbdeb
YO
3920 SV *dsv= sv_newmortal();
3921 RE_PV_QUOTED_DECL(s, RExC_utf8,
3922 dsv, RExC_precomp, (xend - exp), 60);
3923 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3924 PL_colors[4],PL_colors[5],s);
a5961de5 3925 });
e2509266 3926 RExC_flags = pm->op_pmflags;
830247a4 3927 RExC_sawback = 0;
bbce6d69 3928
830247a4
IZ
3929 RExC_seen = 0;
3930 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3931 RExC_seen_evals = 0;
3932 RExC_extralen = 0;
c277df42 3933
bbce6d69 3934 /* First pass: determine size, legality. */
830247a4 3935 RExC_parse = exp;
fac92740 3936 RExC_start = exp;
830247a4
IZ
3937 RExC_end = xend;
3938 RExC_naughty = 0;
3939 RExC_npar = 1;
e2e6a0f1 3940 RExC_nestroot = 0;
830247a4
IZ
3941 RExC_size = 0L;
3942 RExC_emit = &PL_regdummy;
3943 RExC_whilem_seen = 0;
fc8cd66c 3944 RExC_charnames = NULL;
40d049e4
YO
3945 RExC_open_parens = NULL;
3946 RExC_close_parens = NULL;
3947 RExC_opend = NULL;
81714fb9 3948 RExC_paren_names = NULL;
40d049e4
YO
3949 RExC_recurse = NULL;
3950 RExC_recurse_count = 0;
81714fb9 3951
85ddcde9
JH
3952#if 0 /* REGC() is (currently) a NOP at the first pass.
3953 * Clever compilers notice this and complain. --jhi */
830247a4 3954 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3955#endif
3dab1dad
YO
3956 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3957 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3958 RExC_precomp = NULL;
a0d0e21e
LW
3959 return(NULL);
3960 }
07be1b83 3961 DEBUG_PARSE_r({
81714fb9
YO
3962 PerlIO_printf(Perl_debug_log,
3963 "Required size %"IVdf" nodes\n"
3964 "Starting second pass (creation)\n",
3965 (IV)RExC_size);
07be1b83
YO
3966 RExC_lastnum=0;
3967 RExC_lastparse=NULL;
3968 });
c277df42
IZ
3969 /* Small enough for pointer-storage convention?
3970 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3971 if (RExC_size >= 0x10000L && RExC_extralen)
3972 RExC_size += RExC_extralen;
c277df42 3973 else
830247a4
IZ
3974 RExC_extralen = 0;
3975 if (RExC_whilem_seen > 15)
3976 RExC_whilem_seen = 15;
a0d0e21e 3977
e2e6a0f1
YO
3978#ifdef DEBUGGING
3979 /* Make room for a sentinel value at the end of the program */
3980 RExC_size++;
3981#endif
3982
f9f4320a
YO
3983 /* Allocate space and zero-initialize. Note, the two step process
3984 of zeroing when in debug mode, thus anything assigned has to
3985 happen after that */
a02a5408 3986 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3987 char, regexp);
a0d0e21e 3988 if (r == NULL)
b45f050a 3989 FAIL("Regexp out of space");
0f79a09d
GS
3990#ifdef DEBUGGING
3991 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3992 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3993#endif
f9f4320a
YO
3994 /* initialization begins here */
3995 r->engine= RE_ENGINE_PTR;
c277df42 3996 r->refcnt = 1;
bbce6d69 3997 r->prelen = xend - exp;
5cfc7842 3998 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3999 r->subbeg = NULL;
f8c7b90f 4000#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4001 r->saved_copy = NULL;
ed252734 4002#endif
cf93c79d 4003 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4004 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4005 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4006
4007 r->substrs = 0; /* Useful during FAIL. */
4008 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4009 r->endp = 0;
81714fb9
YO
4010 r->paren_names = 0;
4011
6bda09f9 4012 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4013 Newxz(RExC_open_parens, RExC_npar,regnode *);
4014 SAVEFREEPV(RExC_open_parens);
4015 Newxz(RExC_close_parens,RExC_npar,regnode *);
4016 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4017 }
4018
4019 /* Useful during FAIL. */
a02a5408 4020 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4021 if (r->offsets) {
2af232bd 4022 r->offsets[0] = RExC_size;
fac92740 4023 }
a3621e74 4024 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4025 "%s %"UVuf" bytes for offset annotations.\n",
4026 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4027 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4028
830247a4 4029 RExC_rx = r;
bbce6d69 4030
4031 /* Second pass: emit code. */
e2509266 4032 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4033 RExC_parse = exp;
4034 RExC_end = xend;
4035 RExC_naughty = 0;
4036 RExC_npar = 1;
fac92740 4037 RExC_emit_start = r->program;
830247a4 4038 RExC_emit = r->program;
e2e6a0f1
YO
4039#ifdef DEBUGGING
4040 /* put a sentinal on the end of the program so we can check for
4041 overwrites */
4042 r->program[RExC_size].type = 255;
4043#endif
2cd61cdb 4044 /* Store the count of eval-groups for security checks: */
786e8c11 4045 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4046 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4047 r->data = 0;
3dab1dad 4048 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4049 return(NULL);
6bda09f9 4050
07be1b83
YO
4051 /* XXXX To minimize changes to RE engine we always allocate
4052 3-units-long substrs field. */
4053 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4054 if (RExC_recurse_count) {
4055 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4056 SAVEFREEPV(RExC_recurse);
4057 }
a0d0e21e 4058
07be1b83 4059reStudy:
1de06328 4060 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4061 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4062
07be1b83
YO
4063#ifdef TRIE_STUDY_OPT
4064 if ( restudied ) {
4065 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4066 RExC_state=copyRExC_state;
1de06328 4067 if (data.last_found) {
07be1b83 4068 SvREFCNT_dec(data.longest_fixed);
07be1b83 4069 SvREFCNT_dec(data.longest_float);
07be1b83 4070 SvREFCNT_dec(data.last_found);
1de06328 4071 }
40d049e4 4072 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4073 } else {
40d049e4 4074 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83
YO
4075 copyRExC_state=RExC_state;
4076 }
40d049e4
YO
4077#else
4078 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4079#endif
fc8cd66c 4080
a0d0e21e 4081 /* Dig out information for optimizations. */
cf93c79d 4082 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4083 pm->op_pmflags = RExC_flags;
a0ed51b3 4084 if (UTF)
5ff6fc6d 4085 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4086 r->regstclass = NULL;
830247a4 4087 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4088 r->reganch |= ROPT_NAUGHTY;
c277df42 4089 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4090
1de06328
YO
4091 /* testing for BRANCH here tells us whether there is "must appear"
4092 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4093 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4094 I32 fake;
c5254dd6 4095 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4096 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4097 int stclass_flag;
07be1b83 4098 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4099
4100 first = scan;
c277df42 4101 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4102 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4103 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4104 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4105 /* for now we can't handle lookbehind IFMATCH*/
4106 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4107 (OP(first) == PLUS) ||
4108 (OP(first) == MINMOD) ||
653099ff 4109 /* An {n,m} with n>0 */
07be1b83
YO
4110 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4111 {
786e8c11 4112
a0d0e21e
LW
4113 if (OP(first) == PLUS)
4114 sawplus = 1;
4115 else
3dab1dad 4116 first += regarglen[OP(first)];
07be1b83
YO
4117 if (OP(first) == IFMATCH) {
4118 first = NEXTOPER(first);
4119 first += EXTRA_STEP_2ARGS;
7c167cea 4120 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4121 first = NEXTOPER(first);
a687059c
LW
4122 }
4123
a0d0e21e
LW
4124 /* Starting-point info. */
4125 again:
786e8c11 4126 DEBUG_PEEP("first:",first,0);
07be1b83 4127 /* Ignore EXACT as we deal with it later. */
3dab1dad 4128 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4129 if (OP(first) == EXACT)
6f207bd3 4130 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4131 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4132 r->regstclass = first;
4133 }
07be1b83 4134#ifdef TRIE_STCLASS
786e8c11 4135 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4136 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4137 {
786e8c11 4138 regnode *trie_op;
07be1b83 4139 /* this can happen only on restudy */
786e8c11
YO
4140 if ( OP(first) == TRIE ) {
4141 struct regnode_1 *trieop;
4142 Newxz(trieop,1,struct regnode_1);
4143 StructCopy(first,trieop,struct regnode_1);
4144 trie_op=(regnode *)trieop;
4145 } else {
4146 struct regnode_charclass *trieop;
4147 Newxz(trieop,1,struct regnode_charclass);
4148 StructCopy(first,trieop,struct regnode_charclass);
4149 trie_op=(regnode *)trieop;
4150 }
1de06328 4151 OP(trie_op)+=2;
786e8c11
YO
4152 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4153 r->regstclass = trie_op;
07be1b83
YO
4154 }
4155#endif
bfed75c6 4156 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4157 r->regstclass = first;
3dab1dad
YO
4158 else if (PL_regkind[OP(first)] == BOUND ||
4159 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4160 r->regstclass = first;
3dab1dad 4161 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4162 r->reganch |= (OP(first) == MBOL
4163 ? ROPT_ANCH_MBOL
4164 : (OP(first) == SBOL
4165 ? ROPT_ANCH_SBOL
4166 : ROPT_ANCH_BOL));
a0d0e21e 4167 first = NEXTOPER(first);
774d564b 4168 goto again;
4169 }
4170 else if (OP(first) == GPOS) {
4171 r->reganch |= ROPT_ANCH_GPOS;
4172 first = NEXTOPER(first);
4173 goto again;
a0d0e21e 4174 }
e09294f4 4175 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4176 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4177 !(r->reganch & ROPT_ANCH) )
4178 {
4179 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4180 const int type =
4181 (OP(NEXTOPER(first)) == REG_ANY)
4182 ? ROPT_ANCH_MBOL
4183 : ROPT_ANCH_SBOL;
cad2e5aa 4184 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4185 first = NEXTOPER(first);
774d564b 4186 goto again;
a0d0e21e 4187 }
b81d288d 4188 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4189 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4190 /* x+ must match at the 1st pos of run of x's */
4191 r->reganch |= ROPT_SKIP;
a0d0e21e 4192
c277df42 4193 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4194#ifdef TRIE_STUDY_OPT
81714fb9 4195 DEBUG_PARSE_r(
be8e71aa
YO
4196 if (!restudied)
4197 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4198 (IV)(first - scan + 1))
4199 );
4200#else
81714fb9 4201 DEBUG_PARSE_r(
be8e71aa
YO
4202 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4203 (IV)(first - scan + 1))
4204 );
4205#endif
4206
4207
a0d0e21e
LW
4208 /*
4209 * If there's something expensive in the r.e., find the
4210 * longest literal string that must appear and make it the
4211 * regmust. Resolve ties in favor of later strings, since
4212 * the regstart check works with the beginning of the r.e.
4213 * and avoiding duplication strengthens checking. Not a
4214 * strong reason, but sufficient in the absence of others.
4215 * [Now we resolve ties in favor of the earlier string if
c277df42 4216 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4217 * earlier string may buy us something the later one won't.]
4218 */
a0d0e21e 4219 minlen = 0;
a687059c 4220
396482e1
GA
4221 data.longest_fixed = newSVpvs("");
4222 data.longest_float = newSVpvs("");
4223 data.last_found = newSVpvs("");
c277df42
IZ
4224 data.longest = &(data.longest_fixed);
4225 first = scan;
653099ff 4226 if (!r->regstclass) {
830247a4 4227 cl_init(pRExC_state, &ch_class);
653099ff
GS
4228 data.start_class = &ch_class;
4229 stclass_flag = SCF_DO_STCLASS_AND;
4230 } else /* XXXX Check for BOUND? */
4231 stclass_flag = 0;
cb434fcc 4232 data.last_closep = &last_close;
653099ff 4233
1de06328 4234 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4235 &data, -1, NULL, NULL,
4236 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4237
07be1b83 4238
786e8c11
YO
4239 CHECK_RESTUDY_GOTO;
4240
4241
830247a4 4242 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4243 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4244 && !RExC_seen_zerolen
4245 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4246 r->reganch |= ROPT_CHECK_ALL;
1de06328 4247 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4248 SvREFCNT_dec(data.last_found);
4249
1de06328
YO
4250 /* Note that code very similar to this but for anchored string
4251 follows immediately below, changes may need to be made to both.
4252 Be careful.
4253 */
a0ed51b3 4254 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4255 if (longest_float_length
c277df42
IZ
4256 || (data.flags & SF_FL_BEFORE_EOL
4257 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4258 || (RExC_flags & PMf_MULTILINE))))
4259 {
1182767e 4260 I32 t,ml;
cf93c79d 4261
1de06328 4262 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4263 && data.offset_fixed == data.offset_float_min
4264 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4265 goto remove_float; /* As in (a)+. */
4266
1de06328
YO
4267 /* copy the information about the longest float from the reg_scan_data
4268 over to the program. */
33b8afdf
JH
4269 if (SvUTF8(data.longest_float)) {
4270 r->float_utf8 = data.longest_float;
c445ea15 4271 r->float_substr = NULL;
33b8afdf
JH
4272 } else {
4273 r->float_substr = data.longest_float;
c445ea15 4274 r->float_utf8 = NULL;
33b8afdf 4275 }
1de06328
YO
4276 /* float_end_shift is how many chars that must be matched that
4277 follow this item. We calculate it ahead of time as once the
4278 lookbehind offset is added in we lose the ability to correctly
4279 calculate it.*/
4280 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4281 : (I32)longest_float_length;
1de06328
YO
4282 r->float_end_shift = ml - data.offset_float_min
4283 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4284 + data.lookbehind_float;
4285 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4286 r->float_max_offset = data.offset_float_max;
1182767e 4287 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4288 r->float_max_offset -= data.lookbehind_float;
4289
cf93c79d
IZ
4290 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4291 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4292 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4293 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4294 }
4295 else {
aca2d497 4296 remove_float:
c445ea15 4297 r->float_substr = r->float_utf8 = NULL;
c277df42 4298 SvREFCNT_dec(data.longest_float);
c5254dd6 4299 longest_float_length = 0;
a0d0e21e 4300 }
c277df42 4301
1de06328
YO
4302 /* Note that code very similar to this but for floating string
4303 is immediately above, changes may need to be made to both.
4304 Be careful.
4305 */
a0ed51b3 4306 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4307 if (longest_fixed_length
c277df42
IZ
4308 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4309 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4310 || (RExC_flags & PMf_MULTILINE))))
4311 {
1182767e 4312 I32 t,ml;
cf93c79d 4313
1de06328
YO
4314 /* copy the information about the longest fixed
4315 from the reg_scan_data over to the program. */
33b8afdf
JH
4316 if (SvUTF8(data.longest_fixed)) {
4317 r->anchored_utf8 = data.longest_fixed;
c445ea15 4318 r->anchored_substr = NULL;
33b8afdf
JH
4319 } else {
4320 r->anchored_substr = data.longest_fixed;
c445ea15 4321 r->anchored_utf8 = NULL;
33b8afdf 4322 }
1de06328
YO
4323 /* fixed_end_shift is how many chars that must be matched that
4324 follow this item. We calculate it ahead of time as once the
4325 lookbehind offset is added in we lose the ability to correctly
4326 calculate it.*/
4327 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4328 : (I32)longest_fixed_length;
1de06328
YO
4329 r->anchored_end_shift = ml - data.offset_fixed
4330 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4331 + data.lookbehind_fixed;
4332 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4333
cf93c79d
IZ
4334 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4335 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4336 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4337 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4338 }
4339 else {
c445ea15 4340 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4341 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4342 longest_fixed_length = 0;
a0d0e21e 4343 }
b81d288d 4344 if (r->regstclass
ffc61ed2 4345 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4346 r->regstclass = NULL;
33b8afdf
JH
4347 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4348 && stclass_flag
653099ff 4349 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4350 && !cl_is_anything(data.start_class))
4351 {
1df70142 4352 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4353
a02a5408 4354 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4355 struct regnode_charclass_class);
4356 StructCopy(data.start_class,
830247a4 4357 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4358 struct regnode_charclass_class);
830247a4 4359 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4360 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4361 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4362 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4363 PerlIO_printf(Perl_debug_log,
a0288114 4364 "synthetic stclass \"%s\".\n",
3f7c398e 4365 SvPVX_const(sv));});
653099ff 4366 }
c277df42
IZ
4367
4368 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4369 if (longest_fixed_length > longest_float_length) {
1de06328 4370 r->check_end_shift = r->anchored_end_shift;
c277df42 4371 r->check_substr = r->anchored_substr;
33b8afdf 4372 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4373 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4374 if (r->reganch & ROPT_ANCH_SINGLE)
4375 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4376 }
4377 else {
1de06328 4378 r->check_end_shift = r->float_end_shift;
c277df42 4379 r->check_substr = r->float_substr;
33b8afdf 4380 r->check_utf8 = r->float_utf8;
1de06328
YO
4381 r->check_offset_min = r->float_min_offset;
4382 r->check_offset_max = r->float_max_offset;
a0d0e21e 4383 }
30382c73
IZ
4384 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4385 This should be changed ASAP! */
33b8afdf 4386 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4387 r->reganch |= RE_USE_INTUIT;
33b8afdf 4388 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4389 r->reganch |= RE_INTUIT_TAIL;
4390 }
1de06328
YO
4391 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4392 if ( (STRLEN)minlen < longest_float_length )
4393 minlen= longest_float_length;
4394 if ( (STRLEN)minlen < longest_fixed_length )
4395 minlen= longest_fixed_length;
4396 */
a0ed51b3
LW
4397 }
4398 else {
c277df42
IZ
4399 /* Several toplevels. Best we can is to set minlen. */
4400 I32 fake;
653099ff 4401 struct regnode_charclass_class ch_class;
cb434fcc 4402 I32 last_close = 0;
c277df42 4403
a3621e74 4404 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4405
c277df42 4406 scan = r->program + 1;
830247a4 4407 cl_init(pRExC_state, &ch_class);
653099ff 4408 data.start_class = &ch_class;
cb434fcc 4409 data.last_closep = &last_close;
07be1b83 4410
1de06328 4411 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4412 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
07be1b83 4413
786e8c11 4414 CHECK_RESTUDY_GOTO;
07be1b83 4415
33b8afdf 4416 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4417 = r->float_substr = r->float_utf8 = NULL;
653099ff 4418 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4419 && !cl_is_anything(data.start_class))
4420 {
1df70142 4421 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4422
a02a5408 4423 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4424 struct regnode_charclass_class);
4425 StructCopy(data.start_class,
830247a4 4426 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4427 struct regnode_charclass_class);
830247a4 4428 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4429 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4430 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4431 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4432 PerlIO_printf(Perl_debug_log,
a0288114 4433 "synthetic stclass \"%s\".\n",
3f7c398e 4434 SvPVX_const(sv));});
653099ff 4435 }
a0d0e21e
LW
4436 }
4437
1de06328
YO
4438 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4439 the "real" pattern. */
4440 if (r->minlen < minlen)
4441 r->minlen = minlen;
4442
b81d288d 4443 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4444 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4445 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4446 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4447 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4448 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4449 if (RExC_seen & REG_SEEN_CANY)
4450 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4451 if (RExC_seen & REG_SEEN_VERBARG)
4452 r->reganch |= ROPT_VERBARG_SEEN;
81714fb9
YO
4453 if (RExC_paren_names)
4454 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4455 else
4456 r->paren_names = NULL;
4457
40d049e4
YO
4458 if (RExC_recurse_count) {
4459 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4460 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4461 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4462 }
4463 }
a02a5408
JC
4464 Newxz(r->startp, RExC_npar, I32);
4465 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4466
f2278c82 4467 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4468 DEBUG_DUMP_r({
4469 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4470 regdump(r);
4471 });
8e9a8a48
YO
4472 DEBUG_OFFSETS_r(if (r->offsets) {
4473 const U32 len = r->offsets[0];
4474 U32 i;
4475 GET_RE_DEBUG_FLAGS_DECL;
4476 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4477 for (i = 1; i <= len; i++) {
4478 if (r->offsets[i*2-1] || r->offsets[i*2])
4479 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4480 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4481 }
4482 PerlIO_printf(Perl_debug_log, "\n");
4483 });
a0d0e21e 4484 return(r);
f9f4320a 4485 END_BLOCK
a687059c
LW
4486}
4487
f9f4320a
YO
4488#undef CORE_ONLY_BLOCK
4489#undef END_BLOCK
4490#undef RE_ENGINE_PTR
3dab1dad 4491
9af228c6 4492#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4493SV*
4494Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4495{
4496 I32 parno = 0; /* no match */
4497 if (PL_curpm) {
4498 const REGEXP * const rx = PM_GETRE(PL_curpm);
4499 if (rx && rx->paren_names) {
4500 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4501 if (he_str) {
4502 IV i;
4503 SV* sv_dat=HeVAL(he_str);
4504 I32 *nums=(I32*)SvPVX(sv_dat);
4505 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4506 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4507 rx->endp[nums[i]] != -1)
4508 {
4509 parno = nums[i];
4510 break;
4511 }
4512 }
4513 }
4514 }
4515 }
4516 if ( !parno ) {
4517 return 0;
4518 } else {
4519 GV *gv_paren;
4520 SV *sv= sv_newmortal();
4521 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4522 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4523 return GvSVn(gv_paren);
4524 }
4525}
9af228c6 4526#endif
0a4db386 4527
894be9b7 4528/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4529 * If flags is REG_RSN_RETURN_NULL returns null.
4530 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4531 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4532 * to the parsed name as looked up in the RExC_paren_names hash.
4533 * If there is an error throws a vFAIL().. type exception.
894be9b7 4534 */
0a4db386
YO
4535
4536#define REG_RSN_RETURN_NULL 0
4537#define REG_RSN_RETURN_NAME 1
4538#define REG_RSN_RETURN_DATA 2
4539
894be9b7
YO
4540STATIC SV*
4541S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4542 char *name_start = RExC_parse;
0a4db386 4543 if ( UTF ) {
894be9b7 4544 STRLEN numlen;
0a4db386
YO
4545 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4546 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4547 {
4548 RExC_parse += numlen;
4549 }
4550 } else {
4551 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4552 RExC_parse++;
4553 }
0a4db386
YO
4554 if ( flags ) {
4555 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4556 (int)(RExC_parse - name_start)));
894be9b7 4557 if (UTF)
0a4db386
YO
4558 SvUTF8_on(sv_name);
4559 if ( flags == REG_RSN_RETURN_NAME)
4560 return sv_name;
4561 else if (flags==REG_RSN_RETURN_DATA) {
4562 HE *he_str = NULL;
4563 SV *sv_dat = NULL;
4564 if ( ! sv_name ) /* should not happen*/
4565 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4566 if (RExC_paren_names)
4567 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4568 if ( he_str )
4569 sv_dat = HeVAL(he_str);
4570 if ( ! sv_dat )
4571 vFAIL("Reference to nonexistent named group");
4572 return sv_dat;
4573 }
4574 else {
4575 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4576 }
4577 /* NOT REACHED */
894be9b7 4578 }
0a4db386 4579 return NULL;
894be9b7
YO
4580}
4581
3dab1dad
YO
4582#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4583 int rem=(int)(RExC_end - RExC_parse); \
4584 int cut; \
4585 int num; \
4586 int iscut=0; \
4587 if (rem>10) { \
4588 rem=10; \
4589 iscut=1; \
4590 } \
4591 cut=10-rem; \
4592 if (RExC_lastparse!=RExC_parse) \
4593 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4594 rem, RExC_parse, \
4595 cut + 4, \
4596 iscut ? "..." : "<" \
4597 ); \
4598 else \
4599 PerlIO_printf(Perl_debug_log,"%16s",""); \
4600 \
4601 if (SIZE_ONLY) \
4602 num=RExC_size; \
4603 else \
4604 num=REG_NODE_NUM(RExC_emit); \
4605 if (RExC_lastnum!=num) \
0a4db386 4606 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4607 else \
0a4db386 4608 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4609 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4610 (int)((depth*2)), "", \
3dab1dad
YO
4611 (funcname) \
4612 ); \
4613 RExC_lastnum=num; \
4614 RExC_lastparse=RExC_parse; \
4615})
4616
07be1b83
YO
4617
4618
3dab1dad
YO
4619#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4620 DEBUG_PARSE_MSG((funcname)); \
4621 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4622})
6bda09f9
YO
4623#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4624 DEBUG_PARSE_MSG((funcname)); \
4625 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4626})
a687059c
LW
4627/*
4628 - reg - regular expression, i.e. main body or parenthesized thing
4629 *
4630 * Caller must absorb opening parenthesis.
4631 *
4632 * Combining parenthesis handling with the base level of regular expression
4633 * is a trifle forced, but the need to tie the tails of the branches to what
4634 * follows makes it hard to avoid.
4635 */
07be1b83
YO
4636#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4637#ifdef DEBUGGING
4638#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4639#else
4640#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4641#endif
3dab1dad 4642
e2e6a0f1
YO
4643/* this idea is borrowed from STR_WITH_LEN in handy.h */
4644#define CHECK_WORD(s,v,l) \
4645 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4646
76e3520e 4647STATIC regnode *
3dab1dad 4648S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4649 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4650{
27da23d5 4651 dVAR;
c277df42
IZ
4652 register regnode *ret; /* Will be the head of the group. */
4653 register regnode *br;
4654 register regnode *lastbr;
cbbf8932 4655 register regnode *ender = NULL;
a0d0e21e 4656 register I32 parno = 0;
cbbf8932
AL
4657 I32 flags;
4658 const I32 oregflags = RExC_flags;
6136c704
AL
4659 bool have_branch = 0;
4660 bool is_open = 0;
9d1d55b5
JP
4661
4662 /* for (?g), (?gc), and (?o) warnings; warning
4663 about (?c) will warn about (?g) -- japhy */
4664
6136c704
AL
4665#define WASTED_O 0x01
4666#define WASTED_G 0x02
4667#define WASTED_C 0x04
4668#define WASTED_GC (0x02|0x04)
cbbf8932 4669 I32 wastedflags = 0x00;
9d1d55b5 4670
fac92740 4671 char * parse_start = RExC_parse; /* MJD */
a28509cc 4672 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4673
3dab1dad
YO
4674 GET_RE_DEBUG_FLAGS_DECL;
4675 DEBUG_PARSE("reg ");
4676
4677
821b33a5 4678 *flagp = 0; /* Tentatively. */
a0d0e21e 4679
9d1d55b5 4680
a0d0e21e
LW
4681 /* Make an OPEN node, if parenthesized. */
4682 if (paren) {
e2e6a0f1
YO
4683 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4684 char *start_verb = RExC_parse;
4685 STRLEN verb_len = 0;
4686 char *start_arg = NULL;
4687 unsigned char op = 0;
4688 int argok = 1;
4689 int internal_argval = 0; /* internal_argval is only useful if !argok */
4690 while ( *RExC_parse && *RExC_parse != ')' ) {
4691 if ( *RExC_parse == ':' ) {
4692 start_arg = RExC_parse + 1;
4693 break;
4694 }
4695 RExC_parse++;
4696 }
4697 ++start_verb;
4698 verb_len = RExC_parse - start_verb;
4699 if ( start_arg ) {
4700 RExC_parse++;
4701 while ( *RExC_parse && *RExC_parse != ')' )
4702 RExC_parse++;
4703 if ( *RExC_parse != ')' )
4704 vFAIL("Unterminated verb pattern argument");
4705 if ( RExC_parse == start_arg )
4706 start_arg = NULL;
4707 } else {
4708 if ( *RExC_parse != ')' )
4709 vFAIL("Unterminated verb pattern");
4710 }
4711 switch ( *start_verb ) {
4712 case 'A': /* (*ACCEPT) */
4713 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4714 op = ACCEPT;
4715 internal_argval = RExC_nestroot;
4716 }
4717 break;
4718 case 'C': /* (*COMMIT) */
4719 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4720 op = COMMIT;
4721 else if ( CHECK_WORD("CUT",start_verb,verb_len) )
4722 op = CUT;
4723 break;
4724 case 'F': /* (*FAIL) */
4725 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4726 op = OPFAIL;
4727 argok = 0;
4728 }
4729 break;
4730 case 'M':
4731 if ( CHECK_WORD("MARK",start_verb,verb_len) )
4732 op = MARKPOINT;
4733 break;
4734 case 'N': /* (*NOMATCH) */
4735 if ( CHECK_WORD("NOMATCH",start_verb,verb_len) )
4736 op = NOMATCH;
4737 break;
4738 }
4739 if ( ! op ) {
4740 RExC_parse++;
4741 vFAIL3("Unknown verb pattern '%.*s'",
4742 verb_len, start_verb);
4743 }
4744 if ( argok ) {
4745 if ( start_arg && internal_argval ) {
4746 vFAIL3("Verb pattern '%.*s' may not have an argument",
4747 verb_len, start_verb);
4748 } else if ( argok < 0 && !start_arg ) {
4749 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4750 verb_len, start_verb);
4751 } else {
4752 ret = reganode(pRExC_state, op, internal_argval);
4753 if ( ! internal_argval && ! SIZE_ONLY ) {
4754 if (start_arg) {
4755 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4756 ARG(ret) = add_data( pRExC_state, 1, "S" );
4757 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4758 ret->flags = 0;
4759 } else {
4760 ret->flags = 1;
4761 }
4762 }
4763 }
4764 if (!internal_argval)
4765 RExC_seen |= REG_SEEN_VERBARG;
4766 } else if ( start_arg ) {
4767 vFAIL3("Verb pattern '%.*s' may not have an argument",
4768 verb_len, start_verb);
4769 } else {
4770 ret = reg_node(pRExC_state, op);
4771 }
4772 nextchar(pRExC_state);
4773 return ret;
4774 } else
fac92740 4775 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4776 U32 posflags = 0, negflags = 0;
4777 U32 *flagsp = &posflags;
6136c704 4778 bool is_logical = 0;
a28509cc 4779 const char * const seqstart = RExC_parse;
ca9dfc88 4780
830247a4
IZ
4781 RExC_parse++;
4782 paren = *RExC_parse++;
c277df42 4783 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4784 switch (paren) {
894be9b7 4785
fac92740 4786 case '<': /* (?<...) */
b81d288d 4787 if (*RExC_parse == '!')
c277df42 4788 paren = ',';
0a4db386
YO
4789 else if (*RExC_parse != '=')
4790 { /* (?<...>) */
81714fb9 4791 char *name_start;
894be9b7 4792 SV *svname;
81714fb9
YO
4793 paren= '>';
4794 case '\'': /* (?'...') */
4795 name_start= RExC_parse;
0a4db386
YO
4796 svname = reg_scan_name(pRExC_state,
4797 SIZE_ONLY ? /* reverse test from the others */
4798 REG_RSN_RETURN_NAME :
4799 REG_RSN_RETURN_NULL);
81714fb9
YO
4800 if (RExC_parse == name_start)
4801 goto unknown;
4802 if (*RExC_parse != paren)
4803 vFAIL2("Sequence (?%c... not terminated",
4804 paren=='>' ? '<' : paren);
4805 if (SIZE_ONLY) {
e62cc96a
YO
4806 HE *he_str;
4807 SV *sv_dat = NULL;
894be9b7
YO
4808 if (!svname) /* shouldnt happen */
4809 Perl_croak(aTHX_
4810 "panic: reg_scan_name returned NULL");
81714fb9
YO
4811 if (!RExC_paren_names) {
4812 RExC_paren_names= newHV();
4813 sv_2mortal((SV*)RExC_paren_names);
4814 }
4815 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4816 if ( he_str )
81714fb9 4817 sv_dat = HeVAL(he_str);
e62cc96a 4818 if ( ! sv_dat ) {
81714fb9 4819 /* croak baby croak */
e62cc96a
YO
4820 Perl_croak(aTHX_
4821 "panic: paren_name hash element allocation failed");
4822 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4823 IV count=SvIV(sv_dat);
4824 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4825 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4826 pv[count]=RExC_npar;
4827 SvIVX(sv_dat)++;
4828 } else {
4829 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4830 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4831 SvIOK_on(sv_dat);
4832 SvIVX(sv_dat)= 1;
e62cc96a
YO
4833 }
4834
81714fb9
YO
4835 /*sv_dump(sv_dat);*/
4836 }
4837 nextchar(pRExC_state);
4838 paren = 1;
4839 goto capturing_parens;
4840 }
4841 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4842 RExC_parse++;
fac92740
MJD
4843 case '=': /* (?=...) */
4844 case '!': /* (?!...) */
830247a4 4845 RExC_seen_zerolen++;
e2e6a0f1
YO
4846 if (*RExC_parse == ')') {
4847 ret=reg_node(pRExC_state, OPFAIL);
4848 nextchar(pRExC_state);
4849 return ret;
4850 }
fac92740
MJD
4851 case ':': /* (?:...) */
4852 case '>': /* (?>...) */
a0d0e21e 4853 break;
fac92740
MJD
4854 case '$': /* (?$...) */
4855 case '@': /* (?@...) */
8615cb43 4856 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4857 break;
fac92740 4858 case '#': /* (?#...) */
830247a4
IZ
4859 while (*RExC_parse && *RExC_parse != ')')
4860 RExC_parse++;
4861 if (*RExC_parse != ')')
c277df42 4862 FAIL("Sequence (?#... not terminated");
830247a4 4863 nextchar(pRExC_state);
a0d0e21e
LW
4864 *flagp = TRYAGAIN;
4865 return NULL;
894be9b7
YO
4866 case '0' : /* (?0) */
4867 case 'R' : /* (?R) */
4868 if (*RExC_parse != ')')
6bda09f9 4869 FAIL("Sequence (?R) not terminated");
1a147d38 4870 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4871 nextchar(pRExC_state);
4872 return ret;
4873 /*notreached*/
894be9b7
YO
4874 { /* named and numeric backreferences */
4875 I32 num;
4876 char * parse_start;
4877 case '&': /* (?&NAME) */
4878 parse_start = RExC_parse - 1;
4879 {
0a4db386
YO
4880 SV *sv_dat = reg_scan_name(pRExC_state,
4881 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4882 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4883 }
4884 goto gen_recurse_regop;
4885 /* NOT REACHED */
6bda09f9
YO
4886 case '1': case '2': case '3': case '4': /* (?1) */
4887 case '5': case '6': case '7': case '8': case '9':
4888 RExC_parse--;
894be9b7
YO
4889 num = atoi(RExC_parse);
4890 parse_start = RExC_parse - 1; /* MJD */
6bda09f9
YO
4891 while (isDIGIT(*RExC_parse))
4892 RExC_parse++;
4893 if (*RExC_parse!=')')
4894 vFAIL("Expecting close bracket");
894be9b7
YO
4895
4896 gen_recurse_regop:
1a147d38 4897 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4898 if (!SIZE_ONLY) {
4899 if (num > (I32)RExC_rx->nparens) {
4900 RExC_parse++;
4901 vFAIL("Reference to nonexistent group");
4902 }
40d049e4 4903 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4904 RExC_emit++;
226de585 4905 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4906 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4907 } else {
6bda09f9 4908 RExC_size++;
6bda09f9 4909 }
0a4db386 4910 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4911 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4912 Set_Node_Offset(ret, parse_start); /* MJD */
4913
6bda09f9
YO
4914 nextchar(pRExC_state);
4915 return ret;
894be9b7
YO
4916 } /* named and numeric backreferences */
4917 /* NOT REACHED */
4918
fac92740 4919 case 'p': /* (?p...) */
9014280d 4920 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4921 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4922 /* FALL THROUGH*/
fac92740 4923 case '?': /* (??...) */
6136c704 4924 is_logical = 1;
438a3801
YST
4925 if (*RExC_parse != '{')
4926 goto unknown;
830247a4 4927 paren = *RExC_parse++;
0f5d15d6 4928 /* FALL THROUGH */
fac92740 4929 case '{': /* (?{...}) */
c277df42 4930 {
c277df42
IZ
4931 I32 count = 1, n = 0;
4932 char c;
830247a4 4933 char *s = RExC_parse;
c277df42 4934
830247a4
IZ
4935 RExC_seen_zerolen++;
4936 RExC_seen |= REG_SEEN_EVAL;
4937 while (count && (c = *RExC_parse)) {
6136c704
AL
4938 if (c == '\\') {
4939 if (RExC_parse[1])
4940 RExC_parse++;
4941 }
b81d288d 4942 else if (c == '{')
c277df42 4943 count++;
b81d288d 4944 else if (c == '}')
c277df42 4945 count--;
830247a4 4946 RExC_parse++;
c277df42 4947 }
6136c704 4948 if (*RExC_parse != ')') {
b81d288d 4949 RExC_parse = s;
b45f050a
JF
4950 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4951 }
c277df42 4952 if (!SIZE_ONLY) {
f3548bdc 4953 PAD *pad;
6136c704
AL
4954 OP_4tree *sop, *rop;
4955 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4956
569233ed
SB
4957 ENTER;
4958 Perl_save_re_context(aTHX);
f3548bdc 4959 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4960 sop->op_private |= OPpREFCOUNTED;
4961 /* re_dup will OpREFCNT_inc */
4962 OpREFCNT_set(sop, 1);
569233ed 4963 LEAVE;
c277df42 4964
830247a4
IZ
4965 n = add_data(pRExC_state, 3, "nop");
4966 RExC_rx->data->data[n] = (void*)rop;
4967 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4968 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4969 SvREFCNT_dec(sv);
a0ed51b3 4970 }
e24b16f9 4971 else { /* First pass */
830247a4 4972 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4973 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4974 /* No compiled RE interpolated, has runtime
4975 components ===> unsafe. */
4976 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4977 if (PL_tainting && PL_tainted)
cc6b7395 4978 FAIL("Eval-group in insecure regular expression");
54df2634 4979#if PERL_VERSION > 8
923e4eb5 4980 if (IN_PERL_COMPILETIME)
b5c19bd7 4981 PL_cv_has_eval = 1;
54df2634 4982#endif
c277df42 4983 }
b5c19bd7 4984
830247a4 4985 nextchar(pRExC_state);
6136c704 4986 if (is_logical) {
830247a4 4987 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4988 if (!SIZE_ONLY)
4989 ret->flags = 2;
3dab1dad 4990 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4991 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4992 return ret;
4993 }
ccb2c380
MP
4994 ret = reganode(pRExC_state, EVAL, n);
4995 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4996 Set_Node_Offset(ret, parse_start);
4997 return ret;
c277df42 4998 }
fac92740 4999 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5000 {
0a4db386 5001 int is_define= 0;
fac92740 5002 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5003 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5004 || RExC_parse[1] == '<'
830247a4 5005 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5006 I32 flag;
5007
830247a4 5008 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5009 if (!SIZE_ONLY)
5010 ret->flags = 1;
3dab1dad 5011 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5012 goto insert_if;
b81d288d 5013 }
a0ed51b3 5014 }
0a4db386
YO
5015 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5016 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5017 {
5018 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5019 char *name_start= RExC_parse++;
5020 I32 num = 0;
5021 SV *sv_dat=reg_scan_name(pRExC_state,
5022 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5023 if (RExC_parse == name_start || *RExC_parse != ch)
5024 vFAIL2("Sequence (?(%c... not terminated",
5025 (ch == '>' ? '<' : ch));
5026 RExC_parse++;
5027 if (!SIZE_ONLY) {
5028 num = add_data( pRExC_state, 1, "S" );
5029 RExC_rx->data->data[num]=(void*)sv_dat;
5030 SvREFCNT_inc(sv_dat);
5031 }
5032 ret = reganode(pRExC_state,NGROUPP,num);
5033 goto insert_if_check_paren;
5034 }
5035 else if (RExC_parse[0] == 'D' &&
5036 RExC_parse[1] == 'E' &&
5037 RExC_parse[2] == 'F' &&
5038 RExC_parse[3] == 'I' &&
5039 RExC_parse[4] == 'N' &&
5040 RExC_parse[5] == 'E')
5041 {
5042 ret = reganode(pRExC_state,DEFINEP,0);
5043 RExC_parse +=6 ;
5044 is_define = 1;
5045 goto insert_if_check_paren;
5046 }
5047 else if (RExC_parse[0] == 'R') {
5048 RExC_parse++;
5049 parno = 0;
5050 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5051 parno = atoi(RExC_parse++);
5052 while (isDIGIT(*RExC_parse))
5053 RExC_parse++;
5054 } else if (RExC_parse[0] == '&') {
5055 SV *sv_dat;
5056 RExC_parse++;
5057 sv_dat = reg_scan_name(pRExC_state,
5058 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5059 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5060 }
1a147d38 5061 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5062 goto insert_if_check_paren;
5063 }
830247a4 5064 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5065 /* (?(1)...) */
6136c704 5066 char c;
830247a4 5067 parno = atoi(RExC_parse++);
c277df42 5068
830247a4
IZ
5069 while (isDIGIT(*RExC_parse))
5070 RExC_parse++;
fac92740 5071 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5072
0a4db386 5073 insert_if_check_paren:
830247a4 5074 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5075 vFAIL("Switch condition not recognized");
c277df42 5076 insert_if:
3dab1dad
YO
5077 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5078 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5079 if (br == NULL)
830247a4 5080 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5081 else
3dab1dad 5082 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5083 c = *nextchar(pRExC_state);
d1b80229
IZ
5084 if (flags&HASWIDTH)
5085 *flagp |= HASWIDTH;
c277df42 5086 if (c == '|') {
0a4db386
YO
5087 if (is_define)
5088 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5089 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5090 regbranch(pRExC_state, &flags, 1,depth+1);
5091 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5092 if (flags&HASWIDTH)
5093 *flagp |= HASWIDTH;
830247a4 5094 c = *nextchar(pRExC_state);
a0ed51b3
LW
5095 }
5096 else
c277df42
IZ
5097 lastbr = NULL;
5098 if (c != ')')
8615cb43 5099 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5100 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5101 REGTAIL(pRExC_state, br, ender);
c277df42 5102 if (lastbr) {
3dab1dad
YO
5103 REGTAIL(pRExC_state, lastbr, ender);
5104 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5105 }
5106 else
3dab1dad 5107 REGTAIL(pRExC_state, ret, ender);
c277df42 5108 return ret;
a0ed51b3
LW
5109 }
5110 else {
830247a4 5111 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5112 }
5113 }
1b1626e4 5114 case 0:
830247a4 5115 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5116 vFAIL("Sequence (? incomplete");
1b1626e4 5117 break;
a0d0e21e 5118 default:
830247a4 5119 --RExC_parse;
fac92740 5120 parse_flags: /* (?i) */
830247a4 5121 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5122 /* (?g), (?gc) and (?o) are useless here
5123 and must be globally applied -- japhy */
5124
5125 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5126 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5127 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5128 if (! (wastedflags & wflagbit) ) {
5129 wastedflags |= wflagbit;
5130 vWARN5(
5131 RExC_parse + 1,
5132 "Useless (%s%c) - %suse /%c modifier",
5133 flagsp == &negflags ? "?-" : "?",
5134 *RExC_parse,
5135 flagsp == &negflags ? "don't " : "",
5136 *RExC_parse
5137 );
5138 }
5139 }
5140 }
5141 else if (*RExC_parse == 'c') {
5142 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5143 if (! (wastedflags & WASTED_C) ) {
5144 wastedflags |= WASTED_GC;
9d1d55b5
JP
5145 vWARN3(
5146 RExC_parse + 1,
5147 "Useless (%sc) - %suse /gc modifier",
5148 flagsp == &negflags ? "?-" : "?",
5149 flagsp == &negflags ? "don't " : ""
5150 );
5151 }
5152 }
5153 }
5154 else { pmflag(flagsp, *RExC_parse); }
5155
830247a4 5156 ++RExC_parse;
ca9dfc88 5157 }
830247a4 5158 if (*RExC_parse == '-') {
ca9dfc88 5159 flagsp = &negflags;
9d1d55b5 5160 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5161 ++RExC_parse;
ca9dfc88 5162 goto parse_flags;
48c036b1 5163 }
e2509266
JH
5164 RExC_flags |= posflags;
5165 RExC_flags &= ~negflags;
830247a4
IZ
5166 if (*RExC_parse == ':') {
5167 RExC_parse++;
ca9dfc88
IZ
5168 paren = ':';
5169 break;
5170 }
c277df42 5171 unknown:
830247a4
IZ
5172 if (*RExC_parse != ')') {
5173 RExC_parse++;
5174 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5175 }
830247a4 5176 nextchar(pRExC_state);
a0d0e21e
LW
5177 *flagp = TRYAGAIN;
5178 return NULL;
5179 }
5180 }
fac92740 5181 else { /* (...) */
81714fb9 5182 capturing_parens:
830247a4
IZ
5183 parno = RExC_npar;
5184 RExC_npar++;
e2e6a0f1 5185
830247a4 5186 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5187 if (!SIZE_ONLY ){
5188 if (!RExC_nestroot)
5189 RExC_nestroot = parno;
5190 if (RExC_seen & REG_SEEN_RECURSE) {
5191 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5192 "Setting open paren #%"IVdf" to %d\n",
5193 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5194 RExC_open_parens[parno-1]= ret;
5195 }
6bda09f9 5196 }
fac92740
MJD
5197 Set_Node_Length(ret, 1); /* MJD */
5198 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5199 is_open = 1;
a0d0e21e 5200 }
a0ed51b3 5201 }
fac92740 5202 else /* ! paren */
a0d0e21e
LW
5203 ret = NULL;
5204
5205 /* Pick up the branches, linking them together. */
fac92740 5206 parse_start = RExC_parse; /* MJD */
3dab1dad 5207 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5208 /* branch_len = (paren != 0); */
2af232bd 5209
a0d0e21e
LW
5210 if (br == NULL)
5211 return(NULL);
830247a4
IZ
5212 if (*RExC_parse == '|') {
5213 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5214 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5215 }
fac92740 5216 else { /* MJD */
6bda09f9 5217 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5218 Set_Node_Length(br, paren != 0);
5219 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5220 }
c277df42
IZ
5221 have_branch = 1;
5222 if (SIZE_ONLY)
830247a4 5223 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5224 }
5225 else if (paren == ':') {
c277df42
IZ
5226 *flagp |= flags&SIMPLE;
5227 }
6136c704 5228 if (is_open) { /* Starts with OPEN. */
3dab1dad 5229 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5230 }
5231 else if (paren != '?') /* Not Conditional */
a0d0e21e 5232 ret = br;
32a0ca98 5233 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5234 lastbr = br;
830247a4
IZ
5235 while (*RExC_parse == '|') {
5236 if (!SIZE_ONLY && RExC_extralen) {
5237 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5238 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5239 }
5240 if (SIZE_ONLY)
830247a4
IZ
5241 RExC_extralen += 2; /* Account for LONGJMP. */
5242 nextchar(pRExC_state);
3dab1dad 5243 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5244
a687059c 5245 if (br == NULL)
a0d0e21e 5246 return(NULL);
3dab1dad 5247 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5248 lastbr = br;
821b33a5
IZ
5249 if (flags&HASWIDTH)
5250 *flagp |= HASWIDTH;
a687059c 5251 *flagp |= flags&SPSTART;
a0d0e21e
LW
5252 }
5253
c277df42
IZ
5254 if (have_branch || paren != ':') {
5255 /* Make a closing node, and hook it on the end. */
5256 switch (paren) {
5257 case ':':
830247a4 5258 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5259 break;
5260 case 1:
830247a4 5261 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5262 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5263 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5264 "Setting close paren #%"IVdf" to %d\n",
5265 (IV)parno, REG_NODE_NUM(ender)));
5266 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5267 if (RExC_nestroot == parno)
5268 RExC_nestroot = 0;
40d049e4 5269 }
fac92740
MJD
5270 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5271 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5272 break;
5273 case '<':
c277df42
IZ
5274 case ',':
5275 case '=':
5276 case '!':
c277df42 5277 *flagp &= ~HASWIDTH;
821b33a5
IZ
5278 /* FALL THROUGH */
5279 case '>':
830247a4 5280 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5281 break;
5282 case 0:
830247a4 5283 ender = reg_node(pRExC_state, END);
40d049e4
YO
5284 if (!SIZE_ONLY) {
5285 assert(!RExC_opend); /* there can only be one! */
5286 RExC_opend = ender;
5287 }
c277df42
IZ
5288 break;
5289 }
eaf3ca90 5290 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5291
9674d46a 5292 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5293 if (depth==1)
5294 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5295
c277df42 5296 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5297 for (br = ret; br; br = regnext(br)) {
5298 const U8 op = PL_regkind[OP(br)];
5299 if (op == BRANCH) {
07be1b83 5300 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5301 }
5302 else if (op == BRANCHJ) {
07be1b83 5303 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5304 }
c277df42
IZ
5305 }
5306 }
a0d0e21e 5307 }
c277df42
IZ
5308
5309 {
e1ec3a88
AL
5310 const char *p;
5311 static const char parens[] = "=!<,>";
c277df42
IZ
5312
5313 if (paren && (p = strchr(parens, paren))) {
eb160463 5314 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5315 int flag = (p - parens) > 1;
5316
5317 if (paren == '>')
5318 node = SUSPEND, flag = 0;
6bda09f9 5319 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5320 Set_Node_Cur_Length(ret);
5321 Set_Node_Offset(ret, parse_start + 1);
c277df42 5322 ret->flags = flag;
07be1b83 5323 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5324 }
a0d0e21e
LW
5325 }
5326
5327 /* Check for proper termination. */
ce3e6498 5328 if (paren) {
e2509266 5329 RExC_flags = oregflags;
830247a4
IZ
5330 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5331 RExC_parse = oregcomp_parse;
380a0633 5332 vFAIL("Unmatched (");
ce3e6498 5333 }
a0ed51b3 5334 }
830247a4
IZ
5335 else if (!paren && RExC_parse < RExC_end) {
5336 if (*RExC_parse == ')') {
5337 RExC_parse++;
380a0633 5338 vFAIL("Unmatched )");
a0ed51b3
LW
5339 }
5340 else
b45f050a 5341 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5342 /* NOTREACHED */
5343 }
a687059c 5344
a0d0e21e 5345 return(ret);
a687059c
LW
5346}
5347
5348/*
5349 - regbranch - one alternative of an | operator
5350 *
5351 * Implements the concatenation operator.
5352 */
76e3520e 5353STATIC regnode *
3dab1dad 5354S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5355{
97aff369 5356 dVAR;
c277df42
IZ
5357 register regnode *ret;
5358 register regnode *chain = NULL;
5359 register regnode *latest;
5360 I32 flags = 0, c = 0;
3dab1dad
YO
5361 GET_RE_DEBUG_FLAGS_DECL;
5362 DEBUG_PARSE("brnc");
b81d288d 5363 if (first)
c277df42
IZ
5364 ret = NULL;
5365 else {
b81d288d 5366 if (!SIZE_ONLY && RExC_extralen)
830247a4 5367 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5368 else {
830247a4 5369 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5370 Set_Node_Length(ret, 1);
5371 }
c277df42
IZ
5372 }
5373
b81d288d 5374 if (!first && SIZE_ONLY)
830247a4 5375 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5376
c277df42 5377 *flagp = WORST; /* Tentatively. */
a0d0e21e 5378
830247a4
IZ
5379 RExC_parse--;
5380 nextchar(pRExC_state);
5381 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5382 flags &= ~TRYAGAIN;
3dab1dad 5383 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5384 if (latest == NULL) {
5385 if (flags & TRYAGAIN)
5386 continue;
5387 return(NULL);
a0ed51b3
LW
5388 }
5389 else if (ret == NULL)
c277df42 5390 ret = latest;
a0d0e21e 5391 *flagp |= flags&HASWIDTH;
c277df42 5392 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5393 *flagp |= flags&SPSTART;
5394 else {
830247a4 5395 RExC_naughty++;
3dab1dad 5396 REGTAIL(pRExC_state, chain, latest);
a687059c 5397 }
a0d0e21e 5398 chain = latest;
c277df42
IZ
5399 c++;
5400 }
5401 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5402 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5403 if (ret == NULL)
5404 ret = chain;
5405 }
5406 if (c == 1) {
5407 *flagp |= flags&SIMPLE;
a0d0e21e 5408 }
a687059c 5409
d4c19fe8 5410 return ret;
a687059c
LW
5411}
5412
5413/*
5414 - regpiece - something followed by possible [*+?]
5415 *
5416 * Note that the branching code sequences used for ? and the general cases
5417 * of * and + are somewhat optimized: they use the same NOTHING node as
5418 * both the endmarker for their branch list and the body of the last branch.
5419 * It might seem that this node could be dispensed with entirely, but the
5420 * endmarker role is not redundant.
5421 */
76e3520e 5422STATIC regnode *
3dab1dad 5423S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5424{
97aff369 5425 dVAR;
c277df42 5426 register regnode *ret;
a0d0e21e
LW
5427 register char op;
5428 register char *next;
5429 I32 flags;
1df70142 5430 const char * const origparse = RExC_parse;
a0d0e21e 5431 I32 min;
c277df42 5432 I32 max = REG_INFTY;
fac92740 5433 char *parse_start;
10edeb5d 5434 const char *maxpos = NULL;
3dab1dad
YO
5435 GET_RE_DEBUG_FLAGS_DECL;
5436 DEBUG_PARSE("piec");
a0d0e21e 5437
3dab1dad 5438 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5439 if (ret == NULL) {
5440 if (flags & TRYAGAIN)
5441 *flagp |= TRYAGAIN;
5442 return(NULL);
5443 }
5444
830247a4 5445 op = *RExC_parse;
a0d0e21e 5446
830247a4 5447 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5448 maxpos = NULL;
fac92740 5449 parse_start = RExC_parse; /* MJD */
830247a4 5450 next = RExC_parse + 1;
a0d0e21e
LW
5451 while (isDIGIT(*next) || *next == ',') {
5452 if (*next == ',') {
5453 if (maxpos)
5454 break;
5455 else
5456 maxpos = next;
a687059c 5457 }
a0d0e21e
LW
5458 next++;
5459 }
5460 if (*next == '}') { /* got one */
5461 if (!maxpos)
5462 maxpos = next;
830247a4
IZ
5463 RExC_parse++;
5464 min = atoi(RExC_parse);
a0d0e21e
LW
5465 if (*maxpos == ',')
5466 maxpos++;
5467 else
830247a4 5468 maxpos = RExC_parse;
a0d0e21e
LW
5469 max = atoi(maxpos);
5470 if (!max && *maxpos != '0')
c277df42
IZ
5471 max = REG_INFTY; /* meaning "infinity" */
5472 else if (max >= REG_INFTY)
8615cb43 5473 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5474 RExC_parse = next;
5475 nextchar(pRExC_state);
a0d0e21e
LW
5476
5477 do_curly:
5478 if ((flags&SIMPLE)) {
830247a4 5479 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5480 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5481 Set_Node_Offset(ret, parse_start+1); /* MJD */
5482 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5483 }
5484 else {
3dab1dad 5485 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5486
5487 w->flags = 0;
3dab1dad 5488 REGTAIL(pRExC_state, ret, w);
830247a4 5489 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5490 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5491 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5492 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5493 }
6bda09f9 5494 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5495 /* MJD hk */
5496 Set_Node_Offset(ret, parse_start+1);
2af232bd 5497 Set_Node_Length(ret,
fac92740 5498 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5499
830247a4 5500 if (!SIZE_ONLY && RExC_extralen)
c277df42 5501 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5502 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5503 if (SIZE_ONLY)
830247a4
IZ
5504 RExC_whilem_seen++, RExC_extralen += 3;
5505 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5506 }
c277df42 5507 ret->flags = 0;
a0d0e21e
LW
5508
5509 if (min > 0)
821b33a5
IZ
5510 *flagp = WORST;
5511 if (max > 0)
5512 *flagp |= HASWIDTH;
a0d0e21e 5513 if (max && max < min)
8615cb43 5514 vFAIL("Can't do {n,m} with n > m");
c277df42 5515 if (!SIZE_ONLY) {
eb160463
GS
5516 ARG1_SET(ret, (U16)min);
5517 ARG2_SET(ret, (U16)max);
a687059c 5518 }
a687059c 5519
a0d0e21e 5520 goto nest_check;
a687059c 5521 }
a0d0e21e 5522 }
a687059c 5523
a0d0e21e
LW
5524 if (!ISMULT1(op)) {
5525 *flagp = flags;
a687059c 5526 return(ret);
a0d0e21e 5527 }
bb20fd44 5528
c277df42 5529#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5530
5531 /* if this is reinstated, don't forget to put this back into perldiag:
5532
5533 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5534
5535 (F) The part of the regexp subject to either the * or + quantifier
5536 could match an empty string. The {#} shows in the regular
5537 expression about where the problem was discovered.
5538
5539 */
5540
bb20fd44 5541 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5542 vFAIL("Regexp *+ operand could be empty");
b81d288d 5543#endif
bb20fd44 5544
fac92740 5545 parse_start = RExC_parse;
830247a4 5546 nextchar(pRExC_state);
a0d0e21e 5547
821b33a5 5548 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5549
5550 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5551 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5552 ret->flags = 0;
830247a4 5553 RExC_naughty += 4;
a0d0e21e
LW
5554 }
5555 else if (op == '*') {
5556 min = 0;
5557 goto do_curly;
a0ed51b3
LW
5558 }
5559 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5560 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5561 ret->flags = 0;
830247a4 5562 RExC_naughty += 3;
a0d0e21e
LW
5563 }
5564 else if (op == '+') {
5565 min = 1;
5566 goto do_curly;
a0ed51b3
LW
5567 }
5568 else if (op == '?') {
a0d0e21e
LW
5569 min = 0; max = 1;
5570 goto do_curly;
5571 }
5572 nest_check:
041457d9 5573 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5574 vWARN3(RExC_parse,
b45f050a 5575 "%.*s matches null string many times",
afd78fd5 5576 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5577 origparse);
a0d0e21e
LW
5578 }
5579
b9b4dddf 5580 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5581 nextchar(pRExC_state);
6bda09f9 5582 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5583 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5584 }
b9b4dddf
YO
5585#ifndef REG_ALLOW_MINMOD_SUSPEND
5586 else
5587#endif
5588 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5589 regnode *ender;
5590 nextchar(pRExC_state);
5591 ender = reg_node(pRExC_state, SUCCEED);
5592 REGTAIL(pRExC_state, ret, ender);
5593 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5594 ret->flags = 0;
5595 ender = reg_node(pRExC_state, TAIL);
5596 REGTAIL(pRExC_state, ret, ender);
5597 /*ret= ender;*/
5598 }
5599
5600 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5601 RExC_parse++;
b45f050a
JF
5602 vFAIL("Nested quantifiers");
5603 }
a0d0e21e
LW
5604
5605 return(ret);
a687059c
LW
5606}
5607
fc8cd66c
YO
5608
5609/* reg_namedseq(pRExC_state,UVp)
5610
5611 This is expected to be called by a parser routine that has
5612 recognized'\N' and needs to handle the rest. RExC_parse is
5613 expected to point at the first char following the N at the time
5614 of the call.
5615
5616 If valuep is non-null then it is assumed that we are parsing inside
5617 of a charclass definition and the first codepoint in the resolved
5618 string is returned via *valuep and the routine will return NULL.
5619 In this mode if a multichar string is returned from the charnames
5620 handler a warning will be issued, and only the first char in the
5621 sequence will be examined. If the string returned is zero length
5622 then the value of *valuep is undefined and NON-NULL will
5623 be returned to indicate failure. (This will NOT be a valid pointer
5624 to a regnode.)
5625
5626 If value is null then it is assumed that we are parsing normal text
5627 and inserts a new EXACT node into the program containing the resolved
5628 string and returns a pointer to the new node. If the string is
5629 zerolength a NOTHING node is emitted.
5630
5631 On success RExC_parse is set to the char following the endbrace.
5632 Parsing failures will generate a fatal errorvia vFAIL(...)
5633
5634 NOTE: We cache all results from the charnames handler locally in
5635 the RExC_charnames hash (created on first use) to prevent a charnames
5636 handler from playing silly-buggers and returning a short string and
5637 then a long string for a given pattern. Since the regexp program
5638 size is calculated during an initial parse this would result
5639 in a buffer overrun so we cache to prevent the charname result from
5640 changing during the course of the parse.
5641
5642 */
5643STATIC regnode *
5644S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5645{
5646 char * name; /* start of the content of the name */
5647 char * endbrace; /* endbrace following the name */
5648 SV *sv_str = NULL;
5649 SV *sv_name = NULL;
5650 STRLEN len; /* this has various purposes throughout the code */
5651 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5652 regnode *ret = NULL;
5653
5654 if (*RExC_parse != '{') {
5655 vFAIL("Missing braces on \\N{}");
5656 }
5657 name = RExC_parse+1;
5658 endbrace = strchr(RExC_parse, '}');
5659 if ( ! endbrace ) {
5660 RExC_parse++;
5661 vFAIL("Missing right brace on \\N{}");
5662 }
5663 RExC_parse = endbrace + 1;
5664
5665
5666 /* RExC_parse points at the beginning brace,
5667 endbrace points at the last */
5668 if ( name[0]=='U' && name[1]=='+' ) {
5669 /* its a "unicode hex" notation {U+89AB} */
5670 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5671 | PERL_SCAN_DISALLOW_PREFIX
5672 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5673 UV cp;
196f1508 5674 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5675 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5676 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5677 cp = 0xFFFD;
5678 }
5679 if (cp > 0xff)
5680 RExC_utf8 = 1;
5681 if ( valuep ) {
5682 *valuep = cp;
5683 return NULL;
5684 }
5685 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5686 } else {
5687 /* fetch the charnames handler for this scope */
5688 HV * const table = GvHV(PL_hintgv);
5689 SV **cvp= table ?
5690 hv_fetchs(table, "charnames", FALSE) :
5691 NULL;
5692 SV *cv= cvp ? *cvp : NULL;
5693 HE *he_str;
5694 int count;
5695 /* create an SV with the name as argument */
5696 sv_name = newSVpvn(name, endbrace - name);
5697
5698 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5699 vFAIL2("Constant(\\N{%s}) unknown: "
5700 "(possibly a missing \"use charnames ...\")",
5701 SvPVX(sv_name));
5702 }
5703 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5704 vFAIL2("Constant(\\N{%s}): "
5705 "$^H{charnames} is not defined",SvPVX(sv_name));
5706 }
5707
5708
5709
5710 if (!RExC_charnames) {
5711 /* make sure our cache is allocated */
5712 RExC_charnames = newHV();
6bda09f9 5713 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5714 }
5715 /* see if we have looked this one up before */
5716 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5717 if ( he_str ) {
5718 sv_str = HeVAL(he_str);
5719 cached = 1;
5720 } else {
5721 dSP ;
5722
5723 ENTER ;
5724 SAVETMPS ;
5725 PUSHMARK(SP) ;
5726
5727 XPUSHs(sv_name);
5728
5729 PUTBACK ;
5730
5731 count= call_sv(cv, G_SCALAR);
5732
5733 if (count == 1) { /* XXXX is this right? dmq */
5734 sv_str = POPs;
5735 SvREFCNT_inc_simple_void(sv_str);
5736 }
5737
5738 SPAGAIN ;
5739 PUTBACK ;
5740 FREETMPS ;
5741 LEAVE ;
5742
5743 if ( !sv_str || !SvOK(sv_str) ) {
5744 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5745 "did not return a defined value",SvPVX(sv_name));
5746 }
5747 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5748 cached = 1;
5749 }
5750 }
5751 if (valuep) {
5752 char *p = SvPV(sv_str, len);
5753 if (len) {
5754 STRLEN numlen = 1;
5755 if ( SvUTF8(sv_str) ) {
196f1508 5756 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5757 if (*valuep > 0x7F)
5758 RExC_utf8 = 1;
5759 /* XXXX
5760 We have to turn on utf8 for high bit chars otherwise
5761 we get failures with
5762
5763 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5764 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5765
5766 This is different from what \x{} would do with the same
5767 codepoint, where the condition is > 0xFF.
5768 - dmq
5769 */
5770
5771
5772 } else {
5773 *valuep = (UV)*p;
5774 /* warn if we havent used the whole string? */
5775 }
5776 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5777 vWARN2(RExC_parse,
5778 "Ignoring excess chars from \\N{%s} in character class",
5779 SvPVX(sv_name)
5780 );
5781 }
5782 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5783 vWARN2(RExC_parse,
5784 "Ignoring zero length \\N{%s} in character class",
5785 SvPVX(sv_name)
5786 );
5787 }
5788 if (sv_name)
5789 SvREFCNT_dec(sv_name);
5790 if (!cached)
5791 SvREFCNT_dec(sv_str);
5792 return len ? NULL : (regnode *)&len;
5793 } else if(SvCUR(sv_str)) {
5794
5795 char *s;
5796 char *p, *pend;
5797 STRLEN charlen = 1;
5798 char * parse_start = name-3; /* needed for the offsets */
5799 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5800
5801 ret = reg_node(pRExC_state,
5802 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5803 s= STRING(ret);
5804
5805 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5806 sv_utf8_upgrade(sv_str);
5807 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5808 RExC_utf8= 1;
5809 }
5810
5811 p = SvPV(sv_str, len);
5812 pend = p + len;
5813 /* len is the length written, charlen is the size the char read */
5814 for ( len = 0; p < pend; p += charlen ) {
5815 if (UTF) {
196f1508 5816 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5817 if (FOLD) {
5818 STRLEN foldlen,numlen;
5819 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5820 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5821 /* Emit all the Unicode characters. */
5822
5823 for (foldbuf = tmpbuf;
5824 foldlen;
5825 foldlen -= numlen)
5826 {
5827 uvc = utf8_to_uvchr(foldbuf, &numlen);
5828 if (numlen > 0) {
5829 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5830 s += unilen;
5831 len += unilen;
5832 /* In EBCDIC the numlen
5833 * and unilen can differ. */
5834 foldbuf += numlen;
5835 if (numlen >= foldlen)
5836 break;
5837 }
5838 else
5839 break; /* "Can't happen." */
5840 }
5841 } else {
5842 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5843 if (unilen > 0) {
5844 s += unilen;
5845 len += unilen;
5846 }
5847 }
5848 } else {
5849 len++;
5850 REGC(*p, s++);
5851 }
5852 }
5853 if (SIZE_ONLY) {
5854 RExC_size += STR_SZ(len);
5855 } else {
5856 STR_LEN(ret) = len;
5857 RExC_emit += STR_SZ(len);
5858 }
5859 Set_Node_Cur_Length(ret); /* MJD */
5860 RExC_parse--;
5861 nextchar(pRExC_state);
5862 } else {
5863 ret = reg_node(pRExC_state,NOTHING);
5864 }
5865 if (!cached) {
5866 SvREFCNT_dec(sv_str);
5867 }
5868 if (sv_name) {
5869 SvREFCNT_dec(sv_name);
5870 }
5871 return ret;
5872
5873}
5874
5875
9e08bc66
TS
5876/*
5877 * reg_recode
5878 *
5879 * It returns the code point in utf8 for the value in *encp.
5880 * value: a code value in the source encoding
5881 * encp: a pointer to an Encode object
5882 *
5883 * If the result from Encode is not a single character,
5884 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5885 */
5886STATIC UV
5887S_reg_recode(pTHX_ const char value, SV **encp)
5888{
5889 STRLEN numlen = 1;
5890 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5891 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5892 : SvPVX(sv);
5893 const STRLEN newlen = SvCUR(sv);
5894 UV uv = UNICODE_REPLACEMENT;
5895
5896 if (newlen)
5897 uv = SvUTF8(sv)
5898 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5899 : *(U8*)s;
5900
5901 if (!newlen || numlen != newlen) {
5902 uv = UNICODE_REPLACEMENT;
5903 if (encp)
5904 *encp = NULL;
5905 }
5906 return uv;
5907}
5908
fc8cd66c 5909
a687059c
LW
5910/*
5911 - regatom - the lowest level
5912 *
5913 * Optimization: gobbles an entire sequence of ordinary characters so that
5914 * it can turn them into a single node, which is smaller to store and
5915 * faster to run. Backslashed characters are exceptions, each becoming a
5916 * separate node; the code is simpler that way and it's not worth fixing.
5917 *
7f6f358c
YO
5918 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5919 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5920 */
76e3520e 5921STATIC regnode *
3dab1dad 5922S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5923{
97aff369 5924 dVAR;
cbbf8932 5925 register regnode *ret = NULL;
a0d0e21e 5926 I32 flags;
45948336 5927 char *parse_start = RExC_parse;
3dab1dad
YO
5928 GET_RE_DEBUG_FLAGS_DECL;
5929 DEBUG_PARSE("atom");
a0d0e21e
LW
5930 *flagp = WORST; /* Tentatively. */
5931
5932tryagain:
830247a4 5933 switch (*RExC_parse) {
a0d0e21e 5934 case '^':
830247a4
IZ
5935 RExC_seen_zerolen++;
5936 nextchar(pRExC_state);
e2509266 5937 if (RExC_flags & PMf_MULTILINE)
830247a4 5938 ret = reg_node(pRExC_state, MBOL);
e2509266 5939 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5940 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5941 else
830247a4 5942 ret = reg_node(pRExC_state, BOL);
fac92740 5943 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5944 break;
5945 case '$':
830247a4 5946 nextchar(pRExC_state);
b81d288d 5947 if (*RExC_parse)
830247a4 5948 RExC_seen_zerolen++;
e2509266 5949 if (RExC_flags & PMf_MULTILINE)
830247a4 5950 ret = reg_node(pRExC_state, MEOL);
e2509266 5951 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5952 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5953 else
830247a4 5954 ret = reg_node(pRExC_state, EOL);
fac92740 5955 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5956 break;
5957 case '.':
830247a4 5958 nextchar(pRExC_state);
e2509266 5959 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5960 ret = reg_node(pRExC_state, SANY);
5961 else
5962 ret = reg_node(pRExC_state, REG_ANY);
5963 *flagp |= HASWIDTH|SIMPLE;
830247a4 5964 RExC_naughty++;
fac92740 5965 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5966 break;
5967 case '[':
b45f050a 5968 {
3dab1dad
YO
5969 char * const oregcomp_parse = ++RExC_parse;
5970 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5971 if (*RExC_parse != ']') {
5972 RExC_parse = oregcomp_parse;
b45f050a
JF
5973 vFAIL("Unmatched [");
5974 }
830247a4 5975 nextchar(pRExC_state);
a0d0e21e 5976 *flagp |= HASWIDTH|SIMPLE;
fac92740 5977 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5978 break;
b45f050a 5979 }
a0d0e21e 5980 case '(':
830247a4 5981 nextchar(pRExC_state);
3dab1dad 5982 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5983 if (ret == NULL) {
bf93d4cc 5984 if (flags & TRYAGAIN) {
830247a4 5985 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5986 /* Make parent create an empty node if needed. */
5987 *flagp |= TRYAGAIN;
5988 return(NULL);
5989 }
a0d0e21e 5990 goto tryagain;
bf93d4cc 5991 }
a0d0e21e
LW
5992 return(NULL);
5993 }
c277df42 5994 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5995 break;
5996 case '|':
5997 case ')':
5998 if (flags & TRYAGAIN) {
5999 *flagp |= TRYAGAIN;
6000 return NULL;
6001 }
b45f050a 6002 vFAIL("Internal urp");
a0d0e21e
LW
6003 /* Supposed to be caught earlier. */
6004 break;
85afd4ae 6005 case '{':
830247a4
IZ
6006 if (!regcurly(RExC_parse)) {
6007 RExC_parse++;
85afd4ae
CS
6008 goto defchar;
6009 }
6010 /* FALL THROUGH */
a0d0e21e
LW
6011 case '?':
6012 case '+':
6013 case '*':
830247a4 6014 RExC_parse++;
b45f050a 6015 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6016 break;
6017 case '\\':
830247a4 6018 switch (*++RExC_parse) {
a0d0e21e 6019 case 'A':
830247a4
IZ
6020 RExC_seen_zerolen++;
6021 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6022 *flagp |= SIMPLE;
830247a4 6023 nextchar(pRExC_state);
fac92740 6024 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6025 break;
6026 case 'G':
830247a4
IZ
6027 ret = reg_node(pRExC_state, GPOS);
6028 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6029 *flagp |= SIMPLE;
830247a4 6030 nextchar(pRExC_state);
fac92740 6031 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6032 break;
6033 case 'Z':
830247a4 6034 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6035 *flagp |= SIMPLE;
a1917ab9 6036 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6037 nextchar(pRExC_state);
a0d0e21e 6038 break;
b85d18e9 6039 case 'z':
830247a4 6040 ret = reg_node(pRExC_state, EOS);
b85d18e9 6041 *flagp |= SIMPLE;
830247a4
IZ
6042 RExC_seen_zerolen++; /* Do not optimize RE away */
6043 nextchar(pRExC_state);
fac92740 6044 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6045 break;
4a2d328f 6046 case 'C':
f33976b4
DB
6047 ret = reg_node(pRExC_state, CANY);
6048 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6049 *flagp |= HASWIDTH|SIMPLE;
830247a4 6050 nextchar(pRExC_state);
fac92740 6051 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6052 break;
6053 case 'X':
830247a4 6054 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6055 *flagp |= HASWIDTH;
830247a4 6056 nextchar(pRExC_state);
fac92740 6057 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6058 break;
a0d0e21e 6059 case 'w':
eb160463 6060 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6061 *flagp |= HASWIDTH|SIMPLE;
830247a4 6062 nextchar(pRExC_state);
fac92740 6063 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6064 break;
6065 case 'W':
eb160463 6066 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6067 *flagp |= HASWIDTH|SIMPLE;
830247a4 6068 nextchar(pRExC_state);
fac92740 6069 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6070 break;
6071 case 'b':
830247a4
IZ
6072 RExC_seen_zerolen++;
6073 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6074 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6075 *flagp |= SIMPLE;
830247a4 6076 nextchar(pRExC_state);
fac92740 6077 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6078 break;
6079 case 'B':
830247a4
IZ
6080 RExC_seen_zerolen++;
6081 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6082 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6083 *flagp |= SIMPLE;
830247a4 6084 nextchar(pRExC_state);
fac92740 6085 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6086 break;
6087 case 's':
eb160463 6088 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6089 *flagp |= HASWIDTH|SIMPLE;
830247a4 6090 nextchar(pRExC_state);
fac92740 6091 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6092 break;
6093 case 'S':
eb160463 6094 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6095 *flagp |= HASWIDTH|SIMPLE;
830247a4 6096 nextchar(pRExC_state);
fac92740 6097 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6098 break;
6099 case 'd':
ffc61ed2 6100 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6101 *flagp |= HASWIDTH|SIMPLE;
830247a4 6102 nextchar(pRExC_state);
fac92740 6103 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6104 break;
6105 case 'D':
ffc61ed2 6106 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6107 *flagp |= HASWIDTH|SIMPLE;
830247a4 6108 nextchar(pRExC_state);
fac92740 6109 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6110 break;
a14b48bc
LW
6111 case 'p':
6112 case 'P':
3568d838 6113 {
3dab1dad 6114 char* const oldregxend = RExC_end;
ccb2c380 6115 char* parse_start = RExC_parse - 2;
a14b48bc 6116
830247a4 6117 if (RExC_parse[1] == '{') {
3568d838 6118 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6119 RExC_end = strchr(RExC_parse, '}');
6120 if (!RExC_end) {
3dab1dad 6121 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6122 RExC_parse += 2;
6123 RExC_end = oldregxend;
0da60cf5 6124 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6125 }
830247a4 6126 RExC_end++;
a14b48bc 6127 }
af6f566e 6128 else {
830247a4 6129 RExC_end = RExC_parse + 2;
af6f566e
HS
6130 if (RExC_end > oldregxend)
6131 RExC_end = oldregxend;
6132 }
830247a4 6133 RExC_parse--;
a14b48bc 6134
3dab1dad 6135 ret = regclass(pRExC_state,depth+1);
a14b48bc 6136
830247a4
IZ
6137 RExC_end = oldregxend;
6138 RExC_parse--;
ccb2c380
MP
6139
6140 Set_Node_Offset(ret, parse_start + 2);
6141 Set_Node_Cur_Length(ret);
830247a4 6142 nextchar(pRExC_state);
a14b48bc
LW
6143 *flagp |= HASWIDTH|SIMPLE;
6144 }
6145 break;
fc8cd66c
YO
6146 case 'N':
6147 /* Handle \N{NAME} here and not below because it can be
6148 multicharacter. join_exact() will join them up later on.
6149 Also this makes sure that things like /\N{BLAH}+/ and
6150 \N{BLAH} being multi char Just Happen. dmq*/
6151 ++RExC_parse;
6152 ret= reg_namedseq(pRExC_state, NULL);
6153 break;
0a4db386 6154 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6155 {
6156 char ch= RExC_parse[1];
6157 if (ch != '<' && ch != '\'') {
6158 if (SIZE_ONLY)
6159 vWARN( RExC_parse + 1,
6160 "Possible broken named back reference treated as literal k");
6161 parse_start--;
6162 goto defchar;
6163 } else {
6164 char* name_start = (RExC_parse += 2);
6165 I32 num = 0;
0a4db386
YO
6166 SV *sv_dat = reg_scan_name(pRExC_state,
6167 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6168 ch= (ch == '<') ? '>' : '\'';
6169
81714fb9
YO
6170 if (RExC_parse == name_start || *RExC_parse != ch)
6171 vFAIL2("Sequence \\k%c... not terminated",
6172 (ch == '>' ? '<' : ch));
6173
6174 RExC_sawback = 1;
6175 ret = reganode(pRExC_state,
6176 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6177 num);
6178 *flagp |= HASWIDTH;
6179
6180
6181 if (!SIZE_ONLY) {
81714fb9
YO
6182 num = add_data( pRExC_state, 1, "S" );
6183 ARG_SET(ret,num);
6184 RExC_rx->data->data[num]=(void*)sv_dat;
6185 SvREFCNT_inc(sv_dat);
6186 }
6187 /* override incorrect value set in reganode MJD */
6188 Set_Node_Offset(ret, parse_start+1);
6189 Set_Node_Cur_Length(ret); /* MJD */
6190 nextchar(pRExC_state);
6191
6192 }
6193 break;
6194 }
a0d0e21e
LW
6195 case 'n':
6196 case 'r':
6197 case 't':
6198 case 'f':
6199 case 'e':
6200 case 'a':
6201 case 'x':
6202 case 'c':
6203 case '0':
6204 goto defchar;
6205 case '1': case '2': case '3': case '4':
6206 case '5': case '6': case '7': case '8': case '9':
6207 {
1df70142 6208 const I32 num = atoi(RExC_parse);
a0d0e21e 6209
830247a4 6210 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6211 goto defchar;
6212 else {
3dab1dad 6213 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6214 while (isDIGIT(*RExC_parse))
6215 RExC_parse++;
b45f050a 6216
eb160463 6217 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 6218 vFAIL("Reference to nonexistent group");
830247a4 6219 RExC_sawback = 1;
eb160463
GS
6220 ret = reganode(pRExC_state,
6221 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6222 num);
a0d0e21e 6223 *flagp |= HASWIDTH;
2af232bd 6224
fac92740 6225 /* override incorrect value set in reganode MJD */
2af232bd 6226 Set_Node_Offset(ret, parse_start+1);
fac92740 6227 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6228 RExC_parse--;
6229 nextchar(pRExC_state);
a0d0e21e
LW
6230 }
6231 }
6232 break;
6233 case '\0':
830247a4 6234 if (RExC_parse >= RExC_end)
b45f050a 6235 FAIL("Trailing \\");
a0d0e21e
LW
6236 /* FALL THROUGH */
6237 default:
a0288114 6238 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6239 back into the quick-grab loop below */
45948336 6240 parse_start--;
a0d0e21e
LW
6241 goto defchar;
6242 }
6243 break;
4633a7c4
LW
6244
6245 case '#':
e2509266 6246 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6247 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6248 RExC_parse++;
830247a4 6249 if (RExC_parse < RExC_end)
4633a7c4
LW
6250 goto tryagain;
6251 }
6252 /* FALL THROUGH */
6253
a0d0e21e 6254 default: {
ba210ebe 6255 register STRLEN len;
58ae7d3f 6256 register UV ender;
a0d0e21e 6257 register char *p;
3dab1dad 6258 char *s;
80aecb99 6259 STRLEN foldlen;
89ebb4a3 6260 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6261
6262 parse_start = RExC_parse - 1;
a0d0e21e 6263
830247a4 6264 RExC_parse++;
a0d0e21e
LW
6265
6266 defchar:
58ae7d3f 6267 ender = 0;
eb160463
GS
6268 ret = reg_node(pRExC_state,
6269 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6270 s = STRING(ret);
830247a4
IZ
6271 for (len = 0, p = RExC_parse - 1;
6272 len < 127 && p < RExC_end;
a0d0e21e
LW
6273 len++)
6274 {
3dab1dad 6275 char * const oldp = p;
5b5a24f7 6276
e2509266 6277 if (RExC_flags & PMf_EXTENDED)
830247a4 6278 p = regwhite(p, RExC_end);
a0d0e21e
LW
6279 switch (*p) {
6280 case '^':
6281 case '$':
6282 case '.':
6283 case '[':
6284 case '(':
6285 case ')':
6286 case '|':
6287 goto loopdone;
6288 case '\\':
6289 switch (*++p) {
6290 case 'A':
1ed8eac0
JF
6291 case 'C':
6292 case 'X':
a0d0e21e
LW
6293 case 'G':
6294 case 'Z':
b85d18e9 6295 case 'z':
a0d0e21e
LW
6296 case 'w':
6297 case 'W':
6298 case 'b':
6299 case 'B':
6300 case 's':
6301 case 'S':
6302 case 'd':
6303 case 'D':
a14b48bc
LW
6304 case 'p':
6305 case 'P':
fc8cd66c 6306 case 'N':
a0d0e21e
LW
6307 --p;
6308 goto loopdone;
6309 case 'n':
6310 ender = '\n';
6311 p++;
a687059c 6312 break;
a0d0e21e
LW
6313 case 'r':
6314 ender = '\r';
6315 p++;
a687059c 6316 break;
a0d0e21e
LW
6317 case 't':
6318 ender = '\t';
6319 p++;
a687059c 6320 break;
a0d0e21e
LW
6321 case 'f':
6322 ender = '\f';
6323 p++;
a687059c 6324 break;
a0d0e21e 6325 case 'e':
c7f1f016 6326 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6327 p++;
a687059c 6328 break;
a0d0e21e 6329 case 'a':
c7f1f016 6330 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6331 p++;
a687059c 6332 break;
a0d0e21e 6333 case 'x':
a0ed51b3 6334 if (*++p == '{') {
1df70142 6335 char* const e = strchr(p, '}');
b81d288d 6336
b45f050a 6337 if (!e) {
830247a4 6338 RExC_parse = p + 1;
b45f050a
JF
6339 vFAIL("Missing right brace on \\x{}");
6340 }
de5f0749 6341 else {
a4c04bdc
NC
6342 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6343 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6344 STRLEN numlen = e - p - 1;
53305cf1 6345 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6346 if (ender > 0xff)
6347 RExC_utf8 = 1;
a0ed51b3
LW
6348 p = e + 1;
6349 }
a0ed51b3
LW
6350 }
6351 else {
a4c04bdc 6352 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6353 STRLEN numlen = 2;
53305cf1 6354 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6355 p += numlen;
6356 }
9e08bc66
TS
6357 if (PL_encoding && ender < 0x100)
6358 goto recode_encoding;
a687059c 6359 break;
a0d0e21e
LW
6360 case 'c':
6361 p++;
bbce6d69 6362 ender = UCHARAT(p++);
6363 ender = toCTRL(ender);
a687059c 6364 break;
a0d0e21e
LW
6365 case '0': case '1': case '2': case '3':case '4':
6366 case '5': case '6': case '7': case '8':case '9':
6367 if (*p == '0' ||
830247a4 6368 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6369 I32 flags = 0;
1df70142 6370 STRLEN numlen = 3;
53305cf1 6371 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6372 p += numlen;
6373 }
6374 else {
6375 --p;
6376 goto loopdone;
a687059c 6377 }
9e08bc66
TS
6378 if (PL_encoding && ender < 0x100)
6379 goto recode_encoding;
6380 break;
6381 recode_encoding:
6382 {
6383 SV* enc = PL_encoding;
6384 ender = reg_recode((const char)(U8)ender, &enc);
6385 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6386 vWARN(p, "Invalid escape in the specified encoding");
6387 RExC_utf8 = 1;
6388 }
a687059c 6389 break;
a0d0e21e 6390 case '\0':
830247a4 6391 if (p >= RExC_end)
b45f050a 6392 FAIL("Trailing \\");
a687059c 6393 /* FALL THROUGH */
a0d0e21e 6394 default:
041457d9 6395 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6396 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6397 goto normal_default;
a0d0e21e
LW
6398 }
6399 break;
a687059c 6400 default:
a0ed51b3 6401 normal_default:
fd400ab9 6402 if (UTF8_IS_START(*p) && UTF) {
1df70142 6403 STRLEN numlen;
5e12f4fb 6404 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6405 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6406 p += numlen;
6407 }
6408 else
6409 ender = *p++;
a0d0e21e 6410 break;
a687059c 6411 }
e2509266 6412 if (RExC_flags & PMf_EXTENDED)
830247a4 6413 p = regwhite(p, RExC_end);
60a8b682
JH
6414 if (UTF && FOLD) {
6415 /* Prime the casefolded buffer. */
ac7e0132 6416 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6417 }
a0d0e21e
LW
6418 if (ISMULT2(p)) { /* Back off on ?+*. */
6419 if (len)
6420 p = oldp;
16ea2a2e 6421 else if (UTF) {
80aecb99 6422 if (FOLD) {
60a8b682 6423 /* Emit all the Unicode characters. */
1df70142 6424 STRLEN numlen;
80aecb99
JH
6425 for (foldbuf = tmpbuf;
6426 foldlen;
6427 foldlen -= numlen) {
6428 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6429 if (numlen > 0) {
71207a34 6430 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6431 s += unilen;
6432 len += unilen;
6433 /* In EBCDIC the numlen
6434 * and unilen can differ. */
9dc45d57 6435 foldbuf += numlen;
47654450
JH
6436 if (numlen >= foldlen)
6437 break;
9dc45d57
JH
6438 }
6439 else
6440 break; /* "Can't happen." */
80aecb99
JH
6441 }
6442 }
6443 else {
71207a34 6444 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6445 if (unilen > 0) {
0ebc6274
JH
6446 s += unilen;
6447 len += unilen;
9dc45d57 6448 }
80aecb99 6449 }
a0ed51b3 6450 }
a0d0e21e
LW
6451 else {
6452 len++;
eb160463 6453 REGC((char)ender, s++);
a0d0e21e
LW
6454 }
6455 break;
a687059c 6456 }
16ea2a2e 6457 if (UTF) {
80aecb99 6458 if (FOLD) {
60a8b682 6459 /* Emit all the Unicode characters. */
1df70142 6460 STRLEN numlen;
80aecb99
JH
6461 for (foldbuf = tmpbuf;
6462 foldlen;
6463 foldlen -= numlen) {
6464 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6465 if (numlen > 0) {
71207a34 6466 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6467 len += unilen;
6468 s += unilen;
6469 /* In EBCDIC the numlen
6470 * and unilen can differ. */
9dc45d57 6471 foldbuf += numlen;
47654450
JH
6472 if (numlen >= foldlen)
6473 break;
9dc45d57
JH
6474 }
6475 else
6476 break;
80aecb99
JH
6477 }
6478 }
6479 else {
71207a34 6480 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6481 if (unilen > 0) {
0ebc6274
JH
6482 s += unilen;
6483 len += unilen;
9dc45d57 6484 }
80aecb99
JH
6485 }
6486 len--;
a0ed51b3
LW
6487 }
6488 else
eb160463 6489 REGC((char)ender, s++);
a0d0e21e
LW
6490 }
6491 loopdone:
830247a4 6492 RExC_parse = p - 1;
fac92740 6493 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6494 nextchar(pRExC_state);
793db0cb
JH
6495 {
6496 /* len is STRLEN which is unsigned, need to copy to signed */
6497 IV iv = len;
6498 if (iv < 0)
6499 vFAIL("Internal disaster");
6500 }
a0d0e21e
LW
6501 if (len > 0)
6502 *flagp |= HASWIDTH;
090f7165 6503 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6504 *flagp |= SIMPLE;
3dab1dad 6505
cd439c50 6506 if (SIZE_ONLY)
830247a4 6507 RExC_size += STR_SZ(len);
3dab1dad
YO
6508 else {
6509 STR_LEN(ret) = len;
830247a4 6510 RExC_emit += STR_SZ(len);
07be1b83 6511 }
3dab1dad 6512 }
a0d0e21e
LW
6513 break;
6514 }
a687059c 6515
a0d0e21e 6516 return(ret);
a687059c
LW
6517}
6518
873ef191 6519STATIC char *
5f66b61c 6520S_regwhite(char *p, const char *e)
5b5a24f7
CS
6521{
6522 while (p < e) {
6523 if (isSPACE(*p))
6524 ++p;
6525 else if (*p == '#') {
6526 do {
6527 p++;
6528 } while (p < e && *p != '\n');
6529 }
6530 else
6531 break;
6532 }
6533 return p;
6534}
6535
b8c5462f
JH
6536/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6537 Character classes ([:foo:]) can also be negated ([:^foo:]).
6538 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6539 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6540 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6541
6542#define POSIXCC_DONE(c) ((c) == ':')
6543#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6544#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6545
b8c5462f 6546STATIC I32
830247a4 6547S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6548{
97aff369 6549 dVAR;
936ed897 6550 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6551
830247a4 6552 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6553 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6554 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6555 const char c = UCHARAT(RExC_parse);
097eb12c 6556 char* const s = RExC_parse++;
b81d288d 6557
9a86a77b 6558 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6559 RExC_parse++;
6560 if (RExC_parse == RExC_end)
620e46c5 6561 /* Grandfather lone [:, [=, [. */
830247a4 6562 RExC_parse = s;
620e46c5 6563 else {
3dab1dad 6564 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6565 assert(*t == c);
6566
9a86a77b 6567 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6568 const char *posixcc = s + 1;
830247a4 6569 RExC_parse++; /* skip over the ending ] */
3dab1dad 6570
b8c5462f 6571 if (*s == ':') {
1df70142
AL
6572 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6573 const I32 skip = t - posixcc;
80916619
NC
6574
6575 /* Initially switch on the length of the name. */
6576 switch (skip) {
6577 case 4:
3dab1dad
YO
6578 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6579 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6580 break;
80916619
NC
6581 case 5:
6582 /* Names all of length 5. */
6583 /* alnum alpha ascii blank cntrl digit graph lower
6584 print punct space upper */
6585 /* Offset 4 gives the best switch position. */
6586 switch (posixcc[4]) {
6587 case 'a':
3dab1dad
YO
6588 if (memEQ(posixcc, "alph", 4)) /* alpha */
6589 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6590 break;
6591 case 'e':
3dab1dad
YO
6592 if (memEQ(posixcc, "spac", 4)) /* space */
6593 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6594 break;
6595 case 'h':
3dab1dad
YO
6596 if (memEQ(posixcc, "grap", 4)) /* graph */
6597 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6598 break;
6599 case 'i':
3dab1dad
YO
6600 if (memEQ(posixcc, "asci", 4)) /* ascii */
6601 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6602 break;
6603 case 'k':
3dab1dad
YO
6604 if (memEQ(posixcc, "blan", 4)) /* blank */
6605 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6606 break;
6607 case 'l':
3dab1dad
YO
6608 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6609 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6610 break;
6611 case 'm':
3dab1dad
YO
6612 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6613 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6614 break;
6615 case 'r':
3dab1dad
YO
6616 if (memEQ(posixcc, "lowe", 4)) /* lower */
6617 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6618 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6619 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6620 break;
6621 case 't':
3dab1dad
YO
6622 if (memEQ(posixcc, "digi", 4)) /* digit */
6623 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6624 else if (memEQ(posixcc, "prin", 4)) /* print */
6625 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6626 else if (memEQ(posixcc, "punc", 4)) /* punct */
6627 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6628 break;
b8c5462f
JH
6629 }
6630 break;
80916619 6631 case 6:
3dab1dad
YO
6632 if (memEQ(posixcc, "xdigit", 6))
6633 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6634 break;
6635 }
80916619
NC
6636
6637 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6638 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6639 t - s - 1, s + 1);
80916619
NC
6640 assert (posixcc[skip] == ':');
6641 assert (posixcc[skip+1] == ']');
b45f050a 6642 } else if (!SIZE_ONLY) {
b8c5462f 6643 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6644
830247a4 6645 /* adjust RExC_parse so the warning shows after
b45f050a 6646 the class closes */
9a86a77b 6647 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6648 RExC_parse++;
b45f050a
JF
6649 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6650 }
b8c5462f
JH
6651 } else {
6652 /* Maternal grandfather:
6653 * "[:" ending in ":" but not in ":]" */
830247a4 6654 RExC_parse = s;
767d463e 6655 }
620e46c5
JH
6656 }
6657 }
6658
b8c5462f
JH
6659 return namedclass;
6660}
6661
6662STATIC void
830247a4 6663S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6664{
97aff369 6665 dVAR;
3dab1dad 6666 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6667 const char *s = RExC_parse;
6668 const char c = *s++;
b8c5462f 6669
3dab1dad 6670 while (isALNUM(*s))
b8c5462f
JH
6671 s++;
6672 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6673 if (ckWARN(WARN_REGEXP))
6674 vWARN3(s+2,
6675 "POSIX syntax [%c %c] belongs inside character classes",
6676 c, c);
b45f050a
JF
6677
6678 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6679 if (POSIXCC_NOTYET(c)) {
830247a4 6680 /* adjust RExC_parse so the error shows after
b45f050a 6681 the class closes */
9a86a77b 6682 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6683 NOOP;
b45f050a
JF
6684 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6685 }
b8c5462f
JH
6686 }
6687 }
620e46c5
JH
6688}
6689
7f6f358c
YO
6690
6691/*
6692 parse a class specification and produce either an ANYOF node that
6693 matches the pattern. If the pattern matches a single char only and
6694 that char is < 256 then we produce an EXACT node instead.
6695*/
76e3520e 6696STATIC regnode *
3dab1dad 6697S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6698{
97aff369 6699 dVAR;
9ef43ace 6700 register UV value = 0;
9a86a77b 6701 register UV nextvalue;
3568d838 6702 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6703 register IV range = 0;
c277df42 6704 register regnode *ret;
ba210ebe 6705 STRLEN numlen;
ffc61ed2 6706 IV namedclass;
cbbf8932 6707 char *rangebegin = NULL;
936ed897 6708 bool need_class = 0;
c445ea15 6709 SV *listsv = NULL;
ffc61ed2 6710 UV n;
9e55ce06 6711 bool optimize_invert = TRUE;
cbbf8932 6712 AV* unicode_alternate = NULL;
1b2d223b
JH
6713#ifdef EBCDIC
6714 UV literal_endpoint = 0;
6715#endif
7f6f358c 6716 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6717
3dab1dad 6718 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6719 case we need to change the emitted regop to an EXACT. */
07be1b83 6720 const char * orig_parse = RExC_parse;
72f13be8 6721 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6722#ifndef DEBUGGING
6723 PERL_UNUSED_ARG(depth);
6724#endif
72f13be8 6725
3dab1dad 6726 DEBUG_PARSE("clas");
7f6f358c
YO
6727
6728 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6729 ret = reganode(pRExC_state, ANYOF, 0);
6730
6731 if (!SIZE_ONLY)
6732 ANYOF_FLAGS(ret) = 0;
6733
9a86a77b 6734 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6735 RExC_naughty++;
6736 RExC_parse++;
6737 if (!SIZE_ONLY)
6738 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6739 }
a0d0e21e 6740
73060fc4 6741 if (SIZE_ONLY) {
830247a4 6742 RExC_size += ANYOF_SKIP;
73060fc4
JH
6743 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6744 }
936ed897 6745 else {
830247a4 6746 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6747 if (FOLD)
6748 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6749 if (LOC)
6750 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6751 ANYOF_BITMAP_ZERO(ret);
396482e1 6752 listsv = newSVpvs("# comment\n");
a0d0e21e 6753 }
b8c5462f 6754
9a86a77b
JH
6755 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6756
b938889d 6757 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6758 checkposixcc(pRExC_state);
b8c5462f 6759
f064b6ad
HS
6760 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6761 if (UCHARAT(RExC_parse) == ']')
6762 goto charclassloop;
ffc61ed2 6763
fc8cd66c 6764parseit:
9a86a77b 6765 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6766
6767 charclassloop:
6768
6769 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6770
73b437c8 6771 if (!range)
830247a4 6772 rangebegin = RExC_parse;
ffc61ed2 6773 if (UTF) {
5e12f4fb 6774 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6775 RExC_end - RExC_parse,
9f7f3913 6776 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6777 RExC_parse += numlen;
6778 }
6779 else
6780 value = UCHARAT(RExC_parse++);
7f6f358c 6781
9a86a77b
JH
6782 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6783 if (value == '[' && POSIXCC(nextvalue))
830247a4 6784 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6785 else if (value == '\\') {
ffc61ed2 6786 if (UTF) {
5e12f4fb 6787 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6788 RExC_end - RExC_parse,
9f7f3913 6789 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6790 RExC_parse += numlen;
6791 }
6792 else
6793 value = UCHARAT(RExC_parse++);
470c3474 6794 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6795 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6796 * be a problem later if we want switch on Unicode.
6797 * A similar issue a little bit later when switching on
6798 * namedclass. --jhi */
ffc61ed2 6799 switch ((I32)value) {
b8c5462f
JH
6800 case 'w': namedclass = ANYOF_ALNUM; break;
6801 case 'W': namedclass = ANYOF_NALNUM; break;
6802 case 's': namedclass = ANYOF_SPACE; break;
6803 case 'S': namedclass = ANYOF_NSPACE; break;
6804 case 'd': namedclass = ANYOF_DIGIT; break;
6805 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6806 case 'N': /* Handle \N{NAME} in class */
6807 {
6808 /* We only pay attention to the first char of
6809 multichar strings being returned. I kinda wonder
6810 if this makes sense as it does change the behaviour
6811 from earlier versions, OTOH that behaviour was broken
6812 as well. */
6813 UV v; /* value is register so we cant & it /grrr */
6814 if (reg_namedseq(pRExC_state, &v)) {
6815 goto parseit;
6816 }
6817 value= v;
6818 }
6819 break;
ffc61ed2
JH
6820 case 'p':
6821 case 'P':
3dab1dad
YO
6822 {
6823 char *e;
af6f566e 6824 if (RExC_parse >= RExC_end)
2a4859cd 6825 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6826 if (*RExC_parse == '{') {
1df70142 6827 const U8 c = (U8)value;
ffc61ed2
JH
6828 e = strchr(RExC_parse++, '}');
6829 if (!e)
0da60cf5 6830 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6831 while (isSPACE(UCHARAT(RExC_parse)))
6832 RExC_parse++;
6833 if (e == RExC_parse)
0da60cf5 6834 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6835 n = e - RExC_parse;
ab13f0c7
JH
6836 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6837 n--;
ffc61ed2
JH
6838 }
6839 else {
6840 e = RExC_parse;
6841 n = 1;
6842 }
6843 if (!SIZE_ONLY) {
ab13f0c7
JH
6844 if (UCHARAT(RExC_parse) == '^') {
6845 RExC_parse++;
6846 n--;
6847 value = value == 'p' ? 'P' : 'p'; /* toggle */
6848 while (isSPACE(UCHARAT(RExC_parse))) {
6849 RExC_parse++;
6850 n--;
6851 }
6852 }
097eb12c
AL
6853 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6854 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6855 }
6856 RExC_parse = e + 1;
6857 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6858 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6859 }
f81125e2 6860 break;
b8c5462f
JH
6861 case 'n': value = '\n'; break;
6862 case 'r': value = '\r'; break;
6863 case 't': value = '\t'; break;
6864 case 'f': value = '\f'; break;
6865 case 'b': value = '\b'; break;
c7f1f016
NIS
6866 case 'e': value = ASCII_TO_NATIVE('\033');break;
6867 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6868 case 'x':
ffc61ed2 6869 if (*RExC_parse == '{') {
a4c04bdc
NC
6870 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6871 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6872 char * const e = strchr(RExC_parse++, '}');
b81d288d 6873 if (!e)
ffc61ed2 6874 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6875
6876 numlen = e - RExC_parse;
6877 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6878 RExC_parse = e + 1;
6879 }
6880 else {
a4c04bdc 6881 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6882 numlen = 2;
6883 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6884 RExC_parse += numlen;
6885 }
9e08bc66
TS
6886 if (PL_encoding && value < 0x100)
6887 goto recode_encoding;
b8c5462f
JH
6888 break;
6889 case 'c':
830247a4 6890 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6891 value = toCTRL(value);
6892 break;
6893 case '0': case '1': case '2': case '3': case '4':
6894 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
6895 {
6896 I32 flags = 0;
6897 numlen = 3;
6898 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6899 RExC_parse += numlen;
6900 if (PL_encoding && value < 0x100)
6901 goto recode_encoding;
6902 break;
6903 }
6904 recode_encoding:
6905 {
6906 SV* enc = PL_encoding;
6907 value = reg_recode((const char)(U8)value, &enc);
6908 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6909 vWARN(RExC_parse,
6910 "Invalid escape in the specified encoding");
6911 break;
6912 }
1028017a 6913 default:
041457d9 6914 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6915 vWARN2(RExC_parse,
6916 "Unrecognized escape \\%c in character class passed through",
6917 (int)value);
1028017a 6918 break;
b8c5462f 6919 }
ffc61ed2 6920 } /* end of \blah */
1b2d223b
JH
6921#ifdef EBCDIC
6922 else
6923 literal_endpoint++;
6924#endif
ffc61ed2
JH
6925
6926 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6927
6928 if (!SIZE_ONLY && !need_class)
936ed897 6929 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6930
936ed897 6931 need_class = 1;
ffc61ed2
JH
6932
6933 /* a bad range like a-\d, a-[:digit:] ? */
6934 if (range) {
73b437c8 6935 if (!SIZE_ONLY) {
afd78fd5 6936 if (ckWARN(WARN_REGEXP)) {
097eb12c 6937 const int w =
afd78fd5
JH
6938 RExC_parse >= rangebegin ?
6939 RExC_parse - rangebegin : 0;
830247a4 6940 vWARN4(RExC_parse,
b45f050a 6941 "False [] range \"%*.*s\"",
097eb12c 6942 w, w, rangebegin);
afd78fd5 6943 }
3568d838
JH
6944 if (prevvalue < 256) {
6945 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6946 ANYOF_BITMAP_SET(ret, '-');
6947 }
6948 else {
6949 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6950 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6951 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6952 }
b8c5462f 6953 }
ffc61ed2
JH
6954
6955 range = 0; /* this was not a true range */
73b437c8 6956 }
ffc61ed2 6957
73b437c8 6958 if (!SIZE_ONLY) {
c49a72a9
NC
6959 const char *what = NULL;
6960 char yesno = 0;
6961
3568d838
JH
6962 if (namedclass > OOB_NAMEDCLASS)
6963 optimize_invert = FALSE;
e2962f66
JH
6964 /* Possible truncation here but in some 64-bit environments
6965 * the compiler gets heartburn about switch on 64-bit values.
6966 * A similar issue a little earlier when switching on value.
98f323fa 6967 * --jhi */
e2962f66 6968 switch ((I32)namedclass) {
73b437c8
JH
6969 case ANYOF_ALNUM:
6970 if (LOC)
936ed897 6971 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6972 else {
6973 for (value = 0; value < 256; value++)
6974 if (isALNUM(value))
936ed897 6975 ANYOF_BITMAP_SET(ret, value);
73b437c8 6976 }
c49a72a9
NC
6977 yesno = '+';
6978 what = "Word";
73b437c8
JH
6979 break;
6980 case ANYOF_NALNUM:
6981 if (LOC)
936ed897 6982 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6983 else {
6984 for (value = 0; value < 256; value++)
6985 if (!isALNUM(value))
936ed897 6986 ANYOF_BITMAP_SET(ret, value);
73b437c8 6987 }
c49a72a9
NC
6988 yesno = '!';
6989 what = "Word";
73b437c8 6990 break;
ffc61ed2 6991 case ANYOF_ALNUMC:
73b437c8 6992 if (LOC)
ffc61ed2 6993 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6994 else {
6995 for (value = 0; value < 256; value++)
ffc61ed2 6996 if (isALNUMC(value))
936ed897 6997 ANYOF_BITMAP_SET(ret, value);
73b437c8 6998 }
c49a72a9
NC
6999 yesno = '+';
7000 what = "Alnum";
73b437c8
JH
7001 break;
7002 case ANYOF_NALNUMC:
7003 if (LOC)
936ed897 7004 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7005 else {
7006 for (value = 0; value < 256; value++)
7007 if (!isALNUMC(value))
936ed897 7008 ANYOF_BITMAP_SET(ret, value);
73b437c8 7009 }
c49a72a9
NC
7010 yesno = '!';
7011 what = "Alnum";
73b437c8
JH
7012 break;
7013 case ANYOF_ALPHA:
7014 if (LOC)
936ed897 7015 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7016 else {
7017 for (value = 0; value < 256; value++)
7018 if (isALPHA(value))
936ed897 7019 ANYOF_BITMAP_SET(ret, value);
73b437c8 7020 }
c49a72a9
NC
7021 yesno = '+';
7022 what = "Alpha";
73b437c8
JH
7023 break;
7024 case ANYOF_NALPHA:
7025 if (LOC)
936ed897 7026 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7027 else {
7028 for (value = 0; value < 256; value++)
7029 if (!isALPHA(value))
936ed897 7030 ANYOF_BITMAP_SET(ret, value);
73b437c8 7031 }
c49a72a9
NC
7032 yesno = '!';
7033 what = "Alpha";
73b437c8
JH
7034 break;
7035 case ANYOF_ASCII:
7036 if (LOC)
936ed897 7037 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7038 else {
c7f1f016 7039#ifndef EBCDIC
1ba5c669
JH
7040 for (value = 0; value < 128; value++)
7041 ANYOF_BITMAP_SET(ret, value);
7042#else /* EBCDIC */
ffbc6a93 7043 for (value = 0; value < 256; value++) {
3a3c4447
JH
7044 if (isASCII(value))
7045 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7046 }
1ba5c669 7047#endif /* EBCDIC */
73b437c8 7048 }
c49a72a9
NC
7049 yesno = '+';
7050 what = "ASCII";
73b437c8
JH
7051 break;
7052 case ANYOF_NASCII:
7053 if (LOC)
936ed897 7054 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7055 else {
c7f1f016 7056#ifndef EBCDIC
1ba5c669
JH
7057 for (value = 128; value < 256; value++)
7058 ANYOF_BITMAP_SET(ret, value);
7059#else /* EBCDIC */
ffbc6a93 7060 for (value = 0; value < 256; value++) {
3a3c4447
JH
7061 if (!isASCII(value))
7062 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7063 }
1ba5c669 7064#endif /* EBCDIC */
73b437c8 7065 }
c49a72a9
NC
7066 yesno = '!';
7067 what = "ASCII";
73b437c8 7068 break;
aaa51d5e
JF
7069 case ANYOF_BLANK:
7070 if (LOC)
7071 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7072 else {
7073 for (value = 0; value < 256; value++)
7074 if (isBLANK(value))
7075 ANYOF_BITMAP_SET(ret, value);
7076 }
c49a72a9
NC
7077 yesno = '+';
7078 what = "Blank";
aaa51d5e
JF
7079 break;
7080 case ANYOF_NBLANK:
7081 if (LOC)
7082 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7083 else {
7084 for (value = 0; value < 256; value++)
7085 if (!isBLANK(value))
7086 ANYOF_BITMAP_SET(ret, value);
7087 }
c49a72a9
NC
7088 yesno = '!';
7089 what = "Blank";
aaa51d5e 7090 break;
73b437c8
JH
7091 case ANYOF_CNTRL:
7092 if (LOC)
936ed897 7093 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7094 else {
7095 for (value = 0; value < 256; value++)
7096 if (isCNTRL(value))
936ed897 7097 ANYOF_BITMAP_SET(ret, value);
73b437c8 7098 }
c49a72a9
NC
7099 yesno = '+';
7100 what = "Cntrl";
73b437c8
JH
7101 break;
7102 case ANYOF_NCNTRL:
7103 if (LOC)
936ed897 7104 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7105 else {
7106 for (value = 0; value < 256; value++)
7107 if (!isCNTRL(value))
936ed897 7108 ANYOF_BITMAP_SET(ret, value);
73b437c8 7109 }
c49a72a9
NC
7110 yesno = '!';
7111 what = "Cntrl";
ffc61ed2
JH
7112 break;
7113 case ANYOF_DIGIT:
7114 if (LOC)
7115 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7116 else {
7117 /* consecutive digits assumed */
7118 for (value = '0'; value <= '9'; value++)
7119 ANYOF_BITMAP_SET(ret, value);
7120 }
c49a72a9
NC
7121 yesno = '+';
7122 what = "Digit";
ffc61ed2
JH
7123 break;
7124 case ANYOF_NDIGIT:
7125 if (LOC)
7126 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7127 else {
7128 /* consecutive digits assumed */
7129 for (value = 0; value < '0'; value++)
7130 ANYOF_BITMAP_SET(ret, value);
7131 for (value = '9' + 1; value < 256; value++)
7132 ANYOF_BITMAP_SET(ret, value);
7133 }
c49a72a9
NC
7134 yesno = '!';
7135 what = "Digit";
73b437c8
JH
7136 break;
7137 case ANYOF_GRAPH:
7138 if (LOC)
936ed897 7139 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7140 else {
7141 for (value = 0; value < 256; value++)
7142 if (isGRAPH(value))
936ed897 7143 ANYOF_BITMAP_SET(ret, value);
73b437c8 7144 }
c49a72a9
NC
7145 yesno = '+';
7146 what = "Graph";
73b437c8
JH
7147 break;
7148 case ANYOF_NGRAPH:
7149 if (LOC)
936ed897 7150 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7151 else {
7152 for (value = 0; value < 256; value++)
7153 if (!isGRAPH(value))
936ed897 7154 ANYOF_BITMAP_SET(ret, value);
73b437c8 7155 }
c49a72a9
NC
7156 yesno = '!';
7157 what = "Graph";
73b437c8
JH
7158 break;
7159 case ANYOF_LOWER:
7160 if (LOC)
936ed897 7161 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7162 else {
7163 for (value = 0; value < 256; value++)
7164 if (isLOWER(value))
936ed897 7165 ANYOF_BITMAP_SET(ret, value);
73b437c8 7166 }
c49a72a9
NC
7167 yesno = '+';
7168 what = "Lower";
73b437c8
JH
7169 break;
7170 case ANYOF_NLOWER:
7171 if (LOC)
936ed897 7172 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7173 else {
7174 for (value = 0; value < 256; value++)
7175 if (!isLOWER(value))
936ed897 7176 ANYOF_BITMAP_SET(ret, value);
73b437c8 7177 }
c49a72a9
NC
7178 yesno = '!';
7179 what = "Lower";
73b437c8
JH
7180 break;
7181 case ANYOF_PRINT:
7182 if (LOC)
936ed897 7183 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7184 else {
7185 for (value = 0; value < 256; value++)
7186 if (isPRINT(value))
936ed897 7187 ANYOF_BITMAP_SET(ret, value);
73b437c8 7188 }
c49a72a9
NC
7189 yesno = '+';
7190 what = "Print";
73b437c8
JH
7191 break;
7192 case ANYOF_NPRINT:
7193 if (LOC)
936ed897 7194 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7195 else {
7196 for (value = 0; value < 256; value++)
7197 if (!isPRINT(value))
936ed897 7198 ANYOF_BITMAP_SET(ret, value);
73b437c8 7199 }
c49a72a9
NC
7200 yesno = '!';
7201 what = "Print";
73b437c8 7202 break;
aaa51d5e
JF
7203 case ANYOF_PSXSPC:
7204 if (LOC)
7205 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7206 else {
7207 for (value = 0; value < 256; value++)
7208 if (isPSXSPC(value))
7209 ANYOF_BITMAP_SET(ret, value);
7210 }
c49a72a9
NC
7211 yesno = '+';
7212 what = "Space";
aaa51d5e
JF
7213 break;
7214 case ANYOF_NPSXSPC:
7215 if (LOC)
7216 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7217 else {
7218 for (value = 0; value < 256; value++)
7219 if (!isPSXSPC(value))
7220 ANYOF_BITMAP_SET(ret, value);
7221 }
c49a72a9
NC
7222 yesno = '!';
7223 what = "Space";
aaa51d5e 7224 break;
73b437c8
JH
7225 case ANYOF_PUNCT:
7226 if (LOC)
936ed897 7227 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7228 else {
7229 for (value = 0; value < 256; value++)
7230 if (isPUNCT(value))
936ed897 7231 ANYOF_BITMAP_SET(ret, value);
73b437c8 7232 }
c49a72a9
NC
7233 yesno = '+';
7234 what = "Punct";
73b437c8
JH
7235 break;
7236 case ANYOF_NPUNCT:
7237 if (LOC)
936ed897 7238 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7239 else {
7240 for (value = 0; value < 256; value++)
7241 if (!isPUNCT(value))
936ed897 7242 ANYOF_BITMAP_SET(ret, value);
73b437c8 7243 }
c49a72a9
NC
7244 yesno = '!';
7245 what = "Punct";
ffc61ed2
JH
7246 break;
7247 case ANYOF_SPACE:
7248 if (LOC)
7249 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7250 else {
7251 for (value = 0; value < 256; value++)
7252 if (isSPACE(value))
7253 ANYOF_BITMAP_SET(ret, value);
7254 }
c49a72a9
NC
7255 yesno = '+';
7256 what = "SpacePerl";
ffc61ed2
JH
7257 break;
7258 case ANYOF_NSPACE:
7259 if (LOC)
7260 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7261 else {
7262 for (value = 0; value < 256; value++)
7263 if (!isSPACE(value))
7264 ANYOF_BITMAP_SET(ret, value);
7265 }
c49a72a9
NC
7266 yesno = '!';
7267 what = "SpacePerl";
73b437c8
JH
7268 break;
7269 case ANYOF_UPPER:
7270 if (LOC)
936ed897 7271 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7272 else {
7273 for (value = 0; value < 256; value++)
7274 if (isUPPER(value))
936ed897 7275 ANYOF_BITMAP_SET(ret, value);
73b437c8 7276 }
c49a72a9
NC
7277 yesno = '+';
7278 what = "Upper";
73b437c8
JH
7279 break;
7280 case ANYOF_NUPPER:
7281 if (LOC)
936ed897 7282 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7283 else {
7284 for (value = 0; value < 256; value++)
7285 if (!isUPPER(value))
936ed897 7286 ANYOF_BITMAP_SET(ret, value);
73b437c8 7287 }
c49a72a9
NC
7288 yesno = '!';
7289 what = "Upper";
73b437c8
JH
7290 break;
7291 case ANYOF_XDIGIT:
7292 if (LOC)
936ed897 7293 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7294 else {
7295 for (value = 0; value < 256; value++)
7296 if (isXDIGIT(value))
936ed897 7297 ANYOF_BITMAP_SET(ret, value);
73b437c8 7298 }
c49a72a9
NC
7299 yesno = '+';
7300 what = "XDigit";
73b437c8
JH
7301 break;
7302 case ANYOF_NXDIGIT:
7303 if (LOC)
936ed897 7304 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7305 else {
7306 for (value = 0; value < 256; value++)
7307 if (!isXDIGIT(value))
936ed897 7308 ANYOF_BITMAP_SET(ret, value);
73b437c8 7309 }
c49a72a9
NC
7310 yesno = '!';
7311 what = "XDigit";
73b437c8 7312 break;
f81125e2
JP
7313 case ANYOF_MAX:
7314 /* this is to handle \p and \P */
7315 break;
73b437c8 7316 default:
b45f050a 7317 vFAIL("Invalid [::] class");
73b437c8 7318 break;
b8c5462f 7319 }
c49a72a9
NC
7320 if (what) {
7321 /* Strings such as "+utf8::isWord\n" */
7322 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7323 }
b8c5462f 7324 if (LOC)
936ed897 7325 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7326 continue;
a0d0e21e 7327 }
ffc61ed2
JH
7328 } /* end of namedclass \blah */
7329
a0d0e21e 7330 if (range) {
eb160463 7331 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7332 const int w = RExC_parse - rangebegin;
7333 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7334 range = 0; /* not a valid range */
73b437c8 7335 }
a0d0e21e
LW
7336 }
7337 else {
3568d838 7338 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7339 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7340 RExC_parse[1] != ']') {
7341 RExC_parse++;
ffc61ed2
JH
7342
7343 /* a bad range like \w-, [:word:]- ? */
7344 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7345 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7346 const int w =
afd78fd5
JH
7347 RExC_parse >= rangebegin ?
7348 RExC_parse - rangebegin : 0;
830247a4 7349 vWARN4(RExC_parse,
b45f050a 7350 "False [] range \"%*.*s\"",
097eb12c 7351 w, w, rangebegin);
afd78fd5 7352 }
73b437c8 7353 if (!SIZE_ONLY)
936ed897 7354 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7355 } else
ffc61ed2
JH
7356 range = 1; /* yeah, it's a range! */
7357 continue; /* but do it the next time */
a0d0e21e 7358 }
a687059c 7359 }
ffc61ed2 7360
93733859 7361 /* now is the next time */
07be1b83 7362 /*stored += (value - prevvalue + 1);*/
ae5c130c 7363 if (!SIZE_ONLY) {
3568d838 7364 if (prevvalue < 256) {
1df70142 7365 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7366 IV i;
3568d838 7367#ifdef EBCDIC
1b2d223b
JH
7368 /* In EBCDIC [\x89-\x91] should include
7369 * the \x8e but [i-j] should not. */
7370 if (literal_endpoint == 2 &&
7371 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7372 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7373 {
3568d838
JH
7374 if (isLOWER(prevvalue)) {
7375 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7376 if (isLOWER(i))
7377 ANYOF_BITMAP_SET(ret, i);
7378 } else {
3568d838 7379 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7380 if (isUPPER(i))
7381 ANYOF_BITMAP_SET(ret, i);
7382 }
8ada0baa 7383 }
ffc61ed2 7384 else
8ada0baa 7385#endif
07be1b83
YO
7386 for (i = prevvalue; i <= ceilvalue; i++) {
7387 if (!ANYOF_BITMAP_TEST(ret,i)) {
7388 stored++;
7389 ANYOF_BITMAP_SET(ret, i);
7390 }
7391 }
3568d838 7392 }
a5961de5 7393 if (value > 255 || UTF) {
1df70142
AL
7394 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7395 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7396 stored+=2; /* can't optimize this class */
ffc61ed2 7397 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7398 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7399 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7400 prevnatvalue, natvalue);
7401 }
7402 else if (prevnatvalue == natvalue) {
7403 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7404 if (FOLD) {
89ebb4a3 7405 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7406 STRLEN foldlen;
1df70142 7407 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7408
e294cc5d
JH
7409#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7410 if (RExC_precomp[0] == ':' &&
7411 RExC_precomp[1] == '[' &&
7412 (f == 0xDF || f == 0x92)) {
7413 f = NATIVE_TO_UNI(f);
7414 }
7415#endif
c840d2a2
JH
7416 /* If folding and foldable and a single
7417 * character, insert also the folded version
7418 * to the charclass. */
9e55ce06 7419 if (f != value) {
e294cc5d
JH
7420#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7421 if ((RExC_precomp[0] == ':' &&
7422 RExC_precomp[1] == '[' &&
7423 (f == 0xA2 &&
7424 (value == 0xFB05 || value == 0xFB06))) ?
7425 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7426 foldlen == (STRLEN)UNISKIP(f) )
7427#else
eb160463 7428 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7429#endif
9e55ce06
JH
7430 Perl_sv_catpvf(aTHX_ listsv,
7431 "%04"UVxf"\n", f);
7432 else {
7433 /* Any multicharacter foldings
7434 * require the following transform:
7435 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7436 * where E folds into "pq" and F folds
7437 * into "rst", all other characters
7438 * fold to single characters. We save
7439 * away these multicharacter foldings,
7440 * to be later saved as part of the
7441 * additional "s" data. */
7442 SV *sv;
7443
7444 if (!unicode_alternate)
7445 unicode_alternate = newAV();
7446 sv = newSVpvn((char*)foldbuf, foldlen);
7447 SvUTF8_on(sv);
7448 av_push(unicode_alternate, sv);
7449 }
7450 }
254ba52a 7451
60a8b682
JH
7452 /* If folding and the value is one of the Greek
7453 * sigmas insert a few more sigmas to make the
7454 * folding rules of the sigmas to work right.
7455 * Note that not all the possible combinations
7456 * are handled here: some of them are handled
9e55ce06
JH
7457 * by the standard folding rules, and some of
7458 * them (literal or EXACTF cases) are handled
7459 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7460 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7461 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7462 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7463 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7464 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7465 }
7466 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7467 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7468 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7469 }
7470 }
ffc61ed2 7471 }
1b2d223b
JH
7472#ifdef EBCDIC
7473 literal_endpoint = 0;
7474#endif
8ada0baa 7475 }
ffc61ed2
JH
7476
7477 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7478 }
ffc61ed2 7479
936ed897 7480 if (need_class) {
4f66b38d 7481 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7482 if (SIZE_ONLY)
830247a4 7483 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7484 else
830247a4 7485 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7486 }
ffc61ed2 7487
7f6f358c
YO
7488
7489 if (SIZE_ONLY)
7490 return ret;
7491 /****** !SIZE_ONLY AFTER HERE *********/
7492
7493 if( stored == 1 && value < 256
7494 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7495 ) {
7496 /* optimize single char class to an EXACT node
7497 but *only* when its not a UTF/high char */
07be1b83
YO
7498 const char * cur_parse= RExC_parse;
7499 RExC_emit = (regnode *)orig_emit;
7500 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7501 ret = reg_node(pRExC_state,
7502 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7503 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7504 *STRING(ret)= (char)value;
7505 STR_LEN(ret)= 1;
7506 RExC_emit += STR_SZ(1);
7507 return ret;
7508 }
ae5c130c 7509 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7510 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7511 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7512 ) {
a0ed51b3 7513 for (value = 0; value < 256; ++value) {
936ed897 7514 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7515 UV fold = PL_fold[value];
ffc61ed2
JH
7516
7517 if (fold != value)
7518 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7519 }
7520 }
936ed897 7521 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7522 }
ffc61ed2 7523
ae5c130c 7524 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7525 if (optimize_invert &&
ffc61ed2
JH
7526 /* If the only flag is inversion. */
7527 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7528 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7529 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7530 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7531 }
7f6f358c 7532 {
097eb12c 7533 AV * const av = newAV();
ffc61ed2 7534 SV *rv;
9e55ce06 7535 /* The 0th element stores the character class description
6a0407ee 7536 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7537 * to initialize the appropriate swash (which gets stored in
7538 * the 1st element), and also useful for dumping the regnode.
7539 * The 2nd element stores the multicharacter foldings,
6a0407ee 7540 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7541 av_store(av, 0, listsv);
7542 av_store(av, 1, NULL);
9e55ce06 7543 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7544 rv = newRV_noinc((SV*)av);
19860706 7545 n = add_data(pRExC_state, 1, "s");
830247a4 7546 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7547 ARG_SET(ret, n);
a0ed51b3 7548 }
a0ed51b3
LW
7549 return ret;
7550}
7551
76e3520e 7552STATIC char*
830247a4 7553S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7554{
097eb12c 7555 char* const retval = RExC_parse++;
a0d0e21e 7556
4633a7c4 7557 for (;;) {
830247a4
IZ
7558 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7559 RExC_parse[2] == '#') {
e994fd66
AE
7560 while (*RExC_parse != ')') {
7561 if (RExC_parse == RExC_end)
7562 FAIL("Sequence (?#... not terminated");
830247a4 7563 RExC_parse++;
e994fd66 7564 }
830247a4 7565 RExC_parse++;
4633a7c4
LW
7566 continue;
7567 }
e2509266 7568 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7569 if (isSPACE(*RExC_parse)) {
7570 RExC_parse++;
748a9306
LW
7571 continue;
7572 }
830247a4 7573 else if (*RExC_parse == '#') {
e994fd66
AE
7574 while (RExC_parse < RExC_end)
7575 if (*RExC_parse++ == '\n') break;
748a9306
LW
7576 continue;
7577 }
748a9306 7578 }
4633a7c4 7579 return retval;
a0d0e21e 7580 }
a687059c
LW
7581}
7582
7583/*
c277df42 7584- reg_node - emit a node
a0d0e21e 7585*/
76e3520e 7586STATIC regnode * /* Location. */
830247a4 7587S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7588{
97aff369 7589 dVAR;
c277df42 7590 register regnode *ptr;
504618e9 7591 regnode * const ret = RExC_emit;
07be1b83 7592 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7593
c277df42 7594 if (SIZE_ONLY) {
830247a4
IZ
7595 SIZE_ALIGN(RExC_size);
7596 RExC_size += 1;
a0d0e21e
LW
7597 return(ret);
7598 }
e2e6a0f1
YO
7599#ifdef DEBUGGING
7600 if (OP(RExC_emit) == 255)
7601 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7602 reg_name[op], OP(RExC_emit));
7603#endif
c277df42 7604 NODE_ALIGN_FILL(ret);
a0d0e21e 7605 ptr = ret;
c277df42 7606 FILL_ADVANCE_NODE(ptr, op);
fac92740 7607 if (RExC_offsets) { /* MJD */
07be1b83 7608 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7609 "reg_node", __LINE__,
7610 reg_name[op],
07be1b83
YO
7611 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7612 ? "Overwriting end of array!\n" : "OK",
7613 (UV)(RExC_emit - RExC_emit_start),
7614 (UV)(RExC_parse - RExC_start),
7615 (UV)RExC_offsets[0]));
ccb2c380 7616 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7617 }
07be1b83 7618
830247a4 7619 RExC_emit = ptr;
a0d0e21e 7620 return(ret);
a687059c
LW
7621}
7622
7623/*
a0d0e21e
LW
7624- reganode - emit a node with an argument
7625*/
76e3520e 7626STATIC regnode * /* Location. */
830247a4 7627S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7628{
97aff369 7629 dVAR;
c277df42 7630 register regnode *ptr;
504618e9 7631 regnode * const ret = RExC_emit;
07be1b83 7632 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7633
c277df42 7634 if (SIZE_ONLY) {
830247a4
IZ
7635 SIZE_ALIGN(RExC_size);
7636 RExC_size += 2;
6bda09f9
YO
7637 /*
7638 We can't do this:
7639
7640 assert(2==regarglen[op]+1);
7641
7642 Anything larger than this has to allocate the extra amount.
7643 If we changed this to be:
7644
7645 RExC_size += (1 + regarglen[op]);
7646
7647 then it wouldn't matter. Its not clear what side effect
7648 might come from that so its not done so far.
7649 -- dmq
7650 */
a0d0e21e
LW
7651 return(ret);
7652 }
e2e6a0f1
YO
7653#ifdef DEBUGGING
7654 if (OP(RExC_emit) == 255)
7655 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7656#endif
c277df42 7657 NODE_ALIGN_FILL(ret);
a0d0e21e 7658 ptr = ret;
c277df42 7659 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7660 if (RExC_offsets) { /* MJD */
07be1b83 7661 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7662 "reganode",
ccb2c380
MP
7663 __LINE__,
7664 reg_name[op],
07be1b83 7665 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7666 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7667 (UV)(RExC_emit - RExC_emit_start),
7668 (UV)(RExC_parse - RExC_start),
7669 (UV)RExC_offsets[0]));
ccb2c380 7670 Set_Cur_Node_Offset;
fac92740
MJD
7671 }
7672
830247a4 7673 RExC_emit = ptr;
a0d0e21e 7674 return(ret);
fe14fcc3
LW
7675}
7676
7677/*
cd439c50 7678- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7679*/
71207a34
AL
7680STATIC STRLEN
7681S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7682{
97aff369 7683 dVAR;
71207a34 7684 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7685}
7686
7687/*
a0d0e21e
LW
7688- reginsert - insert an operator in front of already-emitted operand
7689*
7690* Means relocating the operand.
7691*/
76e3520e 7692STATIC void
6bda09f9 7693S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7694{
97aff369 7695 dVAR;
c277df42
IZ
7696 register regnode *src;
7697 register regnode *dst;
7698 register regnode *place;
504618e9 7699 const int offset = regarglen[(U8)op];
6bda09f9 7700 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7701 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7702/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7703 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7704 if (SIZE_ONLY) {
6bda09f9 7705 RExC_size += size;
a0d0e21e
LW
7706 return;
7707 }
a687059c 7708
830247a4 7709 src = RExC_emit;
6bda09f9 7710 RExC_emit += size;
830247a4 7711 dst = RExC_emit;
40d049e4 7712 if (RExC_open_parens) {
6bda09f9 7713 int paren;
6d99fb9b 7714 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7715 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7716 if ( RExC_open_parens[paren] >= opnd ) {
7717 DEBUG_PARSE_FMT("open"," - %d",size);
7718 RExC_open_parens[paren] += size;
7719 } else {
7720 DEBUG_PARSE_FMT("open"," - %s","ok");
7721 }
7722 if ( RExC_close_parens[paren] >= opnd ) {
7723 DEBUG_PARSE_FMT("close"," - %d",size);
7724 RExC_close_parens[paren] += size;
7725 } else {
7726 DEBUG_PARSE_FMT("close"," - %s","ok");
7727 }
7728 }
6bda09f9 7729 }
40d049e4 7730
fac92740 7731 while (src > opnd) {
c277df42 7732 StructCopy(--src, --dst, regnode);
fac92740 7733 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7734 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7735 "reg_insert",
ccb2c380
MP
7736 __LINE__,
7737 reg_name[op],
07be1b83
YO
7738 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7739 ? "Overwriting end of array!\n" : "OK",
7740 (UV)(src - RExC_emit_start),
7741 (UV)(dst - RExC_emit_start),
7742 (UV)RExC_offsets[0]));
ccb2c380
MP
7743 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7744 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7745 }
7746 }
7747
a0d0e21e
LW
7748
7749 place = opnd; /* Op node, where operand used to be. */
fac92740 7750 if (RExC_offsets) { /* MJD */
07be1b83 7751 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7752 "reginsert",
ccb2c380
MP
7753 __LINE__,
7754 reg_name[op],
07be1b83 7755 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7756 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7757 (UV)(place - RExC_emit_start),
7758 (UV)(RExC_parse - RExC_start),
786e8c11 7759 (UV)RExC_offsets[0]));
ccb2c380 7760 Set_Node_Offset(place, RExC_parse);
45948336 7761 Set_Node_Length(place, 1);
fac92740 7762 }
c277df42
IZ
7763 src = NEXTOPER(place);
7764 FILL_ADVANCE_NODE(place, op);
7765 Zero(src, offset, regnode);
a687059c
LW
7766}
7767
7768/*
c277df42 7769- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7770- SEE ALSO: regtail_study
a0d0e21e 7771*/
097eb12c 7772/* TODO: All three parms should be const */
76e3520e 7773STATIC void
3dab1dad 7774S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7775{
97aff369 7776 dVAR;
c277df42 7777 register regnode *scan;
72f13be8 7778 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7779#ifndef DEBUGGING
7780 PERL_UNUSED_ARG(depth);
7781#endif
a0d0e21e 7782
c277df42 7783 if (SIZE_ONLY)
a0d0e21e
LW
7784 return;
7785
7786 /* Find last node. */
7787 scan = p;
7788 for (;;) {
504618e9 7789 regnode * const temp = regnext(scan);
3dab1dad
YO
7790 DEBUG_PARSE_r({
7791 SV * const mysv=sv_newmortal();
7792 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7793 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7794 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7795 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7796 (temp == NULL ? "->" : ""),
7797 (temp == NULL ? reg_name[OP(val)] : "")
7798 );
3dab1dad
YO
7799 });
7800 if (temp == NULL)
7801 break;
7802 scan = temp;
7803 }
7804
7805 if (reg_off_by_arg[OP(scan)]) {
7806 ARG_SET(scan, val - scan);
7807 }
7808 else {
7809 NEXT_OFF(scan) = val - scan;
7810 }
7811}
7812
07be1b83 7813#ifdef DEBUGGING
3dab1dad
YO
7814/*
7815- regtail_study - set the next-pointer at the end of a node chain of p to val.
7816- Look for optimizable sequences at the same time.
7817- currently only looks for EXACT chains.
07be1b83
YO
7818
7819This is expermental code. The idea is to use this routine to perform
7820in place optimizations on branches and groups as they are constructed,
7821with the long term intention of removing optimization from study_chunk so
7822that it is purely analytical.
7823
7824Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7825to control which is which.
7826
3dab1dad
YO
7827*/
7828/* TODO: All four parms should be const */
07be1b83 7829
3dab1dad
YO
7830STATIC U8
7831S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7832{
7833 dVAR;
7834 register regnode *scan;
07be1b83
YO
7835 U8 exact = PSEUDO;
7836#ifdef EXPERIMENTAL_INPLACESCAN
7837 I32 min = 0;
7838#endif
7839
3dab1dad
YO
7840 GET_RE_DEBUG_FLAGS_DECL;
7841
07be1b83 7842
3dab1dad
YO
7843 if (SIZE_ONLY)
7844 return exact;
7845
7846 /* Find last node. */
7847
7848 scan = p;
7849 for (;;) {
7850 regnode * const temp = regnext(scan);
07be1b83
YO
7851#ifdef EXPERIMENTAL_INPLACESCAN
7852 if (PL_regkind[OP(scan)] == EXACT)
7853 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7854 return EXACT;
7855#endif
3dab1dad
YO
7856 if ( exact ) {
7857 switch (OP(scan)) {
7858 case EXACT:
7859 case EXACTF:
7860 case EXACTFL:
7861 if( exact == PSEUDO )
7862 exact= OP(scan);
07be1b83
YO
7863 else if ( exact != OP(scan) )
7864 exact= 0;
3dab1dad
YO
7865 case NOTHING:
7866 break;
7867 default:
7868 exact= 0;
7869 }
7870 }
7871 DEBUG_PARSE_r({
7872 SV * const mysv=sv_newmortal();
7873 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7874 regprop(RExC_rx, mysv, scan);
eaf3ca90 7875 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7876 SvPV_nolen_const(mysv),
eaf3ca90
YO
7877 REG_NODE_NUM(scan),
7878 reg_name[exact]);
3dab1dad 7879 });
a0d0e21e
LW
7880 if (temp == NULL)
7881 break;
7882 scan = temp;
7883 }
07be1b83
YO
7884 DEBUG_PARSE_r({
7885 SV * const mysv_val=sv_newmortal();
7886 DEBUG_PARSE_MSG("");
7887 regprop(RExC_rx, mysv_val, val);
7888 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7889 SvPV_nolen_const(mysv_val),
7890 REG_NODE_NUM(val),
7891 val - scan
7892 );
7893 });
c277df42
IZ
7894 if (reg_off_by_arg[OP(scan)]) {
7895 ARG_SET(scan, val - scan);
a0ed51b3
LW
7896 }
7897 else {
c277df42
IZ
7898 NEXT_OFF(scan) = val - scan;
7899 }
3dab1dad
YO
7900
7901 return exact;
a687059c 7902}
07be1b83 7903#endif
a687059c
LW
7904
7905/*
a687059c
LW
7906 - regcurly - a little FSA that accepts {\d+,?\d*}
7907 */
79072805 7908STATIC I32
5f66b61c 7909S_regcurly(register const char *s)
a687059c
LW
7910{
7911 if (*s++ != '{')
7912 return FALSE;
f0fcb552 7913 if (!isDIGIT(*s))
a687059c 7914 return FALSE;
f0fcb552 7915 while (isDIGIT(*s))
a687059c
LW
7916 s++;
7917 if (*s == ',')
7918 s++;
f0fcb552 7919 while (isDIGIT(*s))
a687059c
LW
7920 s++;
7921 if (*s != '}')
7922 return FALSE;
7923 return TRUE;
7924}
7925
a687059c
LW
7926
7927/*
fd181c75 7928 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7929 */
7930void
097eb12c 7931Perl_regdump(pTHX_ const regexp *r)
a687059c 7932{
35ff7856 7933#ifdef DEBUGGING
97aff369 7934 dVAR;
c445ea15 7935 SV * const sv = sv_newmortal();
ab3bbdeb 7936 SV *dsv= sv_newmortal();
a687059c 7937
786e8c11 7938 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7939
7940 /* Header fields of interest. */
ab3bbdeb
YO
7941 if (r->anchored_substr) {
7942 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7943 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7944 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7945 "anchored %s%s at %"IVdf" ",
7946 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7947 (IV)r->anchored_offset);
ab3bbdeb
YO
7948 } else if (r->anchored_utf8) {
7949 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7950 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7951 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7952 "anchored utf8 %s%s at %"IVdf" ",
7953 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7954 (IV)r->anchored_offset);
ab3bbdeb
YO
7955 }
7956 if (r->float_substr) {
7957 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7958 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7959 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7960 "floating %s%s at %"IVdf"..%"UVuf" ",
7961 s, RE_SV_TAIL(r->float_substr),
7b0972df 7962 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7963 } else if (r->float_utf8) {
7964 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7965 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7966 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7967 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7968 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7969 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7970 }
33b8afdf 7971 if (r->check_substr || r->check_utf8)
b81d288d 7972 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7973 (const char *)
7974 (r->check_substr == r->float_substr
7975 && r->check_utf8 == r->float_utf8
7976 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7977 if (r->reganch & ROPT_NOSCAN)
7978 PerlIO_printf(Perl_debug_log, " noscan");
7979 if (r->reganch & ROPT_CHECK_ALL)
7980 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7981 if (r->check_substr || r->check_utf8)
c277df42
IZ
7982 PerlIO_printf(Perl_debug_log, ") ");
7983
46fc3d4c 7984 if (r->regstclass) {
32fc9b6a 7985 regprop(r, sv, r->regstclass);
1de06328 7986 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7987 }
774d564b 7988 if (r->reganch & ROPT_ANCH) {
7989 PerlIO_printf(Perl_debug_log, "anchored");
7990 if (r->reganch & ROPT_ANCH_BOL)
7991 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7992 if (r->reganch & ROPT_ANCH_MBOL)
7993 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7994 if (r->reganch & ROPT_ANCH_SBOL)
7995 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7996 if (r->reganch & ROPT_ANCH_GPOS)
7997 PerlIO_printf(Perl_debug_log, "(GPOS)");
7998 PerlIO_putc(Perl_debug_log, ' ');
7999 }
c277df42
IZ
8000 if (r->reganch & ROPT_GPOS_SEEN)
8001 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8002 if (r->reganch & ROPT_SKIP)
760ac839 8003 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8004 if (r->reganch & ROPT_IMPLICIT)
760ac839 8005 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8006 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8007 if (r->reganch & ROPT_EVAL_SEEN)
8008 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8009 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8010#else
96a5add6 8011 PERL_UNUSED_CONTEXT;
65e66c80 8012 PERL_UNUSED_ARG(r);
17c3b450 8013#endif /* DEBUGGING */
a687059c
LW
8014}
8015
8016/*
a0d0e21e
LW
8017- regprop - printable representation of opcode
8018*/
46fc3d4c 8019void
32fc9b6a 8020Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8021{
35ff7856 8022#ifdef DEBUGGING
97aff369 8023 dVAR;
9b155405 8024 register int k;
1de06328 8025 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8026
54dc92de 8027 sv_setpvn(sv, "", 0);
03363afd 8028 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8029 /* It would be nice to FAIL() here, but this may be called from
8030 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8031 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8032 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8033
3dab1dad 8034 k = PL_regkind[OP(o)];
9b155405 8035
2a782b5b 8036 if (k == EXACT) {
396482e1 8037 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8038 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8039 * is a crude hack but it may be the best for now since
8040 * we have no flag "this EXACTish node was UTF-8"
8041 * --jhi */
8042 const char * const s =
ddc5bc0f 8043 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8044 PL_colors[0], PL_colors[1],
8045 PERL_PV_ESCAPE_UNI_DETECT |
8046 PERL_PV_PRETTY_ELIPSES |
8047 PERL_PV_PRETTY_LTGT
8048 );
8049 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8050 } else if (k == TRIE) {
3dab1dad 8051 /* print the details of the trie in dumpuntil instead, as
4f639d21 8052 * prog->data isn't available here */
1de06328
YO
8053 const char op = OP(o);
8054 const I32 n = ARG(o);
8055 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8056 (reg_ac_data *)prog->data->data[n] :
8057 NULL;
8058 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8059 (reg_trie_data*)prog->data->data[n] :
8060 ac->trie;
8061
8062 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8063 DEBUG_TRIE_COMPILE_r(
8064 Perl_sv_catpvf(aTHX_ sv,
8065 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8066 (UV)trie->startstate,
1e2e3d02 8067 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8068 (UV)trie->wordcount,
8069 (UV)trie->minlen,
8070 (UV)trie->maxlen,
8071 (UV)TRIE_CHARCOUNT(trie),
8072 (UV)trie->uniquecharcount
8073 )
8074 );
8075 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8076 int i;
8077 int rangestart = -1;
f46cb337 8078 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8079 Perl_sv_catpvf(aTHX_ sv, "[");
8080 for (i = 0; i <= 256; i++) {
8081 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8082 if (rangestart == -1)
8083 rangestart = i;
8084 } else if (rangestart != -1) {
8085 if (i <= rangestart + 3)
8086 for (; rangestart < i; rangestart++)
8087 put_byte(sv, rangestart);
8088 else {
8089 put_byte(sv, rangestart);
8090 sv_catpvs(sv, "-");
8091 put_byte(sv, i - 1);
8092 }
8093 rangestart = -1;
8094 }
8095 }
8096 Perl_sv_catpvf(aTHX_ sv, "]");
8097 }
8098
a3621e74 8099 } else if (k == CURLY) {
cb434fcc 8100 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8101 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8102 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8103 }
2c2d71f5
JH
8104 else if (k == WHILEM && o->flags) /* Ordinal/of */
8105 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8106 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8107 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8108 else if (k == GOSUB)
6bda09f9 8109 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8110 else if (k == VERB) {
8111 if (!o->flags)
8112 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8113 (SV*)prog->data->data[ ARG( o ) ]);
8114 } else if (k == LOGICAL)
04ebc1ab 8115 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8116 else if (k == ANYOF) {
8117 int i, rangestart = -1;
2d03de9c 8118 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8119
8120 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8121 static const char * const anyofs[] = {
653099ff
GS
8122 "\\w",
8123 "\\W",
8124 "\\s",
8125 "\\S",
8126 "\\d",
8127 "\\D",
8128 "[:alnum:]",
8129 "[:^alnum:]",
8130 "[:alpha:]",
8131 "[:^alpha:]",
8132 "[:ascii:]",
8133 "[:^ascii:]",
8134 "[:ctrl:]",
8135 "[:^ctrl:]",
8136 "[:graph:]",
8137 "[:^graph:]",
8138 "[:lower:]",
8139 "[:^lower:]",
8140 "[:print:]",
8141 "[:^print:]",
8142 "[:punct:]",
8143 "[:^punct:]",
8144 "[:upper:]",
aaa51d5e 8145 "[:^upper:]",
653099ff 8146 "[:xdigit:]",
aaa51d5e
JF
8147 "[:^xdigit:]",
8148 "[:space:]",
8149 "[:^space:]",
8150 "[:blank:]",
8151 "[:^blank:]"
653099ff
GS
8152 };
8153
19860706 8154 if (flags & ANYOF_LOCALE)
396482e1 8155 sv_catpvs(sv, "{loc}");
19860706 8156 if (flags & ANYOF_FOLD)
396482e1 8157 sv_catpvs(sv, "{i}");
653099ff 8158 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8159 if (flags & ANYOF_INVERT)
396482e1 8160 sv_catpvs(sv, "^");
ffc61ed2
JH
8161 for (i = 0; i <= 256; i++) {
8162 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8163 if (rangestart == -1)
8164 rangestart = i;
8165 } else if (rangestart != -1) {
8166 if (i <= rangestart + 3)
8167 for (; rangestart < i; rangestart++)
653099ff 8168 put_byte(sv, rangestart);
ffc61ed2
JH
8169 else {
8170 put_byte(sv, rangestart);
396482e1 8171 sv_catpvs(sv, "-");
ffc61ed2 8172 put_byte(sv, i - 1);
653099ff 8173 }
ffc61ed2 8174 rangestart = -1;
653099ff 8175 }
847a199f 8176 }
ffc61ed2
JH
8177
8178 if (o->flags & ANYOF_CLASS)
bb7a0f54 8179 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8180 if (ANYOF_CLASS_TEST(o,i))
8181 sv_catpv(sv, anyofs[i]);
8182
8183 if (flags & ANYOF_UNICODE)
396482e1 8184 sv_catpvs(sv, "{unicode}");
1aa99e6b 8185 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8186 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8187
8188 {
8189 SV *lv;
32fc9b6a 8190 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8191
ffc61ed2
JH
8192 if (lv) {
8193 if (sw) {
89ebb4a3 8194 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8195
ffc61ed2 8196 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8197 uvchr_to_utf8(s, i);
ffc61ed2 8198
3568d838 8199 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8200 if (rangestart == -1)
8201 rangestart = i;
8202 } else if (rangestart != -1) {
ffc61ed2
JH
8203 if (i <= rangestart + 3)
8204 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8205 const U8 * const e = uvchr_to_utf8(s,rangestart);
8206 U8 *p;
8207 for(p = s; p < e; p++)
ffc61ed2
JH
8208 put_byte(sv, *p);
8209 }
8210 else {
2d03de9c
AL
8211 const U8 *e = uvchr_to_utf8(s,rangestart);
8212 U8 *p;
8213 for (p = s; p < e; p++)
ffc61ed2 8214 put_byte(sv, *p);
396482e1 8215 sv_catpvs(sv, "-");
2d03de9c
AL
8216 e = uvchr_to_utf8(s, i-1);
8217 for (p = s; p < e; p++)
1df70142 8218 put_byte(sv, *p);
ffc61ed2
JH
8219 }
8220 rangestart = -1;
8221 }
19860706 8222 }
ffc61ed2 8223
396482e1 8224 sv_catpvs(sv, "..."); /* et cetera */
19860706 8225 }
fde631ed 8226
ffc61ed2 8227 {
2e0de35c 8228 char *s = savesvpv(lv);
c445ea15 8229 char * const origs = s;
b81d288d 8230
3dab1dad
YO
8231 while (*s && *s != '\n')
8232 s++;
b81d288d 8233
ffc61ed2 8234 if (*s == '\n') {
2d03de9c 8235 const char * const t = ++s;
ffc61ed2
JH
8236
8237 while (*s) {
8238 if (*s == '\n')
8239 *s = ' ';
8240 s++;
8241 }
8242 if (s[-1] == ' ')
8243 s[-1] = 0;
8244
8245 sv_catpv(sv, t);
fde631ed 8246 }
b81d288d 8247
ffc61ed2 8248 Safefree(origs);
fde631ed
JH
8249 }
8250 }
653099ff 8251 }
ffc61ed2 8252
653099ff
GS
8253 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8254 }
9b155405 8255 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8256 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8257#else
96a5add6 8258 PERL_UNUSED_CONTEXT;
65e66c80
SP
8259 PERL_UNUSED_ARG(sv);
8260 PERL_UNUSED_ARG(o);
f9049ba1 8261 PERL_UNUSED_ARG(prog);
17c3b450 8262#endif /* DEBUGGING */
35ff7856 8263}
a687059c 8264
cad2e5aa
JH
8265SV *
8266Perl_re_intuit_string(pTHX_ regexp *prog)
8267{ /* Assume that RE_INTUIT is set */
97aff369 8268 dVAR;
a3621e74 8269 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8270 PERL_UNUSED_CONTEXT;
8271
a3621e74 8272 DEBUG_COMPILE_r(
cfd0369c 8273 {
2d03de9c 8274 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8275 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8276
8277 if (!PL_colorset) reginitcolors();
8278 PerlIO_printf(Perl_debug_log,
a0288114 8279 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8280 PL_colors[4],
8281 prog->check_substr ? "" : "utf8 ",
8282 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8283 s,
8284 PL_colors[1],
8285 (strlen(s) > 60 ? "..." : ""));
8286 } );
8287
33b8afdf 8288 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8289}
8290
84da74a7
YO
8291/*
8292 pregfree - free a regexp
8293
8294 See regdupe below if you change anything here.
8295*/
8296
2b69d0c2 8297void
864dbfa3 8298Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8299{
27da23d5 8300 dVAR;
0df25f3d 8301
fc32ee4a 8302 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8303
7821416a
IZ
8304 if (!r || (--r->refcnt > 0))
8305 return;
ab3bbdeb 8306 DEBUG_COMPILE_r({
0df25f3d
YO
8307 if (!PL_colorset)
8308 reginitcolors();
ab3bbdeb
YO
8309 if (RX_DEBUG(r)){
8310 SV *dsv= sv_newmortal();
8311 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8312 dsv, r->precomp, r->prelen, 60);
8313 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8314 PL_colors[4],PL_colors[5],s);
8315 }
9e55ce06 8316 });
cad2e5aa 8317
43c5f42d
NC
8318 /* gcov results gave these as non-null 100% of the time, so there's no
8319 optimisation in checking them before calling Safefree */
8320 Safefree(r->precomp);
8321 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8322 RX_MATCH_COPY_FREE(r);
f8c7b90f 8323#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8324 if (r->saved_copy)
8325 SvREFCNT_dec(r->saved_copy);
8326#endif
a193d654
GS
8327 if (r->substrs) {
8328 if (r->anchored_substr)
8329 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8330 if (r->anchored_utf8)
8331 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8332 if (r->float_substr)
8333 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8334 if (r->float_utf8)
8335 SvREFCNT_dec(r->float_utf8);
2779dcf1 8336 Safefree(r->substrs);
a193d654 8337 }
81714fb9
YO
8338 if (r->paren_names)
8339 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8340 if (r->data) {
8341 int n = r->data->count;
f3548bdc
DM
8342 PAD* new_comppad = NULL;
8343 PAD* old_comppad;
4026c95a 8344 PADOFFSET refcnt;
dfad63ad 8345
c277df42 8346 while (--n >= 0) {
261faec3 8347 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8348 switch (r->data->what[n]) {
8349 case 's':
81714fb9 8350 case 'S':
c277df42
IZ
8351 SvREFCNT_dec((SV*)r->data->data[n]);
8352 break;
653099ff
GS
8353 case 'f':
8354 Safefree(r->data->data[n]);
8355 break;
dfad63ad
HS
8356 case 'p':
8357 new_comppad = (AV*)r->data->data[n];
8358 break;
c277df42 8359 case 'o':
dfad63ad 8360 if (new_comppad == NULL)
cea2e8a9 8361 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8362 PAD_SAVE_LOCAL(old_comppad,
8363 /* Watch out for global destruction's random ordering. */
c445ea15 8364 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8365 );
b34c0dd4 8366 OP_REFCNT_LOCK;
4026c95a
SH
8367 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8368 OP_REFCNT_UNLOCK;
8369 if (!refcnt)
9b978d73 8370 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8371
f3548bdc 8372 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8373 SvREFCNT_dec((SV*)new_comppad);
8374 new_comppad = NULL;
c277df42
IZ
8375 break;
8376 case 'n':
9e55ce06 8377 break;
07be1b83 8378 case 'T':
be8e71aa
YO
8379 { /* Aho Corasick add-on structure for a trie node.
8380 Used in stclass optimization only */
07be1b83
YO
8381 U32 refcount;
8382 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8383 OP_REFCNT_LOCK;
8384 refcount = --aho->refcount;
8385 OP_REFCNT_UNLOCK;
8386 if ( !refcount ) {
8387 Safefree(aho->states);
8388 Safefree(aho->fail);
8389 aho->trie=NULL; /* not necessary to free this as it is
8390 handled by the 't' case */
8391 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8392 Safefree(r->regstclass);
07be1b83
YO
8393 }
8394 }
8395 break;
a3621e74 8396 case 't':
07be1b83 8397 {
be8e71aa 8398 /* trie structure. */
07be1b83
YO
8399 U32 refcount;
8400 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8401 OP_REFCNT_LOCK;
8402 refcount = --trie->refcount;
8403 OP_REFCNT_UNLOCK;
8404 if ( !refcount ) {
8405 Safefree(trie->charmap);
8406 if (trie->widecharmap)
8407 SvREFCNT_dec((SV*)trie->widecharmap);
8408 Safefree(trie->states);
8409 Safefree(trie->trans);
8410 if (trie->bitmap)
8411 Safefree(trie->bitmap);
8412 if (trie->wordlen)
8413 Safefree(trie->wordlen);
786e8c11
YO
8414 if (trie->jump)
8415 Safefree(trie->jump);
8416 if (trie->nextword)
8417 Safefree(trie->nextword);
a3621e74 8418#ifdef DEBUGGING
be8e71aa
YO
8419 if (RX_DEBUG(r)) {
8420 if (trie->words)
8421 SvREFCNT_dec((SV*)trie->words);
8422 if (trie->revcharmap)
8423 SvREFCNT_dec((SV*)trie->revcharmap);
8424 }
a3621e74 8425#endif
07be1b83 8426 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8427 }
07be1b83
YO
8428 }
8429 break;
c277df42 8430 default:
830247a4 8431 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8432 }
8433 }
8434 Safefree(r->data->what);
8435 Safefree(r->data);
a0d0e21e
LW
8436 }
8437 Safefree(r->startp);
8438 Safefree(r->endp);
8439 Safefree(r);
a687059c 8440}
c277df42 8441
84da74a7
YO
8442#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8443#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8444#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8445#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8446
8447/*
8448 regdupe - duplicate a regexp.
8449
8450 This routine is called by sv.c's re_dup and is expected to clone a
8451 given regexp structure. It is a no-op when not under USE_ITHREADS.
8452 (Originally this *was* re_dup() for change history see sv.c)
8453
8454 See pregfree() above if you change anything here.
8455*/
a3c0e9ca 8456#if defined(USE_ITHREADS)
84da74a7
YO
8457regexp *
8458Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8459{
84da74a7
YO
8460 dVAR;
8461 REGEXP *ret;
8462 int i, len, npar;
8463 struct reg_substr_datum *s;
8464
8465 if (!r)
8466 return (REGEXP *)NULL;
8467
8468 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8469 return ret;
8470
8471 len = r->offsets[0];
8472 npar = r->nparens+1;
8473
8474 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8475 Copy(r->program, ret->program, len+1, regnode);
8476
8477 Newx(ret->startp, npar, I32);
8478 Copy(r->startp, ret->startp, npar, I32);
8479 Newx(ret->endp, npar, I32);
8480 Copy(r->startp, ret->startp, npar, I32);
8481
8482 Newx(ret->substrs, 1, struct reg_substr_data);
8483 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8484 s->min_offset = r->substrs->data[i].min_offset;
8485 s->max_offset = r->substrs->data[i].max_offset;
8486 s->end_shift = r->substrs->data[i].end_shift;
8487 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8488 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8489 }
8490
8491 ret->regstclass = NULL;
8492 if (r->data) {
8493 struct reg_data *d;
8494 const int count = r->data->count;
8495 int i;
8496
8497 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8498 char, struct reg_data);
8499 Newx(d->what, count, U8);
8500
8501 d->count = count;
8502 for (i = 0; i < count; i++) {
8503 d->what[i] = r->data->what[i];
8504 switch (d->what[i]) {
e2e6a0f1 8505 /* legal options are one of: sSfpont
84da74a7
YO
8506 see also regcomp.h and pregfree() */
8507 case 's':
81714fb9 8508 case 'S':
84da74a7
YO
8509 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8510 break;
8511 case 'p':
8512 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8513 break;
8514 case 'f':
8515 /* This is cheating. */
8516 Newx(d->data[i], 1, struct regnode_charclass_class);
8517 StructCopy(r->data->data[i], d->data[i],
8518 struct regnode_charclass_class);
8519 ret->regstclass = (regnode*)d->data[i];
8520 break;
8521 case 'o':
8522 /* Compiled op trees are readonly, and can thus be
8523 shared without duplication. */
8524 OP_REFCNT_LOCK;
8525 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8526 OP_REFCNT_UNLOCK;
8527 break;
8528 case 'n':
8529 d->data[i] = r->data->data[i];
8530 break;
8531 case 't':
8532 d->data[i] = r->data->data[i];
8533 OP_REFCNT_LOCK;
8534 ((reg_trie_data*)d->data[i])->refcount++;
8535 OP_REFCNT_UNLOCK;
8536 break;
8537 case 'T':
8538 d->data[i] = r->data->data[i];
8539 OP_REFCNT_LOCK;
8540 ((reg_ac_data*)d->data[i])->refcount++;
8541 OP_REFCNT_UNLOCK;
8542 /* Trie stclasses are readonly and can thus be shared
8543 * without duplication. We free the stclass in pregfree
8544 * when the corresponding reg_ac_data struct is freed.
8545 */
8546 ret->regstclass= r->regstclass;
8547 break;
8548 default:
8549 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8550 }
8551 }
8552
8553 ret->data = d;
8554 }
8555 else
8556 ret->data = NULL;
8557
8558 Newx(ret->offsets, 2*len+1, U32);
8559 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8560
8561 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8562 ret->refcnt = r->refcnt;
8563 ret->minlen = r->minlen;
8564 ret->prelen = r->prelen;
8565 ret->nparens = r->nparens;
8566 ret->lastparen = r->lastparen;
8567 ret->lastcloseparen = r->lastcloseparen;
8568 ret->reganch = r->reganch;
8569
8570 ret->sublen = r->sublen;
8571
f9f4320a 8572 ret->engine = r->engine;
81714fb9
YO
8573
8574 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8575
84da74a7
YO
8576 if (RX_MATCH_COPIED(ret))
8577 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8578 else
8579 ret->subbeg = NULL;
8580#ifdef PERL_OLD_COPY_ON_WRITE
8581 ret->saved_copy = NULL;
8582#endif
8583
8584 ptr_table_store(PL_ptr_table, r, ret);
8585 return ret;
84da74a7 8586}
a3c0e9ca 8587#endif
84da74a7 8588
76234dfb 8589#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8590/*
8591 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8592 */
8593regnode *
864dbfa3 8594Perl_regnext(pTHX_ register regnode *p)
c277df42 8595{
97aff369 8596 dVAR;
c277df42
IZ
8597 register I32 offset;
8598
3280af22 8599 if (p == &PL_regdummy)
c277df42
IZ
8600 return(NULL);
8601
8602 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8603 if (offset == 0)
8604 return(NULL);
8605
c277df42 8606 return(p+offset);
c277df42 8607}
76234dfb 8608#endif
c277df42 8609
01f988be 8610STATIC void
cea2e8a9 8611S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8612{
8613 va_list args;
8614 STRLEN l1 = strlen(pat1);
8615 STRLEN l2 = strlen(pat2);
8616 char buf[512];
06bf62c7 8617 SV *msv;
73d840c0 8618 const char *message;
c277df42
IZ
8619
8620 if (l1 > 510)
8621 l1 = 510;
8622 if (l1 + l2 > 510)
8623 l2 = 510 - l1;
8624 Copy(pat1, buf, l1 , char);
8625 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8626 buf[l1 + l2] = '\n';
8627 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8628#ifdef I_STDARG
8629 /* ANSI variant takes additional second argument */
c277df42 8630 va_start(args, pat2);
8736538c
AS
8631#else
8632 va_start(args);
8633#endif
5a844595 8634 msv = vmess(buf, &args);
c277df42 8635 va_end(args);
cfd0369c 8636 message = SvPV_const(msv,l1);
c277df42
IZ
8637 if (l1 > 512)
8638 l1 = 512;
8639 Copy(message, buf, l1 , char);
197cf9b9 8640 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8641 Perl_croak(aTHX_ "%s", buf);
c277df42 8642}
a0ed51b3
LW
8643
8644/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8645
76234dfb 8646#ifndef PERL_IN_XSUB_RE
a0ed51b3 8647void
864dbfa3 8648Perl_save_re_context(pTHX)
b81d288d 8649{
97aff369 8650 dVAR;
1ade1aa1
NC
8651
8652 struct re_save_state *state;
8653
8654 SAVEVPTR(PL_curcop);
8655 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8656
8657 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8658 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8659 SSPUSHINT(SAVEt_RE_STATE);
8660
46ab3289 8661 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8662
a0ed51b3 8663 PL_reg_start_tmp = 0;
a0ed51b3 8664 PL_reg_start_tmpl = 0;
c445ea15 8665 PL_reg_oldsaved = NULL;
a5db57d6 8666 PL_reg_oldsavedlen = 0;
a5db57d6 8667 PL_reg_maxiter = 0;
a5db57d6 8668 PL_reg_leftiter = 0;
c445ea15 8669 PL_reg_poscache = NULL;
a5db57d6 8670 PL_reg_poscache_size = 0;
1ade1aa1
NC
8671#ifdef PERL_OLD_COPY_ON_WRITE
8672 PL_nrs = NULL;
8673#endif
ada6e8a9 8674
c445ea15
AL
8675 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8676 if (PL_curpm) {
8677 const REGEXP * const rx = PM_GETRE(PL_curpm);
8678 if (rx) {
1df70142 8679 U32 i;
ada6e8a9 8680 for (i = 1; i <= rx->nparens; i++) {
1df70142 8681 char digits[TYPE_CHARS(long)];
d9fad198 8682 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8683 GV *const *const gvp
8684 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8685
b37c2d43
AL
8686 if (gvp) {
8687 GV * const gv = *gvp;
8688 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8689 save_scalar(gv);
49f27e4b 8690 }
ada6e8a9
AMS
8691 }
8692 }
8693 }
a0ed51b3 8694}
76234dfb 8695#endif
51371543 8696
51371543 8697static void
acfe0abc 8698clear_re(pTHX_ void *r)
51371543 8699{
97aff369 8700 dVAR;
51371543
GS
8701 ReREFCNT_dec((regexp *)r);
8702}
ffbc6a93 8703
a28509cc
AL
8704#ifdef DEBUGGING
8705
8706STATIC void
8707S_put_byte(pTHX_ SV *sv, int c)
8708{
8709 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8710 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8711 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8712 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8713 else
8714 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8715}
8716
786e8c11 8717
3dab1dad
YO
8718#define CLEAR_OPTSTART \
8719 if (optstart) STMT_START { \
07be1b83 8720 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8721 optstart=NULL; \
8722 } STMT_END
8723
786e8c11 8724#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8725
b5a2f8d8
NC
8726STATIC const regnode *
8727S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8728 const regnode *last, const regnode *plast,
8729 SV* sv, I32 indent, U32 depth)
a28509cc 8730{
97aff369 8731 dVAR;
786e8c11 8732 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8733 register const regnode *next;
3dab1dad
YO
8734 const regnode *optstart= NULL;
8735 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8736
786e8c11
YO
8737#ifdef DEBUG_DUMPUNTIL
8738 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8739 last ? last-start : 0,plast ? plast-start : 0);
8740#endif
8741
8742 if (plast && plast < last)
8743 last= plast;
8744
8745 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8746 /* While that wasn't END last time... */
8747
8748 NODE_ALIGN(node);
8749 op = OP(node);
8750 if (op == CLOSE)
786e8c11 8751 indent--;
b5a2f8d8 8752 next = regnext((regnode *)node);
07be1b83 8753
a28509cc 8754 /* Where, what. */
8e11feef 8755 if (OP(node) == OPTIMIZED) {
e68ec53f 8756 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8757 optstart = node;
3dab1dad 8758 else
8e11feef 8759 goto after_print;
3dab1dad
YO
8760 } else
8761 CLEAR_OPTSTART;
07be1b83 8762
32fc9b6a 8763 regprop(r, sv, node);
a28509cc 8764 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8765 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8766
8767 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8768 if (next == NULL) /* Next ptr. */
8769 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8770 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8771 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8772 else
8773 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8774
1de06328 8775 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8776 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8777 }
8778
a28509cc
AL
8779 after_print:
8780 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8781 assert(next);
8782 {
8783 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8784 ? regnext((regnode *)next)
8785 : next);
be8e71aa
YO
8786 if (last && nnode > last)
8787 nnode = last;
786e8c11 8788 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8789 }
a28509cc
AL
8790 }
8791 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8792 assert(next);
786e8c11 8793 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8794 }
8795 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 8796 const regnode *this_trie = node;
1de06328 8797 const char op = OP(node);
a28509cc 8798 const I32 n = ARG(node);
1de06328
YO
8799 const reg_ac_data * const ac = op>=AHOCORASICK ?
8800 (reg_ac_data *)r->data->data[n] :
8801 NULL;
8802 const reg_trie_data * const trie = op<AHOCORASICK ?
8803 (reg_trie_data*)r->data->data[n] :
8804 ac->trie;
786e8c11 8805 const regnode *nextbranch= NULL;
a28509cc 8806 I32 word_idx;
1de06328 8807 sv_setpvn(sv, "", 0);
786e8c11 8808 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8809 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8810
8811 PerlIO_printf(Perl_debug_log, "%*s%s ",
8812 (int)(2*(indent+3)), "",
8813 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8814 PL_colors[0], PL_colors[1],
8815 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8816 PERL_PV_PRETTY_ELIPSES |
7f69552c 8817 PERL_PV_PRETTY_LTGT
786e8c11
YO
8818 )
8819 : "???"
8820 );
8821 if (trie->jump) {
40d049e4 8822 U16 dist= trie->jump[word_idx+1];
7f69552c 8823 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 8824 (dist ? this_trie + dist : next) - start);
786e8c11
YO
8825 if (dist) {
8826 if (!nextbranch)
24b23f37 8827 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
8828 DUMPUNTIL(this_trie + dist, nextbranch);
8829 }
786e8c11
YO
8830 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8831 nextbranch= regnext((regnode *)nextbranch);
8832 } else {
8833 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8834 }
786e8c11
YO
8835 }
8836 if (last && next > last)
8837 node= last;
8838 else
8839 node= next;
a28509cc 8840 }
786e8c11
YO
8841 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8842 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8843 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8844 }
8845 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8846 assert(next);
786e8c11 8847 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8848 }
8849 else if ( op == PLUS || op == STAR) {
786e8c11 8850 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8851 }
8852 else if (op == ANYOF) {
8853 /* arglen 1 + class block */
8854 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8855 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8856 node = NEXTOPER(node);
8857 }
8858 else if (PL_regkind[(U8)op] == EXACT) {
8859 /* Literal string, where present. */
8860 node += NODE_SZ_STR(node) - 1;
8861 node = NEXTOPER(node);
8862 }
8863 else {
8864 node = NEXTOPER(node);
8865 node += regarglen[(U8)op];
8866 }
8867 if (op == CURLYX || op == OPEN)
786e8c11 8868 indent++;
a28509cc 8869 else if (op == WHILEM)
786e8c11 8870 indent--;
a28509cc 8871 }
3dab1dad 8872 CLEAR_OPTSTART;
786e8c11
YO
8873#ifdef DEBUG_DUMPUNTIL
8874 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8875#endif
1de06328 8876 return node;
a28509cc
AL
8877}
8878
8879#endif /* DEBUGGING */
8880
241d1a3b
NC
8881/*
8882 * Local variables:
8883 * c-indentation-style: bsd
8884 * c-basic-offset: 4
8885 * indent-tabs-mode: t
8886 * End:
8887 *
37442d52
RGS
8888 * ex: set ts=8 sts=4 sw=4 noet:
8889 */