This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move the test files for the debugger in a t/ subdirectory, so they
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
e2e6a0f1 117 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
118 I32 extralen;
119 I32 seen_zerolen;
120 I32 seen_evals;
40d049e4
YO
121 regnode **open_parens; /* pointers to open parens */
122 regnode **close_parens; /* pointers to close parens */
123 regnode *opend; /* END node in program */
1aa99e6b 124 I32 utf8;
6bda09f9 125 HV *charnames; /* cache of named sequences */
81714fb9 126 HV *paren_names; /* Paren names */
40d049e4
YO
127 regnode **recurse; /* Recurse regops */
128 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
129#if ADD_TO_REGEXEC
130 char *starttry; /* -Dr: where regtry was called. */
131#define RExC_starttry (pRExC_state->starttry)
132#endif
3dab1dad 133#ifdef DEBUGGING
be8e71aa 134 const char *lastparse;
3dab1dad
YO
135 I32 lastnum;
136#define RExC_lastparse (pRExC_state->lastparse)
137#define RExC_lastnum (pRExC_state->lastnum)
138#endif
830247a4
IZ
139} RExC_state_t;
140
e2509266 141#define RExC_flags (pRExC_state->flags)
830247a4
IZ
142#define RExC_precomp (pRExC_state->precomp)
143#define RExC_rx (pRExC_state->rx)
fac92740 144#define RExC_start (pRExC_state->start)
830247a4
IZ
145#define RExC_end (pRExC_state->end)
146#define RExC_parse (pRExC_state->parse)
147#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 148#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 149#define RExC_emit (pRExC_state->emit)
fac92740 150#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
151#define RExC_naughty (pRExC_state->naughty)
152#define RExC_sawback (pRExC_state->sawback)
153#define RExC_seen (pRExC_state->seen)
154#define RExC_size (pRExC_state->size)
155#define RExC_npar (pRExC_state->npar)
e2e6a0f1 156#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
157#define RExC_extralen (pRExC_state->extralen)
158#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
159#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 160#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 161#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
162#define RExC_open_parens (pRExC_state->open_parens)
163#define RExC_close_parens (pRExC_state->close_parens)
164#define RExC_opend (pRExC_state->opend)
81714fb9 165#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
166#define RExC_recurse (pRExC_state->recurse)
167#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 168
a687059c
LW
169#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
170#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
171 ((*s) == '{' && regcurly(s)))
a687059c 172
35c8bce7
LW
173#ifdef SPSTART
174#undef SPSTART /* dratted cpp namespace... */
175#endif
a687059c
LW
176/*
177 * Flags to be passed up and down.
178 */
a687059c 179#define WORST 0 /* Worst case. */
821b33a5 180#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
181#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
182#define SPSTART 0x4 /* Starts with * or +. */
183#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 184
3dab1dad
YO
185#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
186
07be1b83
YO
187/* whether trie related optimizations are enabled */
188#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
189#define TRIE_STUDY_OPT
786e8c11 190#define FULL_TRIE_STUDY
07be1b83
YO
191#define TRIE_STCLASS
192#endif
1de06328
YO
193
194
40d049e4
YO
195
196#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
197#define PBITVAL(paren) (1 << ((paren) & 7))
198#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
199#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
200#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
201
202
1de06328
YO
203/* About scan_data_t.
204
205 During optimisation we recurse through the regexp program performing
206 various inplace (keyhole style) optimisations. In addition study_chunk
207 and scan_commit populate this data structure with information about
208 what strings MUST appear in the pattern. We look for the longest
209 string that must appear for at a fixed location, and we look for the
210 longest string that may appear at a floating location. So for instance
211 in the pattern:
212
213 /FOO[xX]A.*B[xX]BAR/
214
215 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
216 strings (because they follow a .* construct). study_chunk will identify
217 both FOO and BAR as being the longest fixed and floating strings respectively.
218
219 The strings can be composites, for instance
220
221 /(f)(o)(o)/
222
223 will result in a composite fixed substring 'foo'.
224
225 For each string some basic information is maintained:
226
227 - offset or min_offset
228 This is the position the string must appear at, or not before.
229 It also implicitly (when combined with minlenp) tells us how many
230 character must match before the string we are searching.
231 Likewise when combined with minlenp and the length of the string
232 tells us how many characters must appear after the string we have
233 found.
234
235 - max_offset
236 Only used for floating strings. This is the rightmost point that
237 the string can appear at. Ifset to I32 max it indicates that the
238 string can occur infinitely far to the right.
239
240 - minlenp
241 A pointer to the minimum length of the pattern that the string
242 was found inside. This is important as in the case of positive
243 lookahead or positive lookbehind we can have multiple patterns
244 involved. Consider
245
246 /(?=FOO).*F/
247
248 The minimum length of the pattern overall is 3, the minimum length
249 of the lookahead part is 3, but the minimum length of the part that
250 will actually match is 1. So 'FOO's minimum length is 3, but the
251 minimum length for the F is 1. This is important as the minimum length
252 is used to determine offsets in front of and behind the string being
253 looked for. Since strings can be composites this is the length of the
254 pattern at the time it was commited with a scan_commit. Note that
255 the length is calculated by study_chunk, so that the minimum lengths
256 are not known until the full pattern has been compiled, thus the
257 pointer to the value.
258
259 - lookbehind
260
261 In the case of lookbehind the string being searched for can be
262 offset past the start point of the final matching string.
263 If this value was just blithely removed from the min_offset it would
264 invalidate some of the calculations for how many chars must match
265 before or after (as they are derived from min_offset and minlen and
266 the length of the string being searched for).
267 When the final pattern is compiled and the data is moved from the
268 scan_data_t structure into the regexp structure the information
269 about lookbehind is factored in, with the information that would
270 have been lost precalculated in the end_shift field for the
271 associated string.
272
273 The fields pos_min and pos_delta are used to store the minimum offset
274 and the delta to the maximum offset at the current point in the pattern.
275
276*/
2c2d71f5
JH
277
278typedef struct scan_data_t {
1de06328
YO
279 /*I32 len_min; unused */
280 /*I32 len_delta; unused */
2c2d71f5
JH
281 I32 pos_min;
282 I32 pos_delta;
283 SV *last_found;
1de06328 284 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
285 I32 last_start_min;
286 I32 last_start_max;
1de06328
YO
287 SV **longest; /* Either &l_fixed, or &l_float. */
288 SV *longest_fixed; /* longest fixed string found in pattern */
289 I32 offset_fixed; /* offset where it starts */
290 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
291 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
292 SV *longest_float; /* longest floating string found in pattern */
293 I32 offset_float_min; /* earliest point in string it can appear */
294 I32 offset_float_max; /* latest point in string it can appear */
295 I32 *minlen_float; /* pointer to the minlen relevent to the string */
296 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
297 I32 flags;
298 I32 whilem_c;
cb434fcc 299 I32 *last_closep;
653099ff 300 struct regnode_charclass_class *start_class;
2c2d71f5
JH
301} scan_data_t;
302
a687059c 303/*
e50aee73 304 * Forward declarations for pregcomp()'s friends.
a687059c 305 */
a0d0e21e 306
27da23d5 307static const scan_data_t zero_scan_data =
1de06328 308 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
309
310#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
311#define SF_BEFORE_SEOL 0x0001
312#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
313#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
314#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
315
09b7f37c
CB
316#ifdef NO_UNARY_PLUS
317# define SF_FIX_SHIFT_EOL (0+2)
318# define SF_FL_SHIFT_EOL (0+4)
319#else
320# define SF_FIX_SHIFT_EOL (+2)
321# define SF_FL_SHIFT_EOL (+4)
322#endif
c277df42
IZ
323
324#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
325#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
326
327#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
328#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
329#define SF_IS_INF 0x0040
330#define SF_HAS_PAR 0x0080
331#define SF_IN_PAR 0x0100
332#define SF_HAS_EVAL 0x0200
333#define SCF_DO_SUBSTR 0x0400
653099ff
GS
334#define SCF_DO_STCLASS_AND 0x0800
335#define SCF_DO_STCLASS_OR 0x1000
336#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 337#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 338
786e8c11 339#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 340#define SCF_SEEN_ACCEPT 0x8000
07be1b83 341
eb160463 342#define UTF (RExC_utf8 != 0)
e2509266
JH
343#define LOC ((RExC_flags & PMf_LOCALE) != 0)
344#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 345
ffc61ed2 346#define OOB_UNICODE 12345678
93733859 347#define OOB_NAMEDCLASS -1
b8c5462f 348
a0ed51b3
LW
349#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
350#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
351
8615cb43 352
b45f050a
JF
353/* length of regex to show in messages that don't mark a position within */
354#define RegexLengthToShowInErrorMessages 127
355
356/*
357 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
358 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
359 * op/pragma/warn/regcomp.
360 */
7253e4e3
RK
361#define MARKER1 "<-- HERE" /* marker as it appears in the description */
362#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 363
7253e4e3 364#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
365
366/*
367 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
368 * arg. Show regex, up to a maximum length. If it's too long, chop and add
369 * "...".
370 */
ccb2c380 371#define FAIL(msg) STMT_START { \
bfed75c6 372 const char *ellipses = ""; \
ccb2c380
MP
373 IV len = RExC_end - RExC_precomp; \
374 \
375 if (!SIZE_ONLY) \
376 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
377 if (len > RegexLengthToShowInErrorMessages) { \
378 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
379 len = RegexLengthToShowInErrorMessages - 10; \
380 ellipses = "..."; \
381 } \
382 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
383 msg, (int)len, RExC_precomp, ellipses); \
384} STMT_END
8615cb43 385
b45f050a 386/*
b45f050a
JF
387 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
388 */
ccb2c380 389#define Simple_vFAIL(m) STMT_START { \
a28509cc 390 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
391 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
392 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
393} STMT_END
b45f050a
JF
394
395/*
396 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
397 */
ccb2c380
MP
398#define vFAIL(m) STMT_START { \
399 if (!SIZE_ONLY) \
400 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
401 Simple_vFAIL(m); \
402} STMT_END
b45f050a
JF
403
404/*
405 * Like Simple_vFAIL(), but accepts two arguments.
406 */
ccb2c380 407#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 408 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
409 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
410 (int)offset, RExC_precomp, RExC_precomp + offset); \
411} STMT_END
b45f050a
JF
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
415 */
ccb2c380
MP
416#define vFAIL2(m,a1) STMT_START { \
417 if (!SIZE_ONLY) \
418 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
419 Simple_vFAIL2(m, a1); \
420} STMT_END
b45f050a
JF
421
422
423/*
424 * Like Simple_vFAIL(), but accepts three arguments.
425 */
ccb2c380 426#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 427 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
428 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
429 (int)offset, RExC_precomp, RExC_precomp + offset); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
434 */
ccb2c380
MP
435#define vFAIL3(m,a1,a2) STMT_START { \
436 if (!SIZE_ONLY) \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
438 Simple_vFAIL3(m, a1, a2); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Like Simple_vFAIL(), but accepts four arguments.
443 */
ccb2c380 444#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 445 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
448} STMT_END
b45f050a 449
ccb2c380 450#define vWARN(loc,m) STMT_START { \
a28509cc 451 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
452 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
453 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
454} STMT_END
455
456#define vWARNdep(loc,m) STMT_START { \
a28509cc 457 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
458 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
459 "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
461} STMT_END
462
463
464#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 465 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
466 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
467 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
468} STMT_END
469
470#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 471 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
472 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
473 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
475
476#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
478 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
480} STMT_END
481
482#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
484 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
485 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
486} STMT_END
9d1d55b5 487
8615cb43 488
cd439c50 489/* Allow for side effects in s */
ccb2c380
MP
490#define REGC(c,s) STMT_START { \
491 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
492} STMT_END
cd439c50 493
fac92740
MJD
494/* Macros for recording node offsets. 20001227 mjd@plover.com
495 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
496 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
497 * Element 0 holds the number n.
07be1b83 498 * Position is 1 indexed.
fac92740
MJD
499 */
500
ccb2c380
MP
501#define Set_Node_Offset_To_R(node,byte) STMT_START { \
502 if (! SIZE_ONLY) { \
503 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 504 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 505 if((node) < 0) { \
551405c4 506 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
507 } else { \
508 RExC_offsets[2*(node)-1] = (byte); \
509 } \
510 } \
511} STMT_END
512
513#define Set_Node_Offset(node,byte) \
514 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
515#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
516
517#define Set_Node_Length_To_R(node,len) STMT_START { \
518 if (! SIZE_ONLY) { \
519 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 520 __LINE__, (int)(node), (int)(len))); \
ccb2c380 521 if((node) < 0) { \
551405c4 522 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
523 } else { \
524 RExC_offsets[2*(node)] = (len); \
525 } \
526 } \
527} STMT_END
528
529#define Set_Node_Length(node,len) \
530 Set_Node_Length_To_R((node)-RExC_emit_start, len)
531#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
532#define Set_Node_Cur_Length(node) \
533 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
534
535/* Get offsets and lengths */
536#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
537#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
538
07be1b83
YO
539#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
540 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
541 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
542} STMT_END
543
544
545#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
546#define EXPERIMENTAL_INPLACESCAN
547#endif
548
1de06328 549#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 550DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
551 PerlIO_printf(Perl_debug_log, \
552 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
553 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
554 (int)(depth)*2, "", \
555 (IV)((data)->pos_min), \
556 (IV)((data)->pos_delta), \
557 (IV)((data)->flags), \
558 (IV)((data)->whilem_c), \
559 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
560 ); \
561 if ((data)->last_found) \
562 PerlIO_printf(Perl_debug_log, \
563 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
564 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
565 SvPVX_const((data)->last_found), \
566 (IV)((data)->last_end), \
567 (IV)((data)->last_start_min), \
568 (IV)((data)->last_start_max), \
569 ((data)->longest && \
570 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
571 SvPVX_const((data)->longest_fixed), \
572 (IV)((data)->offset_fixed), \
573 ((data)->longest && \
574 (data)->longest==&((data)->longest_float)) ? "*" : "", \
575 SvPVX_const((data)->longest_float), \
576 (IV)((data)->offset_float_min), \
577 (IV)((data)->offset_float_max) \
578 ); \
579 PerlIO_printf(Perl_debug_log,"\n"); \
580});
581
acfe0abc 582static void clear_re(pTHX_ void *r);
4327152a 583
653099ff 584/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 585 Update the longest found anchored substring and the longest found
653099ff
GS
586 floating substrings if needed. */
587
4327152a 588STATIC void
1de06328 589S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 590{
e1ec3a88
AL
591 const STRLEN l = CHR_SVLEN(data->last_found);
592 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 593 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 594
c277df42 595 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 596 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
597 if (*data->longest == data->longest_fixed) {
598 data->offset_fixed = l ? data->last_start_min : data->pos_min;
599 if (data->flags & SF_BEFORE_EOL)
b81d288d 600 data->flags
c277df42
IZ
601 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
602 else
603 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
604 data->minlen_fixed=minlenp;
605 data->lookbehind_fixed=0;
a0ed51b3
LW
606 }
607 else {
c277df42 608 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
609 data->offset_float_max = (l
610 ? data->last_start_max
c277df42 611 : data->pos_min + data->pos_delta);
9051bda5
HS
612 if ((U32)data->offset_float_max > (U32)I32_MAX)
613 data->offset_float_max = I32_MAX;
c277df42 614 if (data->flags & SF_BEFORE_EOL)
b81d288d 615 data->flags
c277df42
IZ
616 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
617 else
618 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
619 data->minlen_float=minlenp;
620 data->lookbehind_float=0;
c277df42
IZ
621 }
622 }
623 SvCUR_set(data->last_found, 0);
0eda9292 624 {
a28509cc 625 SV * const sv = data->last_found;
097eb12c
AL
626 if (SvUTF8(sv) && SvMAGICAL(sv)) {
627 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
628 if (mg)
629 mg->mg_len = 0;
630 }
0eda9292 631 }
c277df42
IZ
632 data->last_end = -1;
633 data->flags &= ~SF_BEFORE_EOL;
1de06328 634 DEBUG_STUDYDATA(data,0);
c277df42
IZ
635}
636
653099ff
GS
637/* Can match anything (initialization) */
638STATIC void
097eb12c 639S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 640{
653099ff 641 ANYOF_CLASS_ZERO(cl);
f8bef550 642 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 643 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
644 if (LOC)
645 cl->flags |= ANYOF_LOCALE;
646}
647
648/* Can match anything (initialization) */
649STATIC int
5f66b61c 650S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
651{
652 int value;
653
aaa51d5e 654 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
655 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
656 return 1;
1aa99e6b
IH
657 if (!(cl->flags & ANYOF_UNICODE_ALL))
658 return 0;
10edeb5d 659 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 660 return 0;
653099ff
GS
661 return 1;
662}
663
664/* Can match anything (initialization) */
665STATIC void
097eb12c 666S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 667{
8ecf7187 668 Zero(cl, 1, struct regnode_charclass_class);
653099ff 669 cl->type = ANYOF;
830247a4 670 cl_anything(pRExC_state, cl);
653099ff
GS
671}
672
673STATIC void
097eb12c 674S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 675{
8ecf7187 676 Zero(cl, 1, struct regnode_charclass_class);
653099ff 677 cl->type = ANYOF;
830247a4 678 cl_anything(pRExC_state, cl);
653099ff
GS
679 if (LOC)
680 cl->flags |= ANYOF_LOCALE;
681}
682
683/* 'And' a given class with another one. Can create false positives */
684/* We assume that cl is not inverted */
685STATIC void
5f66b61c 686S_cl_and(struct regnode_charclass_class *cl,
a28509cc 687 const struct regnode_charclass_class *and_with)
653099ff 688{
40d049e4
YO
689
690 assert(and_with->type == ANYOF);
653099ff
GS
691 if (!(and_with->flags & ANYOF_CLASS)
692 && !(cl->flags & ANYOF_CLASS)
693 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
694 && !(and_with->flags & ANYOF_FOLD)
695 && !(cl->flags & ANYOF_FOLD)) {
696 int i;
697
698 if (and_with->flags & ANYOF_INVERT)
699 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
700 cl->bitmap[i] &= ~and_with->bitmap[i];
701 else
702 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
703 cl->bitmap[i] &= and_with->bitmap[i];
704 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
705 if (!(and_with->flags & ANYOF_EOS))
706 cl->flags &= ~ANYOF_EOS;
1aa99e6b 707
14ebb1a2
JH
708 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
709 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
710 cl->flags &= ~ANYOF_UNICODE_ALL;
711 cl->flags |= ANYOF_UNICODE;
712 ARG_SET(cl, ARG(and_with));
713 }
14ebb1a2
JH
714 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
715 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 716 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
717 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
718 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 719 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
720}
721
722/* 'OR' a given class with another one. Can create false positives */
723/* We assume that cl is not inverted */
724STATIC void
097eb12c 725S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 726{
653099ff
GS
727 if (or_with->flags & ANYOF_INVERT) {
728 /* We do not use
729 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
730 * <= (B1 | !B2) | (CL1 | !CL2)
731 * which is wasteful if CL2 is small, but we ignore CL2:
732 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
733 * XXXX Can we handle case-fold? Unclear:
734 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
735 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
736 */
737 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
738 && !(or_with->flags & ANYOF_FOLD)
739 && !(cl->flags & ANYOF_FOLD) ) {
740 int i;
741
742 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
743 cl->bitmap[i] |= ~or_with->bitmap[i];
744 } /* XXXX: logic is complicated otherwise */
745 else {
830247a4 746 cl_anything(pRExC_state, cl);
653099ff
GS
747 }
748 } else {
749 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
750 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 751 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
752 || (cl->flags & ANYOF_FOLD)) ) {
753 int i;
754
755 /* OR char bitmap and class bitmap separately */
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] |= or_with->bitmap[i];
758 if (or_with->flags & ANYOF_CLASS) {
759 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
760 cl->classflags[i] |= or_with->classflags[i];
761 cl->flags |= ANYOF_CLASS;
762 }
763 }
764 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 765 cl_anything(pRExC_state, cl);
653099ff
GS
766 }
767 }
768 if (or_with->flags & ANYOF_EOS)
769 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
770
771 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
772 ARG(cl) != ARG(or_with)) {
773 cl->flags |= ANYOF_UNICODE_ALL;
774 cl->flags &= ~ANYOF_UNICODE;
775 }
776 if (or_with->flags & ANYOF_UNICODE_ALL) {
777 cl->flags |= ANYOF_UNICODE_ALL;
778 cl->flags &= ~ANYOF_UNICODE;
779 }
653099ff
GS
780}
781
a3621e74
YO
782#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
783#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
784#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
785#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
786
3dab1dad
YO
787
788#ifdef DEBUGGING
07be1b83 789/*
3dab1dad
YO
790 dump_trie(trie)
791 dump_trie_interim_list(trie,next_alloc)
792 dump_trie_interim_table(trie,next_alloc)
793
794 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
795 The _interim_ variants are used for debugging the interim
796 tables that are used to generate the final compressed
797 representation which is what dump_trie expects.
798
3dab1dad
YO
799 Part of the reason for their existance is to provide a form
800 of documentation as to how the different representations function.
07be1b83
YO
801
802*/
3dab1dad
YO
803
804/*
805 dump_trie(trie)
806 Dumps the final compressed table form of the trie to Perl_debug_log.
807 Used for debugging make_trie().
808*/
809
810STATIC void
811S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
812{
813 U32 state;
ab3bbdeb
YO
814 SV *sv=sv_newmortal();
815 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
816 GET_RE_DEBUG_FLAGS_DECL;
817
ab3bbdeb 818
3dab1dad
YO
819 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
820 (int)depth * 2 + 2,"",
821 "Match","Base","Ofs" );
822
823 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 824 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 825 if ( tmp ) {
ab3bbdeb
YO
826 PerlIO_printf( Perl_debug_log, "%*s",
827 colwidth,
ddc5bc0f 828 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
829 PL_colors[0], PL_colors[1],
830 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
831 PERL_PV_ESCAPE_FIRSTCHAR
832 )
833 );
3dab1dad
YO
834 }
835 }
836 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
837 (int)depth * 2 + 2,"");
838
839 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 840 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
841 PerlIO_printf( Perl_debug_log, "\n");
842
1e2e3d02 843 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 844 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
845
846 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
847
848 if ( trie->states[ state ].wordnum ) {
849 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
850 } else {
851 PerlIO_printf( Perl_debug_log, "%6s", "" );
852 }
853
854 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
855
856 if ( base ) {
857 U32 ofs = 0;
858
859 while( ( base + ofs < trie->uniquecharcount ) ||
860 ( base + ofs - trie->uniquecharcount < trie->lasttrans
861 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
862 ofs++;
863
864 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
865
866 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
867 if ( ( base + ofs >= trie->uniquecharcount ) &&
868 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
869 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
870 {
ab3bbdeb
YO
871 PerlIO_printf( Perl_debug_log, "%*"UVXf,
872 colwidth,
3dab1dad
YO
873 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
874 } else {
ab3bbdeb 875 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
876 }
877 }
878
879 PerlIO_printf( Perl_debug_log, "]");
880
881 }
882 PerlIO_printf( Perl_debug_log, "\n" );
883 }
884}
885/*
886 dump_trie_interim_list(trie,next_alloc)
887 Dumps a fully constructed but uncompressed trie in list form.
888 List tries normally only are used for construction when the number of
889 possible chars (trie->uniquecharcount) is very high.
890 Used for debugging make_trie().
891*/
892STATIC void
893S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
894{
895 U32 state;
ab3bbdeb
YO
896 SV *sv=sv_newmortal();
897 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
898 GET_RE_DEBUG_FLAGS_DECL;
899 /* print out the table precompression. */
ab3bbdeb
YO
900 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
901 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
902 "------:-----+-----------------\n" );
3dab1dad
YO
903
904 for( state=1 ; state < next_alloc ; state ++ ) {
905 U16 charid;
906
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
908 (int)depth * 2 + 2,"", (UV)state );
909 if ( ! trie->states[ state ].wordnum ) {
910 PerlIO_printf( Perl_debug_log, "%5s| ","");
911 } else {
912 PerlIO_printf( Perl_debug_log, "W%4x| ",
913 trie->states[ state ].wordnum
914 );
915 }
916 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 917 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
918 if ( tmp ) {
919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
920 colwidth,
ddc5bc0f 921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
922 PL_colors[0], PL_colors[1],
923 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
924 PERL_PV_ESCAPE_FIRSTCHAR
925 ) ,
1e2e3d02
YO
926 TRIE_LIST_ITEM(state,charid).forid,
927 (UV)TRIE_LIST_ITEM(state,charid).newstate
928 );
929 if (!(charid % 10))
664e119d
RGS
930 PerlIO_printf(Perl_debug_log, "\n%*s| ",
931 (int)((depth * 2) + 14), "");
1e2e3d02 932 }
ab3bbdeb
YO
933 }
934 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
935 }
936}
937
938/*
939 dump_trie_interim_table(trie,next_alloc)
940 Dumps a fully constructed but uncompressed trie in table form.
941 This is the normal DFA style state transition table, with a few
942 twists to facilitate compression later.
943 Used for debugging make_trie().
944*/
945STATIC void
946S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
947{
948 U32 state;
949 U16 charid;
ab3bbdeb
YO
950 SV *sv=sv_newmortal();
951 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
952 GET_RE_DEBUG_FLAGS_DECL;
953
954 /*
955 print out the table precompression so that we can do a visual check
956 that they are identical.
957 */
958
959 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
960
961 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 962 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 963 if ( tmp ) {
ab3bbdeb
YO
964 PerlIO_printf( Perl_debug_log, "%*s",
965 colwidth,
ddc5bc0f 966 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
967 PL_colors[0], PL_colors[1],
968 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
969 PERL_PV_ESCAPE_FIRSTCHAR
970 )
971 );
3dab1dad
YO
972 }
973 }
974
975 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
976
977 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 978 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
979 }
980
981 PerlIO_printf( Perl_debug_log, "\n" );
982
983 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
984
985 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
986 (int)depth * 2 + 2,"",
987 (UV)TRIE_NODENUM( state ) );
988
989 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
990 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
991 if (v)
992 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
993 else
994 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
995 }
996 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
998 } else {
999 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1000 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1001 }
1002 }
07be1b83 1003}
3dab1dad
YO
1004
1005#endif
1006
786e8c11
YO
1007/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1008 startbranch: the first branch in the whole branch sequence
1009 first : start branch of sequence of branch-exact nodes.
1010 May be the same as startbranch
1011 last : Thing following the last branch.
1012 May be the same as tail.
1013 tail : item following the branch sequence
1014 count : words in the sequence
1015 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1016 depth : indent depth
3dab1dad 1017
786e8c11 1018Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1019
786e8c11
YO
1020A trie is an N'ary tree where the branches are determined by digital
1021decomposition of the key. IE, at the root node you look up the 1st character and
1022follow that branch repeat until you find the end of the branches. Nodes can be
1023marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1024
786e8c11 1025 /he|she|his|hers/
72f13be8 1026
786e8c11
YO
1027would convert into the following structure. Numbers represent states, letters
1028following numbers represent valid transitions on the letter from that state, if
1029the number is in square brackets it represents an accepting state, otherwise it
1030will be in parenthesis.
07be1b83 1031
786e8c11
YO
1032 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1033 | |
1034 | (2)
1035 | |
1036 (1) +-i->(6)-+-s->[7]
1037 |
1038 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1039
786e8c11
YO
1040 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1041
1042This shows that when matching against the string 'hers' we will begin at state 1
1043read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1044then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1045is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1046single traverse. We store a mapping from accepting to state to which word was
1047matched, and then when we have multiple possibilities we try to complete the
1048rest of the regex in the order in which they occured in the alternation.
1049
1050The only prior NFA like behaviour that would be changed by the TRIE support is
1051the silent ignoring of duplicate alternations which are of the form:
1052
1053 / (DUPE|DUPE) X? (?{ ... }) Y /x
1054
1055Thus EVAL blocks follwing a trie may be called a different number of times with
1056and without the optimisation. With the optimisations dupes will be silently
1057ignored. This inconsistant behaviour of EVAL type nodes is well established as
1058the following demonstrates:
1059
1060 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1061
1062which prints out 'word' three times, but
1063
1064 'words'=~/(word|word|word)(?{ print $1 })S/
1065
1066which doesnt print it out at all. This is due to other optimisations kicking in.
1067
1068Example of what happens on a structural level:
1069
1070The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1071
1072 1: CURLYM[1] {1,32767}(18)
1073 5: BRANCH(8)
1074 6: EXACT <ac>(16)
1075 8: BRANCH(11)
1076 9: EXACT <ad>(16)
1077 11: BRANCH(14)
1078 12: EXACT <ab>(16)
1079 16: SUCCEED(0)
1080 17: NOTHING(18)
1081 18: END(0)
1082
1083This would be optimizable with startbranch=5, first=5, last=16, tail=16
1084and should turn into:
1085
1086 1: CURLYM[1] {1,32767}(18)
1087 5: TRIE(16)
1088 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1089 <ac>
1090 <ad>
1091 <ab>
1092 16: SUCCEED(0)
1093 17: NOTHING(18)
1094 18: END(0)
1095
1096Cases where tail != last would be like /(?foo|bar)baz/:
1097
1098 1: BRANCH(4)
1099 2: EXACT <foo>(8)
1100 4: BRANCH(7)
1101 5: EXACT <bar>(8)
1102 7: TAIL(8)
1103 8: EXACT <baz>(10)
1104 10: END(0)
1105
1106which would be optimizable with startbranch=1, first=1, last=7, tail=8
1107and would end up looking like:
1108
1109 1: TRIE(8)
1110 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1111 <foo>
1112 <bar>
1113 7: TAIL(8)
1114 8: EXACT <baz>(10)
1115 10: END(0)
1116
1117 d = uvuni_to_utf8_flags(d, uv, 0);
1118
1119is the recommended Unicode-aware way of saying
1120
1121 *(d++) = uv;
1122*/
1123
1e2e3d02 1124#define TRIE_STORE_REVCHAR \
786e8c11 1125 STMT_START { \
1e2e3d02 1126 SV *tmp = newSVpvs(""); \
786e8c11 1127 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1128 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1129 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1130 } STMT_END
1131
1132#define TRIE_READ_CHAR STMT_START { \
1133 wordlen++; \
1134 if ( UTF ) { \
1135 if ( folder ) { \
1136 if ( foldlen > 0 ) { \
1137 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1138 foldlen -= len; \
1139 scan += len; \
1140 len = 0; \
1141 } else { \
1142 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1143 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1144 foldlen -= UNISKIP( uvc ); \
1145 scan = foldbuf + UNISKIP( uvc ); \
1146 } \
1147 } else { \
1148 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1149 } \
1150 } else { \
1151 uvc = (U32)*uc; \
1152 len = 1; \
1153 } \
1154} STMT_END
1155
1156
1157
1158#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1159 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1160 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1161 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1162 } \
1163 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1164 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1165 TRIE_LIST_CUR( state )++; \
1166} STMT_END
07be1b83 1167
786e8c11
YO
1168#define TRIE_LIST_NEW(state) STMT_START { \
1169 Newxz( trie->states[ state ].trans.list, \
1170 4, reg_trie_trans_le ); \
1171 TRIE_LIST_CUR( state ) = 1; \
1172 TRIE_LIST_LEN( state ) = 4; \
1173} STMT_END
07be1b83 1174
786e8c11
YO
1175#define TRIE_HANDLE_WORD(state) STMT_START { \
1176 U16 dupe= trie->states[ state ].wordnum; \
1177 regnode * const noper_next = regnext( noper ); \
1178 \
1179 if (trie->wordlen) \
1180 trie->wordlen[ curword ] = wordlen; \
1181 DEBUG_r({ \
1182 /* store the word for dumping */ \
1183 SV* tmp; \
1184 if (OP(noper) != NOTHING) \
1185 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1186 else \
1187 tmp = newSVpvn( "", 0 ); \
1188 if ( UTF ) SvUTF8_on( tmp ); \
1189 av_push( trie->words, tmp ); \
1190 }); \
1191 \
1192 curword++; \
1193 \
1194 if ( noper_next < tail ) { \
1195 if (!trie->jump) \
1196 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1197 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1198 if (!jumper) \
1199 jumper = noper_next; \
1200 if (!nextbranch) \
1201 nextbranch= regnext(cur); \
1202 } \
1203 \
1204 if ( dupe ) { \
1205 /* So it's a dupe. This means we need to maintain a */\
1206 /* linked-list from the first to the next. */\
1207 /* we only allocate the nextword buffer when there */\
1208 /* a dupe, so first time we have to do the allocation */\
1209 if (!trie->nextword) \
1210 Newxz( trie->nextword, word_count + 1, U16); \
1211 while ( trie->nextword[dupe] ) \
1212 dupe= trie->nextword[dupe]; \
1213 trie->nextword[dupe]= curword; \
1214 } else { \
1215 /* we haven't inserted this word yet. */ \
1216 trie->states[ state ].wordnum = curword; \
1217 } \
1218} STMT_END
07be1b83 1219
3dab1dad 1220
786e8c11
YO
1221#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1222 ( ( base + charid >= ucharcount \
1223 && base + charid < ubound \
1224 && state == trie->trans[ base - ucharcount + charid ].check \
1225 && trie->trans[ base - ucharcount + charid ].next ) \
1226 ? trie->trans[ base - ucharcount + charid ].next \
1227 : ( state==1 ? special : 0 ) \
1228 )
3dab1dad 1229
786e8c11
YO
1230#define MADE_TRIE 1
1231#define MADE_JUMP_TRIE 2
1232#define MADE_EXACT_TRIE 4
3dab1dad 1233
a3621e74 1234STATIC I32
786e8c11 1235S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1236{
27da23d5 1237 dVAR;
a3621e74
YO
1238 /* first pass, loop through and scan words */
1239 reg_trie_data *trie;
1240 regnode *cur;
9f7f3913 1241 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1242 STRLEN len = 0;
1243 UV uvc = 0;
1244 U16 curword = 0;
1245 U32 next_alloc = 0;
786e8c11
YO
1246 regnode *jumper = NULL;
1247 regnode *nextbranch = NULL;
7f69552c 1248 regnode *convert = NULL;
a3621e74 1249 /* we just use folder as a flag in utf8 */
e1ec3a88 1250 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1251 ? PL_fold
1252 : ( flags == EXACTFL
1253 ? PL_fold_locale
1254 : NULL
1255 )
1256 );
1257
e1ec3a88 1258 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1259 SV *re_trie_maxbuff;
3dab1dad
YO
1260#ifndef DEBUGGING
1261 /* these are only used during construction but are useful during
8e11feef 1262 * debugging so we store them in the struct when debugging.
8e11feef 1263 */
3dab1dad 1264 STRLEN trie_charcount=0;
3dab1dad
YO
1265 AV *trie_revcharmap;
1266#endif
a3621e74 1267 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1268#ifndef DEBUGGING
1269 PERL_UNUSED_ARG(depth);
1270#endif
a3621e74 1271
a02a5408 1272 Newxz( trie, 1, reg_trie_data );
a3621e74 1273 trie->refcount = 1;
3dab1dad 1274 trie->startstate = 1;
786e8c11 1275 trie->wordcount = word_count;
a3621e74 1276 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1277 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1278 if (!(UTF && folder))
1279 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1280 DEBUG_r({
1281 trie->words = newAV();
a3621e74 1282 });
3dab1dad 1283 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1284
0111c4fd 1285 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1286 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1287 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1288 }
3dab1dad
YO
1289 DEBUG_OPTIMISE_r({
1290 PerlIO_printf( Perl_debug_log,
786e8c11 1291 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1292 (int)depth * 2 + 2, "",
1293 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1294 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1295 (int)depth);
3dab1dad 1296 });
7f69552c
YO
1297
1298 /* Find the node we are going to overwrite */
1299 if ( first == startbranch && OP( last ) != BRANCH ) {
1300 /* whole branch chain */
1301 convert = first;
1302 } else {
1303 /* branch sub-chain */
1304 convert = NEXTOPER( first );
1305 }
1306
a3621e74
YO
1307 /* -- First loop and Setup --
1308
1309 We first traverse the branches and scan each word to determine if it
1310 contains widechars, and how many unique chars there are, this is
1311 important as we have to build a table with at least as many columns as we
1312 have unique chars.
1313
1314 We use an array of integers to represent the character codes 0..255
1315 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1316 native representation of the character value as the key and IV's for the
1317 coded index.
1318
1319 *TODO* If we keep track of how many times each character is used we can
1320 remap the columns so that the table compression later on is more
1321 efficient in terms of memory by ensuring most common value is in the
1322 middle and the least common are on the outside. IMO this would be better
1323 than a most to least common mapping as theres a decent chance the most
1324 common letter will share a node with the least common, meaning the node
1325 will not be compressable. With a middle is most common approach the worst
1326 case is when we have the least common nodes twice.
1327
1328 */
1329
a3621e74 1330 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1331 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1332 const U8 *uc = (U8*)STRING( noper );
a28509cc 1333 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1334 STRLEN foldlen = 0;
1335 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1336 const U8 *scan = (U8*)NULL;
07be1b83 1337 U32 wordlen = 0; /* required init */
3dab1dad 1338 STRLEN chars=0;
a3621e74 1339
3dab1dad
YO
1340 if (OP(noper) == NOTHING) {
1341 trie->minlen= 0;
1342 continue;
1343 }
1344 if (trie->bitmap) {
1345 TRIE_BITMAP_SET(trie,*uc);
1346 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1347 }
a3621e74 1348 for ( ; uc < e ; uc += len ) {
3dab1dad 1349 TRIE_CHARCOUNT(trie)++;
a3621e74 1350 TRIE_READ_CHAR;
3dab1dad 1351 chars++;
a3621e74
YO
1352 if ( uvc < 256 ) {
1353 if ( !trie->charmap[ uvc ] ) {
1354 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1355 if ( folder )
1356 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1357 TRIE_STORE_REVCHAR;
a3621e74
YO
1358 }
1359 } else {
1360 SV** svpp;
1361 if ( !trie->widecharmap )
1362 trie->widecharmap = newHV();
1363
1364 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1365
1366 if ( !svpp )
e4584336 1367 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1368
1369 if ( !SvTRUE( *svpp ) ) {
1370 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1371 TRIE_STORE_REVCHAR;
a3621e74
YO
1372 }
1373 }
1374 }
3dab1dad
YO
1375 if( cur == first ) {
1376 trie->minlen=chars;
1377 trie->maxlen=chars;
1378 } else if (chars < trie->minlen) {
1379 trie->minlen=chars;
1380 } else if (chars > trie->maxlen) {
1381 trie->maxlen=chars;
1382 }
1383
a3621e74
YO
1384 } /* end first pass */
1385 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1386 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1387 (int)depth * 2 + 2,"",
85c3142d 1388 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1389 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1390 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1391 );
786e8c11 1392 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1393
1394 /*
1395 We now know what we are dealing with in terms of unique chars and
1396 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1397 representation using a flat table will take. If it's over a reasonable
1398 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1399 conservative but potentially much slower representation using an array
1400 of lists.
1401
1402 At the end we convert both representations into the same compressed
1403 form that will be used in regexec.c for matching with. The latter
1404 is a form that cannot be used to construct with but has memory
1405 properties similar to the list form and access properties similar
1406 to the table form making it both suitable for fast searches and
1407 small enough that its feasable to store for the duration of a program.
1408
1409 See the comment in the code where the compressed table is produced
1410 inplace from the flat tabe representation for an explanation of how
1411 the compression works.
1412
1413 */
1414
1415
3dab1dad 1416 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1417 /*
1418 Second Pass -- Array Of Lists Representation
1419
1420 Each state will be represented by a list of charid:state records
1421 (reg_trie_trans_le) the first such element holds the CUR and LEN
1422 points of the allocated array. (See defines above).
1423
1424 We build the initial structure using the lists, and then convert
1425 it into the compressed table form which allows faster lookups
1426 (but cant be modified once converted).
a3621e74
YO
1427 */
1428
a3621e74
YO
1429 STRLEN transcount = 1;
1430
1e2e3d02
YO
1431 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1432 "%*sCompiling trie using list compiler\n",
1433 (int)depth * 2 + 2, ""));
1434
3dab1dad 1435 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1436 TRIE_LIST_NEW(1);
1437 next_alloc = 2;
1438
1439 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1440
c445ea15
AL
1441 regnode * const noper = NEXTOPER( cur );
1442 U8 *uc = (U8*)STRING( noper );
1443 const U8 * const e = uc + STR_LEN( noper );
1444 U32 state = 1; /* required init */
1445 U16 charid = 0; /* sanity init */
1446 U8 *scan = (U8*)NULL; /* sanity init */
1447 STRLEN foldlen = 0; /* required init */
07be1b83 1448 U32 wordlen = 0; /* required init */
c445ea15
AL
1449 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1450
3dab1dad 1451 if (OP(noper) != NOTHING) {
786e8c11 1452 for ( ; uc < e ; uc += len ) {
c445ea15 1453
786e8c11 1454 TRIE_READ_CHAR;
c445ea15 1455
786e8c11
YO
1456 if ( uvc < 256 ) {
1457 charid = trie->charmap[ uvc ];
c445ea15 1458 } else {
786e8c11
YO
1459 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1460 if ( !svpp ) {
1461 charid = 0;
1462 } else {
1463 charid=(U16)SvIV( *svpp );
1464 }
c445ea15 1465 }
786e8c11
YO
1466 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1467 if ( charid ) {
a3621e74 1468
786e8c11
YO
1469 U16 check;
1470 U32 newstate = 0;
a3621e74 1471
786e8c11
YO
1472 charid--;
1473 if ( !trie->states[ state ].trans.list ) {
1474 TRIE_LIST_NEW( state );
c445ea15 1475 }
786e8c11
YO
1476 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1477 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1478 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1479 break;
1480 }
1481 }
1482 if ( ! newstate ) {
1483 newstate = next_alloc++;
1484 TRIE_LIST_PUSH( state, charid, newstate );
1485 transcount++;
1486 }
1487 state = newstate;
1488 } else {
1489 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1490 }
a28509cc 1491 }
c445ea15 1492 }
3dab1dad 1493 TRIE_HANDLE_WORD(state);
a3621e74
YO
1494
1495 } /* end second pass */
1496
1e2e3d02
YO
1497 /* next alloc is the NEXT state to be allocated */
1498 trie->statecount = next_alloc;
a3621e74
YO
1499 Renew( trie->states, next_alloc, reg_trie_state );
1500
3dab1dad
YO
1501 /* and now dump it out before we compress it */
1502 DEBUG_TRIE_COMPILE_MORE_r(
1503 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1504 );
a3621e74 1505
a02a5408 1506 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1507 {
1508 U32 state;
a3621e74
YO
1509 U32 tp = 0;
1510 U32 zp = 0;
1511
1512
1513 for( state=1 ; state < next_alloc ; state ++ ) {
1514 U32 base=0;
1515
1516 /*
1517 DEBUG_TRIE_COMPILE_MORE_r(
1518 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1519 );
1520 */
1521
1522 if (trie->states[state].trans.list) {
1523 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1524 U16 maxid=minid;
a28509cc 1525 U16 idx;
a3621e74
YO
1526
1527 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1528 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1529 if ( forid < minid ) {
1530 minid=forid;
1531 } else if ( forid > maxid ) {
1532 maxid=forid;
1533 }
a3621e74
YO
1534 }
1535 if ( transcount < tp + maxid - minid + 1) {
1536 transcount *= 2;
1537 Renew( trie->trans, transcount, reg_trie_trans );
1538 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1539 }
1540 base = trie->uniquecharcount + tp - minid;
1541 if ( maxid == minid ) {
1542 U32 set = 0;
1543 for ( ; zp < tp ; zp++ ) {
1544 if ( ! trie->trans[ zp ].next ) {
1545 base = trie->uniquecharcount + zp - minid;
1546 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1547 trie->trans[ zp ].check = state;
1548 set = 1;
1549 break;
1550 }
1551 }
1552 if ( !set ) {
1553 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1554 trie->trans[ tp ].check = state;
1555 tp++;
1556 zp = tp;
1557 }
1558 } else {
1559 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1560 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1561 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1562 trie->trans[ tid ].check = state;
1563 }
1564 tp += ( maxid - minid + 1 );
1565 }
1566 Safefree(trie->states[ state ].trans.list);
1567 }
1568 /*
1569 DEBUG_TRIE_COMPILE_MORE_r(
1570 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1571 );
1572 */
1573 trie->states[ state ].trans.base=base;
1574 }
cc601c31 1575 trie->lasttrans = tp + 1;
a3621e74
YO
1576 }
1577 } else {
1578 /*
1579 Second Pass -- Flat Table Representation.
1580
1581 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1582 We know that we will need Charcount+1 trans at most to store the data
1583 (one row per char at worst case) So we preallocate both structures
1584 assuming worst case.
1585
1586 We then construct the trie using only the .next slots of the entry
1587 structs.
1588
1589 We use the .check field of the first entry of the node temporarily to
1590 make compression both faster and easier by keeping track of how many non
1591 zero fields are in the node.
1592
1593 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1594 transition.
1595
1596 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1597 number representing the first entry of the node, and state as a
1598 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1599 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1600 are 2 entrys per node. eg:
1601
1602 A B A B
1603 1. 2 4 1. 3 7
1604 2. 0 3 3. 0 5
1605 3. 0 0 5. 0 0
1606 4. 0 0 7. 0 0
1607
1608 The table is internally in the right hand, idx form. However as we also
1609 have to deal with the states array which is indexed by nodenum we have to
1610 use TRIE_NODENUM() to convert.
1611
1612 */
1e2e3d02
YO
1613 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1614 "%*sCompiling trie using table compiler\n",
1615 (int)depth * 2 + 2, ""));
3dab1dad
YO
1616
1617 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1618 reg_trie_trans );
3dab1dad 1619 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1620 next_alloc = trie->uniquecharcount + 1;
1621
3dab1dad 1622
a3621e74
YO
1623 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1624
c445ea15 1625 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1626 const U8 *uc = (U8*)STRING( noper );
1627 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1628
1629 U32 state = 1; /* required init */
1630
1631 U16 charid = 0; /* sanity init */
1632 U32 accept_state = 0; /* sanity init */
1633 U8 *scan = (U8*)NULL; /* sanity init */
1634
1635 STRLEN foldlen = 0; /* required init */
07be1b83 1636 U32 wordlen = 0; /* required init */
a3621e74
YO
1637 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1638
3dab1dad 1639 if ( OP(noper) != NOTHING ) {
786e8c11 1640 for ( ; uc < e ; uc += len ) {
a3621e74 1641
786e8c11 1642 TRIE_READ_CHAR;
a3621e74 1643
786e8c11
YO
1644 if ( uvc < 256 ) {
1645 charid = trie->charmap[ uvc ];
1646 } else {
1647 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1648 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1649 }
786e8c11
YO
1650 if ( charid ) {
1651 charid--;
1652 if ( !trie->trans[ state + charid ].next ) {
1653 trie->trans[ state + charid ].next = next_alloc;
1654 trie->trans[ state ].check++;
1655 next_alloc += trie->uniquecharcount;
1656 }
1657 state = trie->trans[ state + charid ].next;
1658 } else {
1659 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1660 }
1661 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1662 }
a3621e74 1663 }
3dab1dad
YO
1664 accept_state = TRIE_NODENUM( state );
1665 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1666
1667 } /* end second pass */
1668
3dab1dad
YO
1669 /* and now dump it out before we compress it */
1670 DEBUG_TRIE_COMPILE_MORE_r(
1671 dump_trie_interim_table(trie,next_alloc,depth+1)
1672 );
a3621e74 1673
a3621e74
YO
1674 {
1675 /*
1676 * Inplace compress the table.*
1677
1678 For sparse data sets the table constructed by the trie algorithm will
1679 be mostly 0/FAIL transitions or to put it another way mostly empty.
1680 (Note that leaf nodes will not contain any transitions.)
1681
1682 This algorithm compresses the tables by eliminating most such
1683 transitions, at the cost of a modest bit of extra work during lookup:
1684
1685 - Each states[] entry contains a .base field which indicates the
1686 index in the state[] array wheres its transition data is stored.
1687
1688 - If .base is 0 there are no valid transitions from that node.
1689
1690 - If .base is nonzero then charid is added to it to find an entry in
1691 the trans array.
1692
1693 -If trans[states[state].base+charid].check!=state then the
1694 transition is taken to be a 0/Fail transition. Thus if there are fail
1695 transitions at the front of the node then the .base offset will point
1696 somewhere inside the previous nodes data (or maybe even into a node
1697 even earlier), but the .check field determines if the transition is
1698 valid.
1699
786e8c11 1700 XXX - wrong maybe?
a3621e74
YO
1701 The following process inplace converts the table to the compressed
1702 table: We first do not compress the root node 1,and mark its all its
1703 .check pointers as 1 and set its .base pointer as 1 as well. This
1704 allows to do a DFA construction from the compressed table later, and
1705 ensures that any .base pointers we calculate later are greater than
1706 0.
1707
1708 - We set 'pos' to indicate the first entry of the second node.
1709
1710 - We then iterate over the columns of the node, finding the first and
1711 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1712 and set the .check pointers accordingly, and advance pos
1713 appropriately and repreat for the next node. Note that when we copy
1714 the next pointers we have to convert them from the original
1715 NODEIDX form to NODENUM form as the former is not valid post
1716 compression.
1717
1718 - If a node has no transitions used we mark its base as 0 and do not
1719 advance the pos pointer.
1720
1721 - If a node only has one transition we use a second pointer into the
1722 structure to fill in allocated fail transitions from other states.
1723 This pointer is independent of the main pointer and scans forward
1724 looking for null transitions that are allocated to a state. When it
1725 finds one it writes the single transition into the "hole". If the
786e8c11 1726 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1727
1728 - Once compressed we can Renew/realloc the structures to release the
1729 excess space.
1730
1731 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1732 specifically Fig 3.47 and the associated pseudocode.
1733
1734 demq
1735 */
a3b680e6 1736 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1737 U32 state, charid;
a3621e74 1738 U32 pos = 0, zp=0;
1e2e3d02 1739 trie->statecount = laststate;
a3621e74
YO
1740
1741 for ( state = 1 ; state < laststate ; state++ ) {
1742 U8 flag = 0;
a28509cc
AL
1743 const U32 stateidx = TRIE_NODEIDX( state );
1744 const U32 o_used = trie->trans[ stateidx ].check;
1745 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1746 trie->trans[ stateidx ].check = 0;
1747
1748 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1749 if ( flag || trie->trans[ stateidx + charid ].next ) {
1750 if ( trie->trans[ stateidx + charid ].next ) {
1751 if (o_used == 1) {
1752 for ( ; zp < pos ; zp++ ) {
1753 if ( ! trie->trans[ zp ].next ) {
1754 break;
1755 }
1756 }
1757 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1758 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1759 trie->trans[ zp ].check = state;
1760 if ( ++zp > pos ) pos = zp;
1761 break;
1762 }
1763 used--;
1764 }
1765 if ( !flag ) {
1766 flag = 1;
1767 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1768 }
1769 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1770 trie->trans[ pos ].check = state;
1771 pos++;
1772 }
1773 }
1774 }
cc601c31 1775 trie->lasttrans = pos + 1;
1e2e3d02 1776 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1777 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1778 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1779 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1780 (int)depth * 2 + 2,"",
1781 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1782 (IV)next_alloc,
1783 (IV)pos,
a3621e74
YO
1784 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1785 );
1786
1787 } /* end table compress */
1788 }
1e2e3d02
YO
1789 DEBUG_TRIE_COMPILE_MORE_r(
1790 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1791 (int)depth * 2 + 2, "",
1792 (UV)trie->statecount,
1793 (UV)trie->lasttrans)
1794 );
cc601c31
YO
1795 /* resize the trans array to remove unused space */
1796 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1797
3dab1dad
YO
1798 /* and now dump out the compressed format */
1799 DEBUG_TRIE_COMPILE_r(
1800 dump_trie(trie,depth+1)
1801 );
07be1b83 1802
3dab1dad 1803 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1804 U8 nodetype =(U8)(flags & 0xFF);
1805 char *str=NULL;
786e8c11 1806
07be1b83 1807#ifdef DEBUGGING
e62cc96a 1808 regnode *optimize = NULL;
b57a0404
JH
1809 U32 mjd_offset = 0;
1810 U32 mjd_nodelen = 0;
07be1b83 1811#endif
a3621e74 1812 /*
3dab1dad
YO
1813 This means we convert either the first branch or the first Exact,
1814 depending on whether the thing following (in 'last') is a branch
1815 or not and whther first is the startbranch (ie is it a sub part of
1816 the alternation or is it the whole thing.)
1817 Assuming its a sub part we conver the EXACT otherwise we convert
1818 the whole branch sequence, including the first.
a3621e74 1819 */
3dab1dad 1820 /* Find the node we are going to overwrite */
7f69552c 1821 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1822 /* branch sub-chain */
3dab1dad 1823 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1824 DEBUG_r({
1825 mjd_offset= Node_Offset((convert));
1826 mjd_nodelen= Node_Length((convert));
1827 });
7f69552c
YO
1828 /* whole branch chain */
1829 } else {
1830 DEBUG_r({
1831 const regnode *nop = NEXTOPER( convert );
1832 mjd_offset= Node_Offset((nop));
1833 mjd_nodelen= Node_Length((nop));
1834 });
07be1b83 1835 }
7f69552c 1836
07be1b83
YO
1837 DEBUG_OPTIMISE_r(
1838 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1839 (int)depth * 2 + 2, "",
786e8c11 1840 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1841 );
a3621e74 1842
3dab1dad
YO
1843 /* But first we check to see if there is a common prefix we can
1844 split out as an EXACT and put in front of the TRIE node. */
1845 trie->startstate= 1;
786e8c11 1846 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1847 U32 state;
1e2e3d02 1848 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1849 U32 ofs = 0;
8e11feef
RGS
1850 I32 idx = -1;
1851 U32 count = 0;
1852 const U32 base = trie->states[ state ].trans.base;
a3621e74 1853
3dab1dad 1854 if ( trie->states[state].wordnum )
8e11feef 1855 count = 1;
a3621e74 1856
8e11feef 1857 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1858 if ( ( base + ofs >= trie->uniquecharcount ) &&
1859 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1860 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1861 {
3dab1dad 1862 if ( ++count > 1 ) {
8e11feef 1863 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1864 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1865 if ( state == 1 ) break;
3dab1dad
YO
1866 if ( count == 2 ) {
1867 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1868 DEBUG_OPTIMISE_r(
8e11feef
RGS
1869 PerlIO_printf(Perl_debug_log,
1870 "%*sNew Start State=%"UVuf" Class: [",
1871 (int)depth * 2 + 2, "",
786e8c11 1872 (UV)state));
be8e71aa
YO
1873 if (idx >= 0) {
1874 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1875 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1876
3dab1dad 1877 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1878 if ( folder )
1879 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1880 DEBUG_OPTIMISE_r(
07be1b83 1881 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1882 );
8e11feef
RGS
1883 }
1884 }
1885 TRIE_BITMAP_SET(trie,*ch);
1886 if ( folder )
1887 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1888 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1889 }
1890 idx = ofs;
1891 }
3dab1dad
YO
1892 }
1893 if ( count == 1 ) {
1894 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1895 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1896 DEBUG_OPTIMISE_r(
8e11feef
RGS
1897 PerlIO_printf( Perl_debug_log,
1898 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)state, (UV)idx, ch)
3dab1dad
YO
1901 );
1902 if ( state==1 ) {
1903 OP( convert ) = nodetype;
1904 str=STRING(convert);
1905 STR_LEN(convert)=0;
1906 }
1907 *str++=*ch;
1908 STR_LEN(convert)++;
a3621e74 1909
8e11feef 1910 } else {
f9049ba1 1911#ifdef DEBUGGING
8e11feef
RGS
1912 if (state>1)
1913 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1914#endif
8e11feef
RGS
1915 break;
1916 }
1917 }
3dab1dad 1918 if (str) {
8e11feef 1919 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1920 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1921 trie->startstate = state;
07be1b83
YO
1922 trie->minlen -= (state - 1);
1923 trie->maxlen -= (state - 1);
1924 DEBUG_r({
1925 regnode *fix = convert;
1926 mjd_nodelen++;
1927 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1928 while( ++fix < n ) {
1929 Set_Node_Offset_Length(fix, 0, 0);
1930 }
1931 });
8e11feef
RGS
1932 if (trie->maxlen) {
1933 convert = n;
1934 } else {
3dab1dad 1935 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1936 DEBUG_r(optimize= n);
3dab1dad
YO
1937 }
1938 }
1939 }
a5ca303d
YO
1940 if (!jumper)
1941 jumper = last;
3dab1dad 1942 if ( trie->maxlen ) {
8e11feef
RGS
1943 NEXT_OFF( convert ) = (U16)(tail - convert);
1944 ARG_SET( convert, data_slot );
786e8c11
YO
1945 /* Store the offset to the first unabsorbed branch in
1946 jump[0], which is otherwise unused by the jump logic.
1947 We use this when dumping a trie and during optimisation. */
1948 if (trie->jump)
7f69552c 1949 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1950
786e8c11
YO
1951 /* XXXX */
1952 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1953 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1954 {
1955 OP( convert ) = TRIEC;
1956 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1957 Safefree(trie->bitmap);
1958 trie->bitmap= NULL;
1959 } else
1960 OP( convert ) = TRIE;
a3621e74 1961
3dab1dad
YO
1962 /* store the type in the flags */
1963 convert->flags = nodetype;
a5ca303d
YO
1964 DEBUG_r({
1965 optimize = convert
1966 + NODE_STEP_REGNODE
1967 + regarglen[ OP( convert ) ];
1968 });
1969 /* XXX We really should free up the resource in trie now,
1970 as we won't use them - (which resources?) dmq */
3dab1dad 1971 }
a3621e74 1972 /* needed for dumping*/
e62cc96a 1973 DEBUG_r(if (optimize) {
07be1b83 1974 regnode *opt = convert;
e62cc96a 1975 while ( ++opt < optimize) {
07be1b83
YO
1976 Set_Node_Offset_Length(opt,0,0);
1977 }
786e8c11
YO
1978 /*
1979 Try to clean up some of the debris left after the
1980 optimisation.
a3621e74 1981 */
786e8c11 1982 while( optimize < jumper ) {
07be1b83 1983 mjd_nodelen += Node_Length((optimize));
a3621e74 1984 OP( optimize ) = OPTIMIZED;
07be1b83 1985 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1986 optimize++;
1987 }
07be1b83 1988 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1989 });
1990 } /* end node insert */
07be1b83 1991#ifndef DEBUGGING
6e8b4190 1992 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1993#endif
786e8c11
YO
1994 return trie->jump
1995 ? MADE_JUMP_TRIE
1996 : trie->startstate>1
1997 ? MADE_EXACT_TRIE
1998 : MADE_TRIE;
1999}
2000
2001STATIC void
2002S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2003{
2004/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2005
2006 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2007 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2008 ISBN 0-201-10088-6
2009
2010 We find the fail state for each state in the trie, this state is the longest proper
2011 suffix of the current states 'word' that is also a proper prefix of another word in our
2012 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2013 the DFA not to have to restart after its tried and failed a word at a given point, it
2014 simply continues as though it had been matching the other word in the first place.
2015 Consider
2016 'abcdgu'=~/abcdefg|cdgu/
2017 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2018 fail, which would bring use to the state representing 'd' in the second word where we would
2019 try 'g' and succeed, prodceding to match 'cdgu'.
2020 */
2021 /* add a fail transition */
2022 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2023 U32 *q;
2024 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2025 const U32 numstates = trie->statecount;
786e8c11
YO
2026 const U32 ubound = trie->lasttrans + ucharcount;
2027 U32 q_read = 0;
2028 U32 q_write = 0;
2029 U32 charid;
2030 U32 base = trie->states[ 1 ].trans.base;
2031 U32 *fail;
2032 reg_ac_data *aho;
2033 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2034 GET_RE_DEBUG_FLAGS_DECL;
2035#ifndef DEBUGGING
2036 PERL_UNUSED_ARG(depth);
2037#endif
2038
2039
2040 ARG_SET( stclass, data_slot );
2041 Newxz( aho, 1, reg_ac_data );
2042 RExC_rx->data->data[ data_slot ] = (void*)aho;
2043 aho->trie=trie;
2044 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2045 numstates * sizeof(reg_trie_state));
786e8c11
YO
2046 Newxz( q, numstates, U32);
2047 Newxz( aho->fail, numstates, U32 );
2048 aho->refcount = 1;
2049 fail = aho->fail;
2050 /* initialize fail[0..1] to be 1 so that we always have
2051 a valid final fail state */
2052 fail[ 0 ] = fail[ 1 ] = 1;
2053
2054 for ( charid = 0; charid < ucharcount ; charid++ ) {
2055 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2056 if ( newstate ) {
2057 q[ q_write ] = newstate;
2058 /* set to point at the root */
2059 fail[ q[ q_write++ ] ]=1;
2060 }
2061 }
2062 while ( q_read < q_write) {
2063 const U32 cur = q[ q_read++ % numstates ];
2064 base = trie->states[ cur ].trans.base;
2065
2066 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2067 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2068 if (ch_state) {
2069 U32 fail_state = cur;
2070 U32 fail_base;
2071 do {
2072 fail_state = fail[ fail_state ];
2073 fail_base = aho->states[ fail_state ].trans.base;
2074 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2075
2076 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2077 fail[ ch_state ] = fail_state;
2078 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2079 {
2080 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2081 }
2082 q[ q_write++ % numstates] = ch_state;
2083 }
2084 }
2085 }
2086 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2087 when we fail in state 1, this allows us to use the
2088 charclass scan to find a valid start char. This is based on the principle
2089 that theres a good chance the string being searched contains lots of stuff
2090 that cant be a start char.
2091 */
2092 fail[ 0 ] = fail[ 1 ] = 0;
2093 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2094 PerlIO_printf(Perl_debug_log,
2095 "%*sStclass Failtable (%"UVuf" states): 0",
2096 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2097 );
786e8c11
YO
2098 for( q_read=1; q_read<numstates; q_read++ ) {
2099 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2100 }
2101 PerlIO_printf(Perl_debug_log, "\n");
2102 });
2103 Safefree(q);
2104 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2105}
2106
786e8c11 2107
a3621e74 2108/*
5d1c421c
JH
2109 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2110 * These need to be revisited when a newer toolchain becomes available.
2111 */
2112#if defined(__sparc64__) && defined(__GNUC__)
2113# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2114# undef SPARC64_GCC_WORKAROUND
2115# define SPARC64_GCC_WORKAROUND 1
2116# endif
2117#endif
2118
07be1b83
YO
2119#define DEBUG_PEEP(str,scan,depth) \
2120 DEBUG_OPTIMISE_r({ \
2121 SV * const mysv=sv_newmortal(); \
2122 regnode *Next = regnext(scan); \
2123 regprop(RExC_rx, mysv, scan); \
7f69552c 2124 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2125 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2126 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2127 });
2128
1de06328
YO
2129
2130
2131
2132
07be1b83
YO
2133#define JOIN_EXACT(scan,min,flags) \
2134 if (PL_regkind[OP(scan)] == EXACT) \
2135 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2136
be8e71aa 2137STATIC U32
07be1b83
YO
2138S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2139 /* Merge several consecutive EXACTish nodes into one. */
2140 regnode *n = regnext(scan);
2141 U32 stringok = 1;
2142 regnode *next = scan + NODE_SZ_STR(scan);
2143 U32 merged = 0;
2144 U32 stopnow = 0;
2145#ifdef DEBUGGING
2146 regnode *stop = scan;
72f13be8 2147 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2148#else
d47053eb
RGS
2149 PERL_UNUSED_ARG(depth);
2150#endif
2151#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2152 PERL_UNUSED_ARG(flags);
2153 PERL_UNUSED_ARG(val);
07be1b83 2154#endif
07be1b83
YO
2155 DEBUG_PEEP("join",scan,depth);
2156
2157 /* Skip NOTHING, merge EXACT*. */
2158 while (n &&
2159 ( PL_regkind[OP(n)] == NOTHING ||
2160 (stringok && (OP(n) == OP(scan))))
2161 && NEXT_OFF(n)
2162 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2163
2164 if (OP(n) == TAIL || n > next)
2165 stringok = 0;
2166 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2167 DEBUG_PEEP("skip:",n,depth);
2168 NEXT_OFF(scan) += NEXT_OFF(n);
2169 next = n + NODE_STEP_REGNODE;
2170#ifdef DEBUGGING
2171 if (stringok)
2172 stop = n;
2173#endif
2174 n = regnext(n);
2175 }
2176 else if (stringok) {
786e8c11 2177 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2178 regnode * const nnext = regnext(n);
2179
2180 DEBUG_PEEP("merg",n,depth);
2181
2182 merged++;
2183 if (oldl + STR_LEN(n) > U8_MAX)
2184 break;
2185 NEXT_OFF(scan) += NEXT_OFF(n);
2186 STR_LEN(scan) += STR_LEN(n);
2187 next = n + NODE_SZ_STR(n);
2188 /* Now we can overwrite *n : */
2189 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2190#ifdef DEBUGGING
2191 stop = next - 1;
2192#endif
2193 n = nnext;
2194 if (stopnow) break;
2195 }
2196
d47053eb
RGS
2197#ifdef EXPERIMENTAL_INPLACESCAN
2198 if (flags && !NEXT_OFF(n)) {
2199 DEBUG_PEEP("atch", val, depth);
2200 if (reg_off_by_arg[OP(n)]) {
2201 ARG_SET(n, val - n);
2202 }
2203 else {
2204 NEXT_OFF(n) = val - n;
2205 }
2206 stopnow = 1;
2207 }
07be1b83
YO
2208#endif
2209 }
2210
2211 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2212 /*
2213 Two problematic code points in Unicode casefolding of EXACT nodes:
2214
2215 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2216 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2217
2218 which casefold to
2219
2220 Unicode UTF-8
2221
2222 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2223 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2224
2225 This means that in case-insensitive matching (or "loose matching",
2226 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2227 length of the above casefolded versions) can match a target string
2228 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2229 This would rather mess up the minimum length computation.
2230
2231 What we'll do is to look for the tail four bytes, and then peek
2232 at the preceding two bytes to see whether we need to decrease
2233 the minimum length by four (six minus two).
2234
2235 Thanks to the design of UTF-8, there cannot be false matches:
2236 A sequence of valid UTF-8 bytes cannot be a subsequence of
2237 another valid sequence of UTF-8 bytes.
2238
2239 */
2240 char * const s0 = STRING(scan), *s, *t;
2241 char * const s1 = s0 + STR_LEN(scan) - 1;
2242 char * const s2 = s1 - 4;
e294cc5d
JH
2243#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2244 const char t0[] = "\xaf\x49\xaf\x42";
2245#else
07be1b83 2246 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2247#endif
07be1b83
YO
2248 const char * const t1 = t0 + 3;
2249
2250 for (s = s0 + 2;
2251 s < s2 && (t = ninstr(s, s1, t0, t1));
2252 s = t + 4) {
e294cc5d
JH
2253#ifdef EBCDIC
2254 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2255 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2256#else
07be1b83
YO
2257 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2258 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2259#endif
07be1b83
YO
2260 *min -= 4;
2261 }
2262 }
2263
2264#ifdef DEBUGGING
2265 /* Allow dumping */
2266 n = scan + NODE_SZ_STR(scan);
2267 while (n <= stop) {
2268 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2269 OP(n) = OPTIMIZED;
2270 NEXT_OFF(n) = 0;
2271 }
2272 n++;
2273 }
2274#endif
2275 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2276 return stopnow;
2277}
2278
653099ff
GS
2279/* REx optimizer. Converts nodes into quickier variants "in place".
2280 Finds fixed substrings. */
2281
a0288114 2282/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2283 to the position after last scanned or to NULL. */
2284
40d049e4
YO
2285#define INIT_AND_WITHP \
2286 assert(!and_withp); \
2287 Newx(and_withp,1,struct regnode_charclass_class); \
2288 SAVEFREEPV(and_withp)
07be1b83 2289
76e3520e 2290STATIC I32
40d049e4 2291S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2292 I32 *minlenp, I32 *deltap,
40d049e4
YO
2293 regnode *last,
2294 scan_data_t *data,
2295 I32 stopparen,
2296 U8* recursed,
2297 struct regnode_charclass_class *and_withp,
2298 U32 flags, U32 depth)
c277df42
IZ
2299 /* scanp: Start here (read-write). */
2300 /* deltap: Write maxlen-minlen here. */
2301 /* last: Stop before this one. */
40d049e4
YO
2302 /* data: string data about the pattern */
2303 /* stopparen: treat close N as END */
2304 /* recursed: which subroutines have we recursed into */
2305 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2306{
97aff369 2307 dVAR;
c277df42
IZ
2308 I32 min = 0, pars = 0, code;
2309 regnode *scan = *scanp, *next;
2310 I32 delta = 0;
2311 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2312 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2313 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2314 scan_data_t data_fake;
a3621e74 2315 SV *re_trie_maxbuff = NULL;
786e8c11 2316 regnode *first_non_open = scan;
e2e6a0f1 2317 I32 stopmin = I32_MAX;
a3621e74 2318 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2319#ifdef DEBUGGING
40d049e4 2320 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2321#endif
40d049e4 2322
786e8c11 2323 if ( depth == 0 ) {
40d049e4 2324 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2325 first_non_open=regnext(first_non_open);
2326 }
2327
b81d288d 2328
c277df42
IZ
2329 while (scan && OP(scan) != END && scan < last) {
2330 /* Peephole optimizer: */
1de06328 2331 DEBUG_STUDYDATA(data,depth);
07be1b83 2332 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2333 JOIN_EXACT(scan,&min,0);
a3621e74 2334
653099ff
GS
2335 /* Follow the next-chain of the current node and optimize
2336 away all the NOTHINGs from it. */
c277df42 2337 if (OP(scan) != CURLYX) {
a3b680e6 2338 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2339 ? I32_MAX
2340 /* I32 may be smaller than U16 on CRAYs! */
2341 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2342 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2343 int noff;
2344 regnode *n = scan;
b81d288d 2345
c277df42
IZ
2346 /* Skip NOTHING and LONGJMP. */
2347 while ((n = regnext(n))
3dab1dad 2348 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2349 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2350 && off + noff < max)
2351 off += noff;
2352 if (reg_off_by_arg[OP(scan)])
2353 ARG(scan) = off;
b81d288d 2354 else
c277df42
IZ
2355 NEXT_OFF(scan) = off;
2356 }
a3621e74 2357
07be1b83 2358
3dab1dad 2359
653099ff
GS
2360 /* The principal pseudo-switch. Cannot be a switch, since we
2361 look into several different things. */
b81d288d 2362 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2363 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2364 next = regnext(scan);
2365 code = OP(scan);
a3621e74 2366 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2367
2368 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2369 /* NOTE - There is similar code to this block below for handling
2370 TRIE nodes on a re-study. If you change stuff here check there
2371 too. */
c277df42 2372 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2373 struct regnode_charclass_class accum;
d4c19fe8 2374 regnode * const startbranch=scan;
c277df42 2375
653099ff 2376 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2377 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2378 if (flags & SCF_DO_STCLASS)
830247a4 2379 cl_init_zero(pRExC_state, &accum);
a3621e74 2380
c277df42 2381 while (OP(scan) == code) {
830247a4 2382 I32 deltanext, minnext, f = 0, fake;
653099ff 2383 struct regnode_charclass_class this_class;
c277df42
IZ
2384
2385 num++;
2386 data_fake.flags = 0;
40d049e4 2387 if (data) {
2c2d71f5 2388 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2389 data_fake.last_closep = data->last_closep;
2390 }
2391 else
2392 data_fake.last_closep = &fake;
c277df42
IZ
2393 next = regnext(scan);
2394 scan = NEXTOPER(scan);
2395 if (code != BRANCH)
2396 scan = NEXTOPER(scan);
653099ff 2397 if (flags & SCF_DO_STCLASS) {
830247a4 2398 cl_init(pRExC_state, &this_class);
653099ff
GS
2399 data_fake.start_class = &this_class;
2400 f = SCF_DO_STCLASS_AND;
b81d288d 2401 }
e1901655
IZ
2402 if (flags & SCF_WHILEM_VISITED_POS)
2403 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2404
653099ff 2405 /* we suppose the run is continuous, last=next...*/
1de06328 2406 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2407 next, &data_fake,
2408 stopparen, recursed, NULL, f,depth+1);
b81d288d 2409 if (min1 > minnext)
c277df42
IZ
2410 min1 = minnext;
2411 if (max1 < minnext + deltanext)
2412 max1 = minnext + deltanext;
2413 if (deltanext == I32_MAX)
aca2d497 2414 is_inf = is_inf_internal = 1;
c277df42
IZ
2415 scan = next;
2416 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2417 pars++;
e2e6a0f1
YO
2418 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2419 if ( stopmin > minnext)
2420 stopmin = min + min1;
2421 flags &= ~SCF_DO_SUBSTR;
2422 if (data)
2423 data->flags |= SCF_SEEN_ACCEPT;
2424 }
3dab1dad
YO
2425 if (data) {
2426 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2427 data->flags |= SF_HAS_EVAL;
2c2d71f5 2428 data->whilem_c = data_fake.whilem_c;
3dab1dad 2429 }
653099ff 2430 if (flags & SCF_DO_STCLASS)
830247a4 2431 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2432 if (code == SUSPEND)
c277df42
IZ
2433 break;
2434 }
2435 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2436 min1 = 0;
2437 if (flags & SCF_DO_SUBSTR) {
2438 data->pos_min += min1;
2439 data->pos_delta += max1 - min1;
2440 if (max1 != min1 || is_inf)
2441 data->longest = &(data->longest_float);
2442 }
2443 min += min1;
2444 delta += max1 - min1;
653099ff 2445 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2446 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2447 if (min1) {
40d049e4 2448 cl_and(data->start_class, and_withp);
653099ff
GS
2449 flags &= ~SCF_DO_STCLASS;
2450 }
2451 }
2452 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2453 if (min1) {
2454 cl_and(data->start_class, &accum);
653099ff 2455 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2456 }
2457 else {
b81d288d 2458 /* Switch to OR mode: cache the old value of
de0c8cb8 2459 * data->start_class */
40d049e4
YO
2460 INIT_AND_WITHP;
2461 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2462 struct regnode_charclass_class);
2463 flags &= ~SCF_DO_STCLASS_AND;
2464 StructCopy(&accum, data->start_class,
2465 struct regnode_charclass_class);
2466 flags |= SCF_DO_STCLASS_OR;
2467 data->start_class->flags |= ANYOF_EOS;
2468 }
653099ff 2469 }
a3621e74 2470
786e8c11 2471 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2472 /* demq.
2473
2474 Assuming this was/is a branch we are dealing with: 'scan' now
2475 points at the item that follows the branch sequence, whatever
2476 it is. We now start at the beginning of the sequence and look
2477 for subsequences of
2478
786e8c11
YO
2479 BRANCH->EXACT=>x1
2480 BRANCH->EXACT=>x2
2481 tail
a3621e74
YO
2482
2483 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2484
2485 If we can find such a subseqence we need to turn the first
2486 element into a trie and then add the subsequent branch exact
2487 strings to the trie.
2488
2489 We have two cases
2490
786e8c11 2491 1. patterns where the whole set of branch can be converted.
a3621e74 2492
786e8c11 2493 2. patterns where only a subset can be converted.
a3621e74
YO
2494
2495 In case 1 we can replace the whole set with a single regop
2496 for the trie. In case 2 we need to keep the start and end
2497 branchs so
2498
2499 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2500 becomes BRANCH TRIE; BRANCH X;
2501
786e8c11
YO
2502 There is an additional case, that being where there is a
2503 common prefix, which gets split out into an EXACT like node
2504 preceding the TRIE node.
2505
2506 If x(1..n)==tail then we can do a simple trie, if not we make
2507 a "jump" trie, such that when we match the appropriate word
2508 we "jump" to the appopriate tail node. Essentailly we turn
2509 a nested if into a case structure of sorts.
a3621e74
YO
2510
2511 */
786e8c11 2512
3dab1dad 2513 int made=0;
0111c4fd
RGS
2514 if (!re_trie_maxbuff) {
2515 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2516 if (!SvIOK(re_trie_maxbuff))
2517 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2518 }
786e8c11 2519 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2520 regnode *cur;
2521 regnode *first = (regnode *)NULL;
2522 regnode *last = (regnode *)NULL;
2523 regnode *tail = scan;
2524 U8 optype = 0;
2525 U32 count=0;
2526
2527#ifdef DEBUGGING
c445ea15 2528 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2529#endif
2530 /* var tail is used because there may be a TAIL
2531 regop in the way. Ie, the exacts will point to the
2532 thing following the TAIL, but the last branch will
2533 point at the TAIL. So we advance tail. If we
2534 have nested (?:) we may have to move through several
2535 tails.
2536 */
2537
2538 while ( OP( tail ) == TAIL ) {
2539 /* this is the TAIL generated by (?:) */
2540 tail = regnext( tail );
2541 }
2542
3dab1dad 2543
a3621e74 2544 DEBUG_OPTIMISE_r({
32fc9b6a 2545 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2546 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2547 (int)depth * 2 + 2, "",
2548 "Looking for TRIE'able sequences. Tail node is: ",
2549 SvPV_nolen_const( mysv )
a3621e74
YO
2550 );
2551 });
3dab1dad 2552
a3621e74
YO
2553 /*
2554
2555 step through the branches, cur represents each
2556 branch, noper is the first thing to be matched
2557 as part of that branch and noper_next is the
2558 regnext() of that node. if noper is an EXACT
2559 and noper_next is the same as scan (our current
2560 position in the regex) then the EXACT branch is
2561 a possible optimization target. Once we have
2562 two or more consequetive such branches we can
2563 create a trie of the EXACT's contents and stich
2564 it in place. If the sequence represents all of
2565 the branches we eliminate the whole thing and
2566 replace it with a single TRIE. If it is a
2567 subsequence then we need to stitch it in. This
2568 means the first branch has to remain, and needs
2569 to be repointed at the item on the branch chain
2570 following the last branch optimized. This could
2571 be either a BRANCH, in which case the
2572 subsequence is internal, or it could be the
2573 item following the branch sequence in which
2574 case the subsequence is at the end.
2575
2576 */
2577
2578 /* dont use tail as the end marker for this traverse */
2579 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2580 regnode * const noper = NEXTOPER( cur );
be981c67 2581#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2582 regnode * const noper_next = regnext( noper );
be981c67 2583#endif
a3621e74 2584
a3621e74 2585 DEBUG_OPTIMISE_r({
32fc9b6a 2586 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2587 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2588 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2589
32fc9b6a 2590 regprop(RExC_rx, mysv, noper);
a3621e74 2591 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2592 SvPV_nolen_const(mysv));
a3621e74
YO
2593
2594 if ( noper_next ) {
32fc9b6a 2595 regprop(RExC_rx, mysv, noper_next );
a3621e74 2596 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2597 SvPV_nolen_const(mysv));
a3621e74 2598 }
3dab1dad
YO
2599 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2600 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2601 });
3dab1dad
YO
2602 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2603 : PL_regkind[ OP( noper ) ] == EXACT )
2604 || OP(noper) == NOTHING )
786e8c11
YO
2605#ifdef NOJUMPTRIE
2606 && noper_next == tail
2607#endif
2608 && count < U16_MAX)
a3621e74
YO
2609 {
2610 count++;
3dab1dad
YO
2611 if ( !first || optype == NOTHING ) {
2612 if (!first) first = cur;
a3621e74
YO
2613 optype = OP( noper );
2614 } else {
a3621e74 2615 last = cur;
a3621e74
YO
2616 }
2617 } else {
2618 if ( last ) {
786e8c11
YO
2619 make_trie( pRExC_state,
2620 startbranch, first, cur, tail, count,
2621 optype, depth+1 );
a3621e74 2622 }
3dab1dad 2623 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2624#ifdef NOJUMPTRIE
2625 && noper_next == tail
2626#endif
2627 ){
a3621e74
YO
2628 count = 1;
2629 first = cur;
2630 optype = OP( noper );
2631 } else {
2632 count = 0;
2633 first = NULL;
2634 optype = 0;
2635 }
2636 last = NULL;
2637 }
2638 }
2639 DEBUG_OPTIMISE_r({
32fc9b6a 2640 regprop(RExC_rx, mysv, cur);
a3621e74 2641 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2642 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2643 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2644
2645 });
2646 if ( last ) {
786e8c11 2647 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2648#ifdef TRIE_STUDY_OPT
786e8c11
YO
2649 if ( ((made == MADE_EXACT_TRIE &&
2650 startbranch == first)
2651 || ( first_non_open == first )) &&
5d458dd8 2652 depth==0 ) {
786e8c11 2653 flags |= SCF_TRIE_RESTUDY;
5d458dd8
YO
2654 if ( startbranch == first
2655 && scan == tail )
2656 {
2657 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2658 }
2659 }
3dab1dad 2660#endif
07be1b83 2661 }
a3621e74 2662 }
3dab1dad
YO
2663
2664 } /* do trie */
786e8c11 2665
a0ed51b3 2666 }
a3621e74 2667 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2668 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2669 } else /* single branch is optimized. */
c277df42
IZ
2670 scan = NEXTOPER(scan);
2671 continue;
a0ed51b3
LW
2672 }
2673 else if (OP(scan) == EXACT) {
cd439c50 2674 I32 l = STR_LEN(scan);
c445ea15 2675 UV uc;
a0ed51b3 2676 if (UTF) {
a3b680e6 2677 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2678 l = utf8_length(s, s + l);
9041c2e3 2679 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2680 } else {
2681 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2682 }
2683 min += l;
c277df42 2684 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2685 /* The code below prefers earlier match for fixed
2686 offset, later match for variable offset. */
2687 if (data->last_end == -1) { /* Update the start info. */
2688 data->last_start_min = data->pos_min;
2689 data->last_start_max = is_inf
b81d288d 2690 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2691 }
cd439c50 2692 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2693 if (UTF)
2694 SvUTF8_on(data->last_found);
0eda9292 2695 {
9a957fbc 2696 SV * const sv = data->last_found;
a28509cc 2697 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2698 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2699 if (mg && mg->mg_len >= 0)
5e43f467
JH
2700 mg->mg_len += utf8_length((U8*)STRING(scan),
2701 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2702 }
c277df42
IZ
2703 data->last_end = data->pos_min + l;
2704 data->pos_min += l; /* As in the first entry. */
2705 data->flags &= ~SF_BEFORE_EOL;
2706 }
653099ff
GS
2707 if (flags & SCF_DO_STCLASS_AND) {
2708 /* Check whether it is compatible with what we know already! */
2709 int compat = 1;
2710
1aa99e6b 2711 if (uc >= 0x100 ||
516a5887 2712 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2713 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2714 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2715 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2716 )
653099ff
GS
2717 compat = 0;
2718 ANYOF_CLASS_ZERO(data->start_class);
2719 ANYOF_BITMAP_ZERO(data->start_class);
2720 if (compat)
1aa99e6b 2721 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2722 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2723 if (uc < 0x100)
2724 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2725 }
2726 else if (flags & SCF_DO_STCLASS_OR) {
2727 /* false positive possible if the class is case-folded */
1aa99e6b 2728 if (uc < 0x100)
9b877dbb
IH
2729 ANYOF_BITMAP_SET(data->start_class, uc);
2730 else
2731 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2732 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2733 cl_and(data->start_class, and_withp);
653099ff
GS
2734 }
2735 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2736 }
3dab1dad 2737 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2738 I32 l = STR_LEN(scan);
1aa99e6b 2739 UV uc = *((U8*)STRING(scan));
653099ff
GS
2740
2741 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2742 if (flags & SCF_DO_SUBSTR) {
2743 assert(data);
1de06328 2744 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2745 }
a0ed51b3 2746 if (UTF) {
6136c704 2747 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2748 l = utf8_length(s, s + l);
9041c2e3 2749 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2750 }
2751 min += l;
ecaa9b9c 2752 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2753 data->pos_min += l;
653099ff
GS
2754 if (flags & SCF_DO_STCLASS_AND) {
2755 /* Check whether it is compatible with what we know already! */
2756 int compat = 1;
2757
1aa99e6b 2758 if (uc >= 0x100 ||
516a5887 2759 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2760 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2761 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2762 compat = 0;
2763 ANYOF_CLASS_ZERO(data->start_class);
2764 ANYOF_BITMAP_ZERO(data->start_class);
2765 if (compat) {
1aa99e6b 2766 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2767 data->start_class->flags &= ~ANYOF_EOS;
2768 data->start_class->flags |= ANYOF_FOLD;
2769 if (OP(scan) == EXACTFL)
2770 data->start_class->flags |= ANYOF_LOCALE;
2771 }
2772 }
2773 else if (flags & SCF_DO_STCLASS_OR) {
2774 if (data->start_class->flags & ANYOF_FOLD) {
2775 /* false positive possible if the class is case-folded.
2776 Assume that the locale settings are the same... */
1aa99e6b
IH
2777 if (uc < 0x100)
2778 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2779 data->start_class->flags &= ~ANYOF_EOS;
2780 }
40d049e4 2781 cl_and(data->start_class, and_withp);
653099ff
GS
2782 }
2783 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2784 }
bfed75c6 2785 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2786 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2787 I32 f = flags, pos_before = 0;
d4c19fe8 2788 regnode * const oscan = scan;
653099ff
GS
2789 struct regnode_charclass_class this_class;
2790 struct regnode_charclass_class *oclass = NULL;
727f22e3 2791 I32 next_is_eval = 0;
653099ff 2792
3dab1dad 2793 switch (PL_regkind[OP(scan)]) {
653099ff 2794 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2795 scan = NEXTOPER(scan);
2796 goto finish;
2797 case PLUS:
653099ff 2798 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2799 next = NEXTOPER(scan);
653099ff 2800 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2801 mincount = 1;
2802 maxcount = REG_INFTY;
c277df42
IZ
2803 next = regnext(scan);
2804 scan = NEXTOPER(scan);
2805 goto do_curly;
2806 }
2807 }
2808 if (flags & SCF_DO_SUBSTR)
2809 data->pos_min++;
2810 min++;
2811 /* Fall through. */
2812 case STAR:
653099ff
GS
2813 if (flags & SCF_DO_STCLASS) {
2814 mincount = 0;
b81d288d 2815 maxcount = REG_INFTY;
653099ff
GS
2816 next = regnext(scan);
2817 scan = NEXTOPER(scan);
2818 goto do_curly;
2819 }
b81d288d 2820 is_inf = is_inf_internal = 1;
c277df42
IZ
2821 scan = regnext(scan);
2822 if (flags & SCF_DO_SUBSTR) {
1de06328 2823 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2824 data->longest = &(data->longest_float);
2825 }
2826 goto optimize_curly_tail;
2827 case CURLY:
40d049e4
YO
2828 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2829 && (scan->flags == stopparen))
2830 {
2831 mincount = 1;
2832 maxcount = 1;
2833 } else {
2834 mincount = ARG1(scan);
2835 maxcount = ARG2(scan);
2836 }
c277df42 2837 next = regnext(scan);
cb434fcc
IZ
2838 if (OP(scan) == CURLYX) {
2839 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2840 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2841 }
c277df42 2842 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2843 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2844 do_curly:
2845 if (flags & SCF_DO_SUBSTR) {
1de06328 2846 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2847 pos_before = data->pos_min;
2848 }
2849 if (data) {
2850 fl = data->flags;
2851 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2852 if (is_inf)
2853 data->flags |= SF_IS_INF;
2854 }
653099ff 2855 if (flags & SCF_DO_STCLASS) {
830247a4 2856 cl_init(pRExC_state, &this_class);
653099ff
GS
2857 oclass = data->start_class;
2858 data->start_class = &this_class;
2859 f |= SCF_DO_STCLASS_AND;
2860 f &= ~SCF_DO_STCLASS_OR;
2861 }
e1901655
IZ
2862 /* These are the cases when once a subexpression
2863 fails at a particular position, it cannot succeed
2864 even after backtracking at the enclosing scope.
b81d288d 2865
e1901655
IZ
2866 XXXX what if minimal match and we are at the
2867 initial run of {n,m}? */
2868 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2869 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2870
c277df42 2871 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2872 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2873 last, data, stopparen, recursed, NULL,
a3621e74
YO
2874 (mincount == 0
2875 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2876
2877 if (flags & SCF_DO_STCLASS)
2878 data->start_class = oclass;
2879 if (mincount == 0 || minnext == 0) {
2880 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2881 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2882 }
2883 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2884 /* Switch to OR mode: cache the old value of
653099ff 2885 * data->start_class */
40d049e4
YO
2886 INIT_AND_WITHP;
2887 StructCopy(data->start_class, and_withp,
653099ff
GS
2888 struct regnode_charclass_class);
2889 flags &= ~SCF_DO_STCLASS_AND;
2890 StructCopy(&this_class, data->start_class,
2891 struct regnode_charclass_class);
2892 flags |= SCF_DO_STCLASS_OR;
2893 data->start_class->flags |= ANYOF_EOS;
2894 }
2895 } else { /* Non-zero len */
2896 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2897 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2898 cl_and(data->start_class, and_withp);
653099ff
GS
2899 }
2900 else if (flags & SCF_DO_STCLASS_AND)
2901 cl_and(data->start_class, &this_class);
2902 flags &= ~SCF_DO_STCLASS;
2903 }
c277df42
IZ
2904 if (!scan) /* It was not CURLYX, but CURLY. */
2905 scan = next;
041457d9
DM
2906 if ( /* ? quantifier ok, except for (?{ ... }) */
2907 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2908 && (minnext == 0) && (deltanext == 0)
99799961 2909 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2910 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2911 && ckWARN(WARN_REGEXP))
b45f050a 2912 {
830247a4 2913 vWARN(RExC_parse,
b45f050a
JF
2914 "Quantifier unexpected on zero-length expression");
2915 }
2916
c277df42 2917 min += minnext * mincount;
b81d288d 2918 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2919 && (minnext + deltanext) > 0)
2920 || deltanext == I32_MAX);
aca2d497 2921 is_inf |= is_inf_internal;
c277df42
IZ
2922 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2923
2924 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2925 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2926 && data->flags & SF_IN_PAR
2927 && !(data->flags & SF_HAS_EVAL)
2928 && !deltanext && minnext == 1 ) {
2929 /* Try to optimize to CURLYN. */
2930 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2931 regnode * const nxt1 = nxt;
497b47a8
JH
2932#ifdef DEBUGGING
2933 regnode *nxt2;
2934#endif
c277df42
IZ
2935
2936 /* Skip open. */
2937 nxt = regnext(nxt);
bfed75c6 2938 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2939 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2940 && STR_LEN(nxt) == 1))
c277df42 2941 goto nogo;
497b47a8 2942#ifdef DEBUGGING
c277df42 2943 nxt2 = nxt;
497b47a8 2944#endif
c277df42 2945 nxt = regnext(nxt);
b81d288d 2946 if (OP(nxt) != CLOSE)
c277df42 2947 goto nogo;
40d049e4
YO
2948 if (RExC_open_parens) {
2949 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2950 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2951 }
c277df42 2952 /* Now we know that nxt2 is the only contents: */
eb160463 2953 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2954 OP(oscan) = CURLYN;
2955 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2956
c277df42
IZ
2957#ifdef DEBUGGING
2958 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2959 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2960 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2961 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2962 OP(nxt + 1) = OPTIMIZED; /* was count. */
2963 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2964#endif
c277df42 2965 }
c277df42
IZ
2966 nogo:
2967
2968 /* Try optimization CURLYX => CURLYM. */
b81d288d 2969 if ( OP(oscan) == CURLYX && data
c277df42 2970 && !(data->flags & SF_HAS_PAR)
c277df42 2971 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2972 && !deltanext /* atom is fixed width */
2973 && minnext != 0 /* CURLYM can't handle zero width */
2974 ) {
c277df42
IZ
2975 /* XXXX How to optimize if data == 0? */
2976 /* Optimize to a simpler form. */
2977 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2978 regnode *nxt2;
2979
2980 OP(oscan) = CURLYM;
2981 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2982 && (OP(nxt2) != WHILEM))
c277df42
IZ
2983 nxt = nxt2;
2984 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2985 /* Need to optimize away parenths. */
2986 if (data->flags & SF_IN_PAR) {
2987 /* Set the parenth number. */
2988 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2989
b81d288d 2990 if (OP(nxt) != CLOSE)
b45f050a 2991 FAIL("Panic opt close");
eb160463 2992 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2993 if (RExC_open_parens) {
2994 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2995 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2996 }
c277df42
IZ
2997 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2998 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2999
c277df42
IZ
3000#ifdef DEBUGGING
3001 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3002 OP(nxt + 1) = OPTIMIZED; /* was count. */
3003 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3004 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3005#endif
c277df42
IZ
3006#if 0
3007 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3008 regnode *nnxt = regnext(nxt1);
b81d288d 3009
c277df42
IZ
3010 if (nnxt == nxt) {
3011 if (reg_off_by_arg[OP(nxt1)])
3012 ARG_SET(nxt1, nxt2 - nxt1);
3013 else if (nxt2 - nxt1 < U16_MAX)
3014 NEXT_OFF(nxt1) = nxt2 - nxt1;
3015 else
3016 OP(nxt) = NOTHING; /* Cannot beautify */
3017 }
3018 nxt1 = nnxt;
3019 }
3020#endif
3021 /* Optimize again: */
1de06328 3022 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3023 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3024 }
3025 else
c277df42 3026 oscan->flags = 0;
c277df42 3027 }
e1901655
IZ
3028 else if ((OP(oscan) == CURLYX)
3029 && (flags & SCF_WHILEM_VISITED_POS)
3030 /* See the comment on a similar expression above.
3031 However, this time it not a subexpression
3032 we care about, but the expression itself. */
3033 && (maxcount == REG_INFTY)
3034 && data && ++data->whilem_c < 16) {
3035 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3036 /* Find WHILEM (as in regexec.c) */
3037 regnode *nxt = oscan + NEXT_OFF(oscan);
3038
3039 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3040 nxt += ARG(nxt);
eb160463
GS
3041 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3042 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3043 }
b81d288d 3044 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3045 pars++;
3046 if (flags & SCF_DO_SUBSTR) {
c445ea15 3047 SV *last_str = NULL;
c277df42
IZ
3048 int counted = mincount != 0;
3049
3050 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3051#if defined(SPARC64_GCC_WORKAROUND)
3052 I32 b = 0;
3053 STRLEN l = 0;
cfd0369c 3054 const char *s = NULL;
5d1c421c
JH
3055 I32 old = 0;
3056
3057 if (pos_before >= data->last_start_min)
3058 b = pos_before;
3059 else
3060 b = data->last_start_min;
3061
3062 l = 0;
cfd0369c 3063 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3064 old = b - data->last_start_min;
3065
3066#else
b81d288d 3067 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3068 ? pos_before : data->last_start_min;
3069 STRLEN l;
d4c19fe8 3070 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3071 I32 old = b - data->last_start_min;
5d1c421c 3072#endif
a0ed51b3
LW
3073
3074 if (UTF)
3075 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3076
a0ed51b3 3077 l -= old;
c277df42 3078 /* Get the added string: */
79cb57f6 3079 last_str = newSVpvn(s + old, l);
0e933229
IH
3080 if (UTF)
3081 SvUTF8_on(last_str);
c277df42
IZ
3082 if (deltanext == 0 && pos_before == b) {
3083 /* What was added is a constant string */
3084 if (mincount > 1) {
3085 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3086 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3087 SvPVX_const(last_str), l, mincount - 1);
b162af07 3088 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3089 /* Add additional parts. */
b81d288d 3090 SvCUR_set(data->last_found,
c277df42
IZ
3091 SvCUR(data->last_found) - l);
3092 sv_catsv(data->last_found, last_str);
0eda9292
JH
3093 {
3094 SV * sv = data->last_found;
3095 MAGIC *mg =
3096 SvUTF8(sv) && SvMAGICAL(sv) ?
3097 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3098 if (mg && mg->mg_len >= 0)
3099 mg->mg_len += CHR_SVLEN(last_str);
3100 }
c277df42
IZ
3101 data->last_end += l * (mincount - 1);
3102 }
2a8d9689
HS
3103 } else {
3104 /* start offset must point into the last copy */
3105 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3106 data->last_start_max += is_inf ? I32_MAX
3107 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3108 }
3109 }
3110 /* It is counted once already... */
3111 data->pos_min += minnext * (mincount - counted);
3112 data->pos_delta += - counted * deltanext +
3113 (minnext + deltanext) * maxcount - minnext * mincount;
3114 if (mincount != maxcount) {
653099ff
GS
3115 /* Cannot extend fixed substrings found inside
3116 the group. */
1de06328 3117 scan_commit(pRExC_state,data,minlenp);
c277df42 3118 if (mincount && last_str) {
d4c19fe8
AL
3119 SV * const sv = data->last_found;
3120 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3121 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3122
3123 if (mg)
3124 mg->mg_len = -1;
3125 sv_setsv(sv, last_str);
c277df42 3126 data->last_end = data->pos_min;
b81d288d 3127 data->last_start_min =
a0ed51b3 3128 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3129 data->last_start_max = is_inf
3130 ? I32_MAX
c277df42 3131 : data->pos_min + data->pos_delta
a0ed51b3 3132 - CHR_SVLEN(last_str);
c277df42
IZ
3133 }
3134 data->longest = &(data->longest_float);
3135 }
aca2d497 3136 SvREFCNT_dec(last_str);
c277df42 3137 }
405ff068 3138 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3139 data->flags |= SF_HAS_EVAL;
3140 optimize_curly_tail:
c277df42 3141 if (OP(oscan) != CURLYX) {
3dab1dad 3142 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3143 && NEXT_OFF(next))
3144 NEXT_OFF(oscan) += NEXT_OFF(next);
3145 }
c277df42 3146 continue;
653099ff 3147 default: /* REF and CLUMP only? */
c277df42 3148 if (flags & SCF_DO_SUBSTR) {
1de06328 3149 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3150 data->longest = &(data->longest_float);
3151 }
aca2d497 3152 is_inf = is_inf_internal = 1;
653099ff 3153 if (flags & SCF_DO_STCLASS_OR)
830247a4 3154 cl_anything(pRExC_state, data->start_class);
653099ff 3155 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3156 break;
3157 }
a0ed51b3 3158 }
bfed75c6 3159 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3160 int value = 0;
653099ff 3161
c277df42 3162 if (flags & SCF_DO_SUBSTR) {
1de06328 3163 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3164 data->pos_min++;
3165 }
3166 min++;
653099ff
GS
3167 if (flags & SCF_DO_STCLASS) {
3168 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3169
3170 /* Some of the logic below assumes that switching
3171 locale on will only add false positives. */
3dab1dad 3172 switch (PL_regkind[OP(scan)]) {
653099ff 3173 case SANY:
653099ff
GS
3174 default:
3175 do_default:
3176 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3177 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3178 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3179 break;
3180 case REG_ANY:
3181 if (OP(scan) == SANY)
3182 goto do_default;
3183 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3184 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3185 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3186 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3187 }
3188 if (flags & SCF_DO_STCLASS_AND || !value)
3189 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3190 break;
3191 case ANYOF:
3192 if (flags & SCF_DO_STCLASS_AND)
3193 cl_and(data->start_class,
3194 (struct regnode_charclass_class*)scan);
3195 else
830247a4 3196 cl_or(pRExC_state, data->start_class,
653099ff
GS
3197 (struct regnode_charclass_class*)scan);
3198 break;
3199 case ALNUM:
3200 if (flags & SCF_DO_STCLASS_AND) {
3201 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3202 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3203 for (value = 0; value < 256; value++)
3204 if (!isALNUM(value))
3205 ANYOF_BITMAP_CLEAR(data->start_class, value);
3206 }
3207 }
3208 else {
3209 if (data->start_class->flags & ANYOF_LOCALE)
3210 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3211 else {
3212 for (value = 0; value < 256; value++)
3213 if (isALNUM(value))
b81d288d 3214 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3215 }
3216 }
3217 break;
3218 case ALNUML:
3219 if (flags & SCF_DO_STCLASS_AND) {
3220 if (data->start_class->flags & ANYOF_LOCALE)
3221 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3222 }
3223 else {
3224 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3225 data->start_class->flags |= ANYOF_LOCALE;
3226 }
3227 break;
3228 case NALNUM:
3229 if (flags & SCF_DO_STCLASS_AND) {
3230 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3231 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3232 for (value = 0; value < 256; value++)
3233 if (isALNUM(value))
3234 ANYOF_BITMAP_CLEAR(data->start_class, value);
3235 }
3236 }
3237 else {
3238 if (data->start_class->flags & ANYOF_LOCALE)
3239 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3240 else {
3241 for (value = 0; value < 256; value++)
3242 if (!isALNUM(value))
b81d288d 3243 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3244 }
3245 }
3246 break;
3247 case NALNUML:
3248 if (flags & SCF_DO_STCLASS_AND) {
3249 if (data->start_class->flags & ANYOF_LOCALE)
3250 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3251 }
3252 else {
3253 data->start_class->flags |= ANYOF_LOCALE;
3254 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3255 }
3256 break;
3257 case SPACE:
3258 if (flags & SCF_DO_STCLASS_AND) {
3259 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3260 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3261 for (value = 0; value < 256; value++)
3262 if (!isSPACE(value))
3263 ANYOF_BITMAP_CLEAR(data->start_class, value);
3264 }
3265 }
3266 else {
3267 if (data->start_class->flags & ANYOF_LOCALE)
3268 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3269 else {
3270 for (value = 0; value < 256; value++)
3271 if (isSPACE(value))
b81d288d 3272 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3273 }
3274 }
3275 break;
3276 case SPACEL:
3277 if (flags & SCF_DO_STCLASS_AND) {
3278 if (data->start_class->flags & ANYOF_LOCALE)
3279 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3280 }
3281 else {
3282 data->start_class->flags |= ANYOF_LOCALE;
3283 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3284 }
3285 break;
3286 case NSPACE:
3287 if (flags & SCF_DO_STCLASS_AND) {
3288 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3289 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3290 for (value = 0; value < 256; value++)
3291 if (isSPACE(value))
3292 ANYOF_BITMAP_CLEAR(data->start_class, value);
3293 }
3294 }
3295 else {
3296 if (data->start_class->flags & ANYOF_LOCALE)
3297 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3298 else {
3299 for (value = 0; value < 256; value++)
3300 if (!isSPACE(value))
b81d288d 3301 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3302 }
3303 }
3304 break;
3305 case NSPACEL:
3306 if (flags & SCF_DO_STCLASS_AND) {
3307 if (data->start_class->flags & ANYOF_LOCALE) {
3308 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3309 for (value = 0; value < 256; value++)
3310 if (!isSPACE(value))
3311 ANYOF_BITMAP_CLEAR(data->start_class, value);
3312 }
3313 }
3314 else {
3315 data->start_class->flags |= ANYOF_LOCALE;
3316 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3317 }
3318 break;
3319 case DIGIT:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3322 for (value = 0; value < 256; value++)
3323 if (!isDIGIT(value))
3324 ANYOF_BITMAP_CLEAR(data->start_class, value);
3325 }
3326 else {
3327 if (data->start_class->flags & ANYOF_LOCALE)
3328 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3329 else {
3330 for (value = 0; value < 256; value++)
3331 if (isDIGIT(value))
b81d288d 3332 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3333 }
3334 }
3335 break;
3336 case NDIGIT:
3337 if (flags & SCF_DO_STCLASS_AND) {
3338 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3339 for (value = 0; value < 256; value++)
3340 if (isDIGIT(value))
3341 ANYOF_BITMAP_CLEAR(data->start_class, value);
3342 }
3343 else {
3344 if (data->start_class->flags & ANYOF_LOCALE)
3345 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3346 else {
3347 for (value = 0; value < 256; value++)
3348 if (!isDIGIT(value))
b81d288d 3349 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3350 }
3351 }
3352 break;
3353 }
3354 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3355 cl_and(data->start_class, and_withp);
653099ff
GS
3356 flags &= ~SCF_DO_STCLASS;
3357 }
a0ed51b3 3358 }
3dab1dad 3359 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3360 data->flags |= (OP(scan) == MEOL
3361 ? SF_BEFORE_MEOL
3362 : SF_BEFORE_SEOL);
a0ed51b3 3363 }
3dab1dad 3364 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3365 /* Lookbehind, or need to calculate parens/evals/stclass: */
3366 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3367 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3368 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3369 || OP(scan) == UNLESSM )
3370 {
3371 /* Negative Lookahead/lookbehind
3372 In this case we can't do fixed string optimisation.
3373 */
3374
3375 I32 deltanext, minnext, fake = 0;
3376 regnode *nscan;
3377 struct regnode_charclass_class intrnl;
3378 int f = 0;
3379
3380 data_fake.flags = 0;
3381 if (data) {
3382 data_fake.whilem_c = data->whilem_c;
3383 data_fake.last_closep = data->last_closep;
a0ed51b3 3384 }
1de06328
YO
3385 else
3386 data_fake.last_closep = &fake;
3387 if ( flags & SCF_DO_STCLASS && !scan->flags
3388 && OP(scan) == IFMATCH ) { /* Lookahead */
3389 cl_init(pRExC_state, &intrnl);
3390 data_fake.start_class = &intrnl;
3391 f |= SCF_DO_STCLASS_AND;
c277df42 3392 }
1de06328
YO
3393 if (flags & SCF_WHILEM_VISITED_POS)
3394 f |= SCF_WHILEM_VISITED_POS;
3395 next = regnext(scan);
3396 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3397 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3398 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3399 if (scan->flags) {
3400 if (deltanext) {
3401 vFAIL("Variable length lookbehind not implemented");
3402 }
3403 else if (minnext > (I32)U8_MAX) {
3404 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3405 }
3406 scan->flags = (U8)minnext;
3407 }
3408 if (data) {
3409 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3410 pars++;
3411 if (data_fake.flags & SF_HAS_EVAL)
3412 data->flags |= SF_HAS_EVAL;
3413 data->whilem_c = data_fake.whilem_c;
3414 }
3415 if (f & SCF_DO_STCLASS_AND) {
3416 const int was = (data->start_class->flags & ANYOF_EOS);
3417
3418 cl_and(data->start_class, &intrnl);
3419 if (was)
3420 data->start_class->flags |= ANYOF_EOS;
3421 }
be8e71aa 3422 }
1de06328
YO
3423#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3424 else {
3425 /* Positive Lookahead/lookbehind
3426 In this case we can do fixed string optimisation,
3427 but we must be careful about it. Note in the case of
3428 lookbehind the positions will be offset by the minimum
3429 length of the pattern, something we won't know about
3430 until after the recurse.
3431 */
3432 I32 deltanext, fake = 0;
3433 regnode *nscan;
3434 struct regnode_charclass_class intrnl;
3435 int f = 0;
3436 /* We use SAVEFREEPV so that when the full compile
3437 is finished perl will clean up the allocated
3438 minlens when its all done. This was we don't
3439 have to worry about freeing them when we know
3440 they wont be used, which would be a pain.
3441 */
3442 I32 *minnextp;
3443 Newx( minnextp, 1, I32 );
3444 SAVEFREEPV(minnextp);
3445
3446 if (data) {
3447 StructCopy(data, &data_fake, scan_data_t);
3448 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3449 f |= SCF_DO_SUBSTR;
3450 if (scan->flags)
3451 scan_commit(pRExC_state, &data_fake,minlenp);
3452 data_fake.last_found=newSVsv(data->last_found);
3453 }
3454 }
3455 else
3456 data_fake.last_closep = &fake;
3457 data_fake.flags = 0;
3458 if (is_inf)
3459 data_fake.flags |= SF_IS_INF;
3460 if ( flags & SCF_DO_STCLASS && !scan->flags
3461 && OP(scan) == IFMATCH ) { /* Lookahead */
3462 cl_init(pRExC_state, &intrnl);
3463 data_fake.start_class = &intrnl;
3464 f |= SCF_DO_STCLASS_AND;
3465 }
3466 if (flags & SCF_WHILEM_VISITED_POS)
3467 f |= SCF_WHILEM_VISITED_POS;
3468 next = regnext(scan);
3469 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3470
3471 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3472 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3473 if (scan->flags) {
3474 if (deltanext) {
3475 vFAIL("Variable length lookbehind not implemented");
3476 }
3477 else if (*minnextp > (I32)U8_MAX) {
3478 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3479 }
3480 scan->flags = (U8)*minnextp;
3481 }
40d049e4 3482
1de06328 3483 *minnextp += min;
40d049e4 3484
1de06328
YO
3485 if (f & SCF_DO_STCLASS_AND) {
3486 const int was = (data->start_class->flags & ANYOF_EOS);
3487
3488 cl_and(data->start_class, &intrnl);
3489 if (was)
3490 data->start_class->flags |= ANYOF_EOS;
40d049e4 3491 }
1de06328
YO
3492 if (data) {
3493 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3494 pars++;
3495 if (data_fake.flags & SF_HAS_EVAL)
3496 data->flags |= SF_HAS_EVAL;
3497 data->whilem_c = data_fake.whilem_c;
3498 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3499 if (RExC_rx->minlen<*minnextp)
3500 RExC_rx->minlen=*minnextp;
3501 scan_commit(pRExC_state, &data_fake, minnextp);
3502 SvREFCNT_dec(data_fake.last_found);
3503
3504 if ( data_fake.minlen_fixed != minlenp )
3505 {
3506 data->offset_fixed= data_fake.offset_fixed;
3507 data->minlen_fixed= data_fake.minlen_fixed;
3508 data->lookbehind_fixed+= scan->flags;
3509 }
3510 if ( data_fake.minlen_float != minlenp )
3511 {
3512 data->minlen_float= data_fake.minlen_float;
3513 data->offset_float_min=data_fake.offset_float_min;
3514 data->offset_float_max=data_fake.offset_float_max;
3515 data->lookbehind_float+= scan->flags;
3516 }
3517 }
3518 }
3519
653099ff 3520
653099ff 3521 }
1de06328 3522#endif
a0ed51b3
LW
3523 }
3524 else if (OP(scan) == OPEN) {
40d049e4
YO
3525 if (stopparen != (I32)ARG(scan))
3526 pars++;
a0ed51b3 3527 }
cb434fcc 3528 else if (OP(scan) == CLOSE) {
40d049e4
YO
3529 if (stopparen == (I32)ARG(scan)) {
3530 break;
3531 }
eb160463 3532 if ((I32)ARG(scan) == is_par) {
cb434fcc 3533 next = regnext(scan);
c277df42 3534
cb434fcc
IZ
3535 if ( next && (OP(next) != WHILEM) && next < last)
3536 is_par = 0; /* Disable optimization */
3537 }
3538 if (data)
3539 *(data->last_closep) = ARG(scan);
a0ed51b3 3540 }
1a147d38 3541 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3542 /* set the pointer */
3543 I32 paren;
3544 regnode *start;
3545 regnode *end;
1a147d38 3546 if (OP(scan) == GOSUB) {
40d049e4
YO
3547 paren = ARG(scan);
3548 RExC_recurse[ARG2L(scan)] = scan;
3549 start = RExC_open_parens[paren-1];
3550 end = RExC_close_parens[paren-1];
3551 } else {
3552 paren = 0;
3553 start = RExC_rx->program + 1;
3554 end = RExC_opend;
3555 }
3556 assert(start);
3557 assert(end);
3558 if (!recursed) {
3559 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3560 SAVEFREEPV(recursed);
3561 }
3562 if (!PAREN_TEST(recursed,paren+1)) {
3563 I32 deltanext = 0;
3564 PAREN_SET(recursed,paren+1);
3565
3566 DEBUG_PEEP("goto",start,depth);
3567 min += study_chunk(
3568 pRExC_state,
3569 &start,
3570 minlenp,
3571 &deltanext,
3572 end+1,
3573 data,
3574 paren,
3575 recursed,
3576 and_withp,
3577 flags,depth+1);
3578 delta+=deltanext;
3579 if (deltanext == I32_MAX) {
3580 is_inf = is_inf_internal = 1;
3581 delta=deltanext;
3582 }
3583 DEBUG_PEEP("rtrn",end,depth);
3584 PAREN_UNSET(recursed,paren+1);
3585 } else {
3586 if (flags & SCF_DO_SUBSTR) {
3587 scan_commit(pRExC_state,data,minlenp);
3588 data->longest = &(data->longest_float);
3589 }
3590 is_inf = is_inf_internal = 1;
3591 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3592 cl_anything(pRExC_state, data->start_class);
3593 flags &= ~SCF_DO_STCLASS;
3594 }
3595 }
a0ed51b3 3596 else if (OP(scan) == EVAL) {
c277df42
IZ
3597 if (data)
3598 data->flags |= SF_HAS_EVAL;
3599 }
e2e6a0f1 3600 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
40d049e4
YO
3601 if (flags & SCF_DO_SUBSTR) {
3602 scan_commit(pRExC_state,data,minlenp);
3603 flags &= ~SCF_DO_SUBSTR;
3604 }
e2e6a0f1
YO
3605 if (data && OP(scan)==ACCEPT) {
3606 data->flags |= SCF_SEEN_ACCEPT;
3607 if (stopmin > min)
3608 stopmin = min;
3609 }
40d049e4
YO
3610 }
3611 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3612 {
0f5d15d6 3613 if (flags & SCF_DO_SUBSTR) {
1de06328 3614 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3615 data->longest = &(data->longest_float);
3616 }
3617 is_inf = is_inf_internal = 1;
653099ff 3618 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3619 cl_anything(pRExC_state, data->start_class);
96776eda 3620 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3621 }
786e8c11 3622#ifdef TRIE_STUDY_OPT
40d049e4 3623#ifdef FULL_TRIE_STUDY
786e8c11
YO
3624 else if (PL_regkind[OP(scan)] == TRIE) {
3625 /* NOTE - There is similar code to this block above for handling
40d049e4 3626 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3627 check there too. */
7f69552c 3628 regnode *trie_node= scan;
786e8c11
YO
3629 regnode *tail= regnext(scan);
3630 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3631 I32 max1 = 0, min1 = I32_MAX;
3632 struct regnode_charclass_class accum;
3633
3634 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3635 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3636 if (flags & SCF_DO_STCLASS)
3637 cl_init_zero(pRExC_state, &accum);
3638
3639 if (!trie->jump) {
3640 min1= trie->minlen;
3641 max1= trie->maxlen;
3642 } else {
3643 const regnode *nextbranch= NULL;
3644 U32 word;
3645
3646 for ( word=1 ; word <= trie->wordcount ; word++)
3647 {
3648 I32 deltanext=0, minnext=0, f = 0, fake;
3649 struct regnode_charclass_class this_class;
3650
3651 data_fake.flags = 0;
3652 if (data) {
3653 data_fake.whilem_c = data->whilem_c;
3654 data_fake.last_closep = data->last_closep;
3655 }
3656 else
3657 data_fake.last_closep = &fake;
3658
3659 if (flags & SCF_DO_STCLASS) {
3660 cl_init(pRExC_state, &this_class);
3661 data_fake.start_class = &this_class;
3662 f = SCF_DO_STCLASS_AND;
3663 }
3664 if (flags & SCF_WHILEM_VISITED_POS)
3665 f |= SCF_WHILEM_VISITED_POS;
3666
3667 if (trie->jump[word]) {
3668 if (!nextbranch)
7f69552c
YO
3669 nextbranch = trie_node + trie->jump[0];
3670 scan= trie_node + trie->jump[word];
786e8c11
YO
3671 /* We go from the jump point to the branch that follows
3672 it. Note this means we need the vestigal unused branches
3673 even though they arent otherwise used.
3674 */
40d049e4
YO
3675 minnext = study_chunk(pRExC_state, &scan, minlenp,
3676 &deltanext, (regnode *)nextbranch, &data_fake,
3677 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3678 }
3679 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3680 nextbranch= regnext((regnode*)nextbranch);
3681
3682 if (min1 > (I32)(minnext + trie->minlen))
3683 min1 = minnext + trie->minlen;
3684 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3685 max1 = minnext + deltanext + trie->maxlen;
3686 if (deltanext == I32_MAX)
3687 is_inf = is_inf_internal = 1;
3688
3689 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3690 pars++;
e2e6a0f1
YO
3691 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3692 if ( stopmin > min + min1)
3693 stopmin = min + min1;
3694 flags &= ~SCF_DO_SUBSTR;
3695 if (data)
3696 data->flags |= SCF_SEEN_ACCEPT;
3697 }
786e8c11
YO
3698 if (data) {
3699 if (data_fake.flags & SF_HAS_EVAL)
3700 data->flags |= SF_HAS_EVAL;
3701 data->whilem_c = data_fake.whilem_c;
3702 }
3703 if (flags & SCF_DO_STCLASS)
3704 cl_or(pRExC_state, &accum, &this_class);
3705 }
3706 }
3707 if (flags & SCF_DO_SUBSTR) {
3708 data->pos_min += min1;
3709 data->pos_delta += max1 - min1;
3710 if (max1 != min1 || is_inf)
3711 data->longest = &(data->longest_float);
3712 }
3713 min += min1;
3714 delta += max1 - min1;
3715 if (flags & SCF_DO_STCLASS_OR) {
3716 cl_or(pRExC_state, data->start_class, &accum);
3717 if (min1) {
40d049e4 3718 cl_and(data->start_class, and_withp);
786e8c11
YO
3719 flags &= ~SCF_DO_STCLASS;
3720 }
3721 }
3722 else if (flags & SCF_DO_STCLASS_AND) {
3723 if (min1) {
3724 cl_and(data->start_class, &accum);
3725 flags &= ~SCF_DO_STCLASS;
3726 }
3727 else {
3728 /* Switch to OR mode: cache the old value of
3729 * data->start_class */
40d049e4
YO
3730 INIT_AND_WITHP;
3731 StructCopy(data->start_class, and_withp,
786e8c11
YO
3732 struct regnode_charclass_class);
3733 flags &= ~SCF_DO_STCLASS_AND;
3734 StructCopy(&accum, data->start_class,
3735 struct regnode_charclass_class);
3736 flags |= SCF_DO_STCLASS_OR;
3737 data->start_class->flags |= ANYOF_EOS;
3738 }
3739 }
3740 scan= tail;
3741 continue;
3742 }
3743#else
3744 else if (PL_regkind[OP(scan)] == TRIE) {
3745 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3746 U8*bang=NULL;
3747
3748 min += trie->minlen;
3749 delta += (trie->maxlen - trie->minlen);
3750 flags &= ~SCF_DO_STCLASS; /* xxx */
3751 if (flags & SCF_DO_SUBSTR) {
1de06328 3752 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3753 data->pos_min += trie->minlen;
3754 data->pos_delta += (trie->maxlen - trie->minlen);
3755 if (trie->maxlen != trie->minlen)
3756 data->longest = &(data->longest_float);
3757 }
3758 if (trie->jump) /* no more substrings -- for now /grr*/
3759 flags &= ~SCF_DO_SUBSTR;
3760 }
3761#endif /* old or new */
3762#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3763 /* Else: zero-length, ignore. */
3764 scan = regnext(scan);
3765 }
3766
3767 finish:
3768 *scanp = scan;
aca2d497 3769 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3770 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3771 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3772 if (is_par > (I32)U8_MAX)
c277df42
IZ
3773 is_par = 0;
3774 if (is_par && pars==1 && data) {
3775 data->flags |= SF_IN_PAR;
3776 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3777 }
3778 else if (pars && data) {
c277df42
IZ
3779 data->flags |= SF_HAS_PAR;
3780 data->flags &= ~SF_IN_PAR;
3781 }
653099ff 3782 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3783 cl_and(data->start_class, and_withp);
786e8c11
YO
3784 if (flags & SCF_TRIE_RESTUDY)
3785 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3786
3787 DEBUG_STUDYDATA(data,depth);
3788
e2e6a0f1 3789 return min < stopmin ? min : stopmin;
c277df42
IZ
3790}
3791
76e3520e 3792STATIC I32
5f66b61c 3793S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3794{
830247a4 3795 if (RExC_rx->data) {
2eb97020 3796 const U32 count = RExC_rx->data->count;
b81d288d 3797 Renewc(RExC_rx->data,
2eb97020 3798 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3799 char, struct reg_data);
2eb97020 3800 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3801 RExC_rx->data->count += n;
a0ed51b3
LW
3802 }
3803 else {
a02a5408 3804 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3805 char, struct reg_data);
a02a5408 3806 Newx(RExC_rx->data->what, n, U8);
830247a4 3807 RExC_rx->data->count = n;
c277df42 3808 }
830247a4
IZ
3809 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3810 return RExC_rx->data->count - n;
c277df42
IZ
3811}
3812
76234dfb 3813#ifndef PERL_IN_XSUB_RE
d88dccdf 3814void
864dbfa3 3815Perl_reginitcolors(pTHX)
d88dccdf 3816{
97aff369 3817 dVAR;
1df70142 3818 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3819 if (s) {
1df70142
AL
3820 char *t = savepv(s);
3821 int i = 0;
3822 PL_colors[0] = t;
d88dccdf 3823 while (++i < 6) {
1df70142
AL
3824 t = strchr(t, '\t');
3825 if (t) {
3826 *t = '\0';
3827 PL_colors[i] = ++t;
d88dccdf
IZ
3828 }
3829 else
1df70142 3830 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3831 }
3832 } else {
1df70142 3833 int i = 0;
b81d288d 3834 while (i < 6)
06b5626a 3835 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3836 }
3837 PL_colorset = 1;
3838}
76234dfb 3839#endif
8615cb43 3840
07be1b83 3841
786e8c11
YO
3842#ifdef TRIE_STUDY_OPT
3843#define CHECK_RESTUDY_GOTO \
3844 if ( \
3845 (data.flags & SCF_TRIE_RESTUDY) \
3846 && ! restudied++ \
3847 ) goto reStudy
3848#else
3849#define CHECK_RESTUDY_GOTO
3850#endif
f9f4320a 3851
a687059c 3852/*
e50aee73 3853 - pregcomp - compile a regular expression into internal code
a687059c
LW
3854 *
3855 * We can't allocate space until we know how big the compiled form will be,
3856 * but we can't compile it (and thus know how big it is) until we've got a
3857 * place to put the code. So we cheat: we compile it twice, once with code
3858 * generation turned off and size counting turned on, and once "for real".
3859 * This also means that we don't allocate space until we are sure that the
3860 * thing really will compile successfully, and we never have to move the
3861 * code and thus invalidate pointers into it. (Note that it has to be in
3862 * one piece because free() must be able to free it all.) [NB: not true in perl]
3863 *
3864 * Beware that the optimization-preparation code in here knows about some
3865 * of the structure of the compiled regexp. [I'll say.]
3866 */
b9b4dddf
YO
3867
3868
3869
f9f4320a 3870#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3871#define RE_ENGINE_PTR &PL_core_reg_engine
3872#else
f9f4320a
YO
3873extern const struct regexp_engine my_reg_engine;
3874#define RE_ENGINE_PTR &my_reg_engine
3875#endif
b9b4dddf
YO
3876/* these make a few things look better, to avoid indentation */
3877#define BEGIN_BLOCK {
f9f4320a
YO
3878#define END_BLOCK }
3879
a687059c 3880regexp *
864dbfa3 3881Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3882{
97aff369 3883 dVAR;
f9f4320a
YO
3884 GET_RE_DEBUG_FLAGS_DECL;
3885 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3886#ifndef PERL_IN_XSUB_RE
3887 BEGIN_BLOCK
f9f4320a
YO
3888 /* Dispatch a request to compile a regexp to correct
3889 regexp engine. */
3890 HV * const table = GvHV(PL_hintgv);
3891 if (table) {
3892 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3893 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3894 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3895 DEBUG_COMPILE_r({
8d8756e7 3896 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3897 SvIV(*ptr));
3898 });
f2f78491 3899 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3900 }
b9b4dddf
YO
3901 }
3902 END_BLOCK
3903#endif
3904 BEGIN_BLOCK
a0d0e21e 3905 register regexp *r;
c277df42 3906 regnode *scan;
c277df42 3907 regnode *first;
a0d0e21e 3908 I32 flags;
a0d0e21e
LW
3909 I32 minlen = 0;
3910 I32 sawplus = 0;
3911 I32 sawopen = 0;
2c2d71f5 3912 scan_data_t data;
830247a4 3913 RExC_state_t RExC_state;
be8e71aa 3914 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3915#ifdef TRIE_STUDY_OPT
3916 int restudied= 0;
3917 RExC_state_t copyRExC_state;
3918#endif
a0d0e21e 3919 if (exp == NULL)
c277df42 3920 FAIL("NULL regexp argument");
a0d0e21e 3921
a5961de5 3922 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3923
5cfc7842 3924 RExC_precomp = exp;
a3621e74 3925 DEBUG_COMPILE_r({
ab3bbdeb
YO
3926 SV *dsv= sv_newmortal();
3927 RE_PV_QUOTED_DECL(s, RExC_utf8,
3928 dsv, RExC_precomp, (xend - exp), 60);
3929 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3930 PL_colors[4],PL_colors[5],s);
a5961de5 3931 });
e2509266 3932 RExC_flags = pm->op_pmflags;
830247a4 3933 RExC_sawback = 0;
bbce6d69 3934
830247a4
IZ
3935 RExC_seen = 0;
3936 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3937 RExC_seen_evals = 0;
3938 RExC_extralen = 0;
c277df42 3939
bbce6d69 3940 /* First pass: determine size, legality. */
830247a4 3941 RExC_parse = exp;
fac92740 3942 RExC_start = exp;
830247a4
IZ
3943 RExC_end = xend;
3944 RExC_naughty = 0;
3945 RExC_npar = 1;
e2e6a0f1 3946 RExC_nestroot = 0;
830247a4
IZ
3947 RExC_size = 0L;
3948 RExC_emit = &PL_regdummy;
3949 RExC_whilem_seen = 0;
fc8cd66c 3950 RExC_charnames = NULL;
40d049e4
YO
3951 RExC_open_parens = NULL;
3952 RExC_close_parens = NULL;
3953 RExC_opend = NULL;
81714fb9 3954 RExC_paren_names = NULL;
40d049e4
YO
3955 RExC_recurse = NULL;
3956 RExC_recurse_count = 0;
81714fb9 3957
85ddcde9
JH
3958#if 0 /* REGC() is (currently) a NOP at the first pass.
3959 * Clever compilers notice this and complain. --jhi */
830247a4 3960 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3961#endif
3dab1dad
YO
3962 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3963 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3964 RExC_precomp = NULL;
a0d0e21e
LW
3965 return(NULL);
3966 }
07be1b83 3967 DEBUG_PARSE_r({
81714fb9
YO
3968 PerlIO_printf(Perl_debug_log,
3969 "Required size %"IVdf" nodes\n"
3970 "Starting second pass (creation)\n",
3971 (IV)RExC_size);
07be1b83
YO
3972 RExC_lastnum=0;
3973 RExC_lastparse=NULL;
3974 });
c277df42
IZ
3975 /* Small enough for pointer-storage convention?
3976 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3977 if (RExC_size >= 0x10000L && RExC_extralen)
3978 RExC_size += RExC_extralen;
c277df42 3979 else
830247a4
IZ
3980 RExC_extralen = 0;
3981 if (RExC_whilem_seen > 15)
3982 RExC_whilem_seen = 15;
a0d0e21e 3983
e2e6a0f1
YO
3984#ifdef DEBUGGING
3985 /* Make room for a sentinel value at the end of the program */
3986 RExC_size++;
3987#endif
3988
f9f4320a
YO
3989 /* Allocate space and zero-initialize. Note, the two step process
3990 of zeroing when in debug mode, thus anything assigned has to
3991 happen after that */
a02a5408 3992 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3993 char, regexp);
a0d0e21e 3994 if (r == NULL)
b45f050a 3995 FAIL("Regexp out of space");
0f79a09d
GS
3996#ifdef DEBUGGING
3997 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3998 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3999#endif
f9f4320a
YO
4000 /* initialization begins here */
4001 r->engine= RE_ENGINE_PTR;
c277df42 4002 r->refcnt = 1;
bbce6d69 4003 r->prelen = xend - exp;
5cfc7842 4004 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 4005 r->subbeg = NULL;
f8c7b90f 4006#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 4007 r->saved_copy = NULL;
ed252734 4008#endif
cf93c79d 4009 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 4010 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 4011 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
4012
4013 r->substrs = 0; /* Useful during FAIL. */
4014 r->startp = 0; /* Useful during FAIL. */
6bda09f9 4015 r->endp = 0;
81714fb9
YO
4016 r->paren_names = 0;
4017
6bda09f9 4018 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4019 Newxz(RExC_open_parens, RExC_npar,regnode *);
4020 SAVEFREEPV(RExC_open_parens);
4021 Newxz(RExC_close_parens,RExC_npar,regnode *);
4022 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4023 }
4024
4025 /* Useful during FAIL. */
a02a5408 4026 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 4027 if (r->offsets) {
2af232bd 4028 r->offsets[0] = RExC_size;
fac92740 4029 }
a3621e74 4030 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
4031 "%s %"UVuf" bytes for offset annotations.\n",
4032 r->offsets ? "Got" : "Couldn't get",
392fbf5d 4033 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4034
830247a4 4035 RExC_rx = r;
bbce6d69 4036
4037 /* Second pass: emit code. */
e2509266 4038 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4039 RExC_parse = exp;
4040 RExC_end = xend;
4041 RExC_naughty = 0;
4042 RExC_npar = 1;
fac92740 4043 RExC_emit_start = r->program;
830247a4 4044 RExC_emit = r->program;
e2e6a0f1
YO
4045#ifdef DEBUGGING
4046 /* put a sentinal on the end of the program so we can check for
4047 overwrites */
4048 r->program[RExC_size].type = 255;
4049#endif
2cd61cdb 4050 /* Store the count of eval-groups for security checks: */
786e8c11 4051 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4052 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4053 r->data = 0;
3dab1dad 4054 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4055 return(NULL);
6bda09f9 4056
07be1b83
YO
4057 /* XXXX To minimize changes to RE engine we always allocate
4058 3-units-long substrs field. */
4059 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4060 if (RExC_recurse_count) {
4061 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4062 SAVEFREEPV(RExC_recurse);
4063 }
a0d0e21e 4064
07be1b83 4065reStudy:
1de06328 4066 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4067 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4068
07be1b83
YO
4069#ifdef TRIE_STUDY_OPT
4070 if ( restudied ) {
5d458dd8 4071 U32 seen=RExC_seen;
07be1b83 4072 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4073
4074 RExC_state = copyRExC_state;
4075 if (seen & REG_TOP_LEVEL_BRANCHES)
4076 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4077 else
4078 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4079 if (data.last_found) {
07be1b83 4080 SvREFCNT_dec(data.longest_fixed);
07be1b83 4081 SvREFCNT_dec(data.longest_float);
07be1b83 4082 SvREFCNT_dec(data.last_found);
1de06328 4083 }
40d049e4 4084 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4085 } else {
40d049e4 4086 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4087 copyRExC_state = RExC_state;
07be1b83 4088 }
40d049e4
YO
4089#else
4090 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4091#endif
fc8cd66c 4092
a0d0e21e 4093 /* Dig out information for optimizations. */
cf93c79d 4094 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4095 pm->op_pmflags = RExC_flags;
a0ed51b3 4096 if (UTF)
5ff6fc6d 4097 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4098 r->regstclass = NULL;
830247a4 4099 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4100 r->reganch |= ROPT_NAUGHTY;
c277df42 4101 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4102
1de06328
YO
4103 /* testing for BRANCH here tells us whether there is "must appear"
4104 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4105 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4106 I32 fake;
c5254dd6 4107 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4108 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4109 int stclass_flag;
07be1b83 4110 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4111
4112 first = scan;
c277df42 4113 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4114 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4115 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4116 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4117 /* for now we can't handle lookbehind IFMATCH*/
4118 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4119 (OP(first) == PLUS) ||
4120 (OP(first) == MINMOD) ||
653099ff 4121 /* An {n,m} with n>0 */
07be1b83
YO
4122 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4123 {
786e8c11 4124
a0d0e21e
LW
4125 if (OP(first) == PLUS)
4126 sawplus = 1;
4127 else
3dab1dad 4128 first += regarglen[OP(first)];
07be1b83
YO
4129 if (OP(first) == IFMATCH) {
4130 first = NEXTOPER(first);
4131 first += EXTRA_STEP_2ARGS;
7c167cea 4132 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4133 first = NEXTOPER(first);
a687059c
LW
4134 }
4135
a0d0e21e
LW
4136 /* Starting-point info. */
4137 again:
786e8c11 4138 DEBUG_PEEP("first:",first,0);
07be1b83 4139 /* Ignore EXACT as we deal with it later. */
3dab1dad 4140 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4141 if (OP(first) == EXACT)
6f207bd3 4142 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4143 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4144 r->regstclass = first;
4145 }
07be1b83 4146#ifdef TRIE_STCLASS
786e8c11 4147 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4148 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4149 {
786e8c11 4150 regnode *trie_op;
07be1b83 4151 /* this can happen only on restudy */
786e8c11
YO
4152 if ( OP(first) == TRIE ) {
4153 struct regnode_1 *trieop;
4154 Newxz(trieop,1,struct regnode_1);
4155 StructCopy(first,trieop,struct regnode_1);
4156 trie_op=(regnode *)trieop;
4157 } else {
4158 struct regnode_charclass *trieop;
4159 Newxz(trieop,1,struct regnode_charclass);
4160 StructCopy(first,trieop,struct regnode_charclass);
4161 trie_op=(regnode *)trieop;
4162 }
1de06328 4163 OP(trie_op)+=2;
786e8c11
YO
4164 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4165 r->regstclass = trie_op;
07be1b83
YO
4166 }
4167#endif
bfed75c6 4168 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4169 r->regstclass = first;
3dab1dad
YO
4170 else if (PL_regkind[OP(first)] == BOUND ||
4171 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4172 r->regstclass = first;
3dab1dad 4173 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4174 r->reganch |= (OP(first) == MBOL
4175 ? ROPT_ANCH_MBOL
4176 : (OP(first) == SBOL
4177 ? ROPT_ANCH_SBOL
4178 : ROPT_ANCH_BOL));
a0d0e21e 4179 first = NEXTOPER(first);
774d564b 4180 goto again;
4181 }
4182 else if (OP(first) == GPOS) {
4183 r->reganch |= ROPT_ANCH_GPOS;
4184 first = NEXTOPER(first);
4185 goto again;
a0d0e21e 4186 }
e09294f4 4187 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4188 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4189 !(r->reganch & ROPT_ANCH) )
4190 {
4191 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4192 const int type =
4193 (OP(NEXTOPER(first)) == REG_ANY)
4194 ? ROPT_ANCH_MBOL
4195 : ROPT_ANCH_SBOL;
cad2e5aa 4196 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4197 first = NEXTOPER(first);
774d564b 4198 goto again;
a0d0e21e 4199 }
b81d288d 4200 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4201 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4202 /* x+ must match at the 1st pos of run of x's */
4203 r->reganch |= ROPT_SKIP;
a0d0e21e 4204
c277df42 4205 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4206#ifdef TRIE_STUDY_OPT
81714fb9 4207 DEBUG_PARSE_r(
be8e71aa
YO
4208 if (!restudied)
4209 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4210 (IV)(first - scan + 1))
4211 );
4212#else
81714fb9 4213 DEBUG_PARSE_r(
be8e71aa
YO
4214 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4215 (IV)(first - scan + 1))
4216 );
4217#endif
4218
4219
a0d0e21e
LW
4220 /*
4221 * If there's something expensive in the r.e., find the
4222 * longest literal string that must appear and make it the
4223 * regmust. Resolve ties in favor of later strings, since
4224 * the regstart check works with the beginning of the r.e.
4225 * and avoiding duplication strengthens checking. Not a
4226 * strong reason, but sufficient in the absence of others.
4227 * [Now we resolve ties in favor of the earlier string if
c277df42 4228 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4229 * earlier string may buy us something the later one won't.]
4230 */
de8c5301 4231
396482e1
GA
4232 data.longest_fixed = newSVpvs("");
4233 data.longest_float = newSVpvs("");
4234 data.last_found = newSVpvs("");
c277df42
IZ
4235 data.longest = &(data.longest_fixed);
4236 first = scan;
653099ff 4237 if (!r->regstclass) {
830247a4 4238 cl_init(pRExC_state, &ch_class);
653099ff
GS
4239 data.start_class = &ch_class;
4240 stclass_flag = SCF_DO_STCLASS_AND;
4241 } else /* XXXX Check for BOUND? */
4242 stclass_flag = 0;
cb434fcc 4243 data.last_closep = &last_close;
de8c5301 4244
1de06328 4245 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4246 &data, -1, NULL, NULL,
4247 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4248
07be1b83 4249
786e8c11
YO
4250 CHECK_RESTUDY_GOTO;
4251
4252
830247a4 4253 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4254 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4255 && !RExC_seen_zerolen
4256 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4257 r->reganch |= ROPT_CHECK_ALL;
1de06328 4258 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4259 SvREFCNT_dec(data.last_found);
4260
1de06328
YO
4261 /* Note that code very similar to this but for anchored string
4262 follows immediately below, changes may need to be made to both.
4263 Be careful.
4264 */
a0ed51b3 4265 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4266 if (longest_float_length
c277df42
IZ
4267 || (data.flags & SF_FL_BEFORE_EOL
4268 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4269 || (RExC_flags & PMf_MULTILINE))))
4270 {
1182767e 4271 I32 t,ml;
cf93c79d 4272
1de06328 4273 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4274 && data.offset_fixed == data.offset_float_min
4275 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4276 goto remove_float; /* As in (a)+. */
4277
1de06328
YO
4278 /* copy the information about the longest float from the reg_scan_data
4279 over to the program. */
33b8afdf
JH
4280 if (SvUTF8(data.longest_float)) {
4281 r->float_utf8 = data.longest_float;
c445ea15 4282 r->float_substr = NULL;
33b8afdf
JH
4283 } else {
4284 r->float_substr = data.longest_float;
c445ea15 4285 r->float_utf8 = NULL;
33b8afdf 4286 }
1de06328
YO
4287 /* float_end_shift is how many chars that must be matched that
4288 follow this item. We calculate it ahead of time as once the
4289 lookbehind offset is added in we lose the ability to correctly
4290 calculate it.*/
4291 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4292 : (I32)longest_float_length;
1de06328
YO
4293 r->float_end_shift = ml - data.offset_float_min
4294 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4295 + data.lookbehind_float;
4296 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4297 r->float_max_offset = data.offset_float_max;
1182767e 4298 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4299 r->float_max_offset -= data.lookbehind_float;
4300
cf93c79d
IZ
4301 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4302 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4303 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4304 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4305 }
4306 else {
aca2d497 4307 remove_float:
c445ea15 4308 r->float_substr = r->float_utf8 = NULL;
c277df42 4309 SvREFCNT_dec(data.longest_float);
c5254dd6 4310 longest_float_length = 0;
a0d0e21e 4311 }
c277df42 4312
1de06328
YO
4313 /* Note that code very similar to this but for floating string
4314 is immediately above, changes may need to be made to both.
4315 Be careful.
4316 */
a0ed51b3 4317 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4318 if (longest_fixed_length
c277df42
IZ
4319 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4320 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4321 || (RExC_flags & PMf_MULTILINE))))
4322 {
1182767e 4323 I32 t,ml;
cf93c79d 4324
1de06328
YO
4325 /* copy the information about the longest fixed
4326 from the reg_scan_data over to the program. */
33b8afdf
JH
4327 if (SvUTF8(data.longest_fixed)) {
4328 r->anchored_utf8 = data.longest_fixed;
c445ea15 4329 r->anchored_substr = NULL;
33b8afdf
JH
4330 } else {
4331 r->anchored_substr = data.longest_fixed;
c445ea15 4332 r->anchored_utf8 = NULL;
33b8afdf 4333 }
1de06328
YO
4334 /* fixed_end_shift is how many chars that must be matched that
4335 follow this item. We calculate it ahead of time as once the
4336 lookbehind offset is added in we lose the ability to correctly
4337 calculate it.*/
4338 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4339 : (I32)longest_fixed_length;
1de06328
YO
4340 r->anchored_end_shift = ml - data.offset_fixed
4341 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4342 + data.lookbehind_fixed;
4343 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4344
cf93c79d
IZ
4345 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4346 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4347 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4348 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4349 }
4350 else {
c445ea15 4351 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4352 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4353 longest_fixed_length = 0;
a0d0e21e 4354 }
b81d288d 4355 if (r->regstclass
ffc61ed2 4356 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4357 r->regstclass = NULL;
33b8afdf
JH
4358 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4359 && stclass_flag
653099ff 4360 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4361 && !cl_is_anything(data.start_class))
4362 {
1df70142 4363 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4364
a02a5408 4365 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4366 struct regnode_charclass_class);
4367 StructCopy(data.start_class,
830247a4 4368 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4369 struct regnode_charclass_class);
830247a4 4370 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4371 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4372 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4373 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4374 PerlIO_printf(Perl_debug_log,
a0288114 4375 "synthetic stclass \"%s\".\n",
3f7c398e 4376 SvPVX_const(sv));});
653099ff 4377 }
c277df42
IZ
4378
4379 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4380 if (longest_fixed_length > longest_float_length) {
1de06328 4381 r->check_end_shift = r->anchored_end_shift;
c277df42 4382 r->check_substr = r->anchored_substr;
33b8afdf 4383 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4384 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4385 if (r->reganch & ROPT_ANCH_SINGLE)
4386 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4387 }
4388 else {
1de06328 4389 r->check_end_shift = r->float_end_shift;
c277df42 4390 r->check_substr = r->float_substr;
33b8afdf 4391 r->check_utf8 = r->float_utf8;
1de06328
YO
4392 r->check_offset_min = r->float_min_offset;
4393 r->check_offset_max = r->float_max_offset;
a0d0e21e 4394 }
30382c73
IZ
4395 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4396 This should be changed ASAP! */
33b8afdf 4397 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4398 r->reganch |= RE_USE_INTUIT;
33b8afdf 4399 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4400 r->reganch |= RE_INTUIT_TAIL;
4401 }
1de06328
YO
4402 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4403 if ( (STRLEN)minlen < longest_float_length )
4404 minlen= longest_float_length;
4405 if ( (STRLEN)minlen < longest_fixed_length )
4406 minlen= longest_fixed_length;
4407 */
a0ed51b3
LW
4408 }
4409 else {
c277df42
IZ
4410 /* Several toplevels. Best we can is to set minlen. */
4411 I32 fake;
653099ff 4412 struct regnode_charclass_class ch_class;
cb434fcc 4413 I32 last_close = 0;
c277df42 4414
5d458dd8 4415 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4416
c277df42 4417 scan = r->program + 1;
830247a4 4418 cl_init(pRExC_state, &ch_class);
653099ff 4419 data.start_class = &ch_class;
cb434fcc 4420 data.last_closep = &last_close;
07be1b83 4421
de8c5301 4422
1de06328 4423 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4424 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4425
786e8c11 4426 CHECK_RESTUDY_GOTO;
07be1b83 4427
33b8afdf 4428 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4429 = r->float_substr = r->float_utf8 = NULL;
653099ff 4430 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4431 && !cl_is_anything(data.start_class))
4432 {
1df70142 4433 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4434
a02a5408 4435 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4436 struct regnode_charclass_class);
4437 StructCopy(data.start_class,
830247a4 4438 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4439 struct regnode_charclass_class);
830247a4 4440 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4441 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4442 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4443 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4444 PerlIO_printf(Perl_debug_log,
a0288114 4445 "synthetic stclass \"%s\".\n",
3f7c398e 4446 SvPVX_const(sv));});
653099ff 4447 }
a0d0e21e
LW
4448 }
4449
1de06328
YO
4450 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4451 the "real" pattern. */
cf9788e3
RGS
4452 DEBUG_OPTIMISE_r({
4453 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
4454 minlen, r->minlen);
4455 });
de8c5301 4456 r->minlenret = minlen;
1de06328
YO
4457 if (r->minlen < minlen)
4458 r->minlen = minlen;
4459
b81d288d 4460 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4461 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4462 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4463 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4464 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4465 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4466 if (RExC_seen & REG_SEEN_CANY)
4467 r->reganch |= ROPT_CANY_SEEN;
e2e6a0f1
YO
4468 if (RExC_seen & REG_SEEN_VERBARG)
4469 r->reganch |= ROPT_VERBARG_SEEN;
5d458dd8
YO
4470 if (RExC_seen & REG_SEEN_CUTGROUP)
4471 r->reganch |= ROPT_CUTGROUP_SEEN;
81714fb9
YO
4472 if (RExC_paren_names)
4473 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4474 else
4475 r->paren_names = NULL;
4476
40d049e4
YO
4477 if (RExC_recurse_count) {
4478 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4479 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4480 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4481 }
4482 }
a02a5408
JC
4483 Newxz(r->startp, RExC_npar, I32);
4484 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4485
be8e71aa
YO
4486 DEBUG_DUMP_r({
4487 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4488 regdump(r);
4489 });
8e9a8a48
YO
4490 DEBUG_OFFSETS_r(if (r->offsets) {
4491 const U32 len = r->offsets[0];
4492 U32 i;
4493 GET_RE_DEBUG_FLAGS_DECL;
4494 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4495 for (i = 1; i <= len; i++) {
4496 if (r->offsets[i*2-1] || r->offsets[i*2])
4497 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4498 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4499 }
4500 PerlIO_printf(Perl_debug_log, "\n");
4501 });
a0d0e21e 4502 return(r);
f9f4320a 4503 END_BLOCK
a687059c
LW
4504}
4505
f9f4320a
YO
4506#undef CORE_ONLY_BLOCK
4507#undef END_BLOCK
4508#undef RE_ENGINE_PTR
3dab1dad 4509
9af228c6 4510#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4511SV*
4512Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4513{
4514 I32 parno = 0; /* no match */
4515 if (PL_curpm) {
4516 const REGEXP * const rx = PM_GETRE(PL_curpm);
4517 if (rx && rx->paren_names) {
4518 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4519 if (he_str) {
4520 IV i;
4521 SV* sv_dat=HeVAL(he_str);
4522 I32 *nums=(I32*)SvPVX(sv_dat);
4523 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4524 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4525 rx->endp[nums[i]] != -1)
4526 {
4527 parno = nums[i];
4528 break;
4529 }
4530 }
4531 }
4532 }
4533 }
4534 if ( !parno ) {
4535 return 0;
4536 } else {
4537 GV *gv_paren;
4538 SV *sv= sv_newmortal();
4539 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4540 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4541 return GvSVn(gv_paren);
4542 }
4543}
9af228c6 4544#endif
0a4db386 4545
894be9b7 4546/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4547 * If flags is REG_RSN_RETURN_NULL returns null.
4548 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4549 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4550 * to the parsed name as looked up in the RExC_paren_names hash.
4551 * If there is an error throws a vFAIL().. type exception.
894be9b7 4552 */
0a4db386
YO
4553
4554#define REG_RSN_RETURN_NULL 0
4555#define REG_RSN_RETURN_NAME 1
4556#define REG_RSN_RETURN_DATA 2
4557
894be9b7
YO
4558STATIC SV*
4559S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4560 char *name_start = RExC_parse;
0a4db386 4561 if ( UTF ) {
894be9b7 4562 STRLEN numlen;
0a4db386
YO
4563 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4564 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4565 {
4566 RExC_parse += numlen;
4567 }
4568 } else {
4569 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4570 RExC_parse++;
4571 }
0a4db386
YO
4572 if ( flags ) {
4573 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4574 (int)(RExC_parse - name_start)));
894be9b7 4575 if (UTF)
0a4db386
YO
4576 SvUTF8_on(sv_name);
4577 if ( flags == REG_RSN_RETURN_NAME)
4578 return sv_name;
4579 else if (flags==REG_RSN_RETURN_DATA) {
4580 HE *he_str = NULL;
4581 SV *sv_dat = NULL;
4582 if ( ! sv_name ) /* should not happen*/
4583 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4584 if (RExC_paren_names)
4585 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4586 if ( he_str )
4587 sv_dat = HeVAL(he_str);
4588 if ( ! sv_dat )
4589 vFAIL("Reference to nonexistent named group");
4590 return sv_dat;
4591 }
4592 else {
4593 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4594 }
4595 /* NOT REACHED */
894be9b7 4596 }
0a4db386 4597 return NULL;
894be9b7
YO
4598}
4599
3dab1dad
YO
4600#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4601 int rem=(int)(RExC_end - RExC_parse); \
4602 int cut; \
4603 int num; \
4604 int iscut=0; \
4605 if (rem>10) { \
4606 rem=10; \
4607 iscut=1; \
4608 } \
4609 cut=10-rem; \
4610 if (RExC_lastparse!=RExC_parse) \
4611 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4612 rem, RExC_parse, \
4613 cut + 4, \
4614 iscut ? "..." : "<" \
4615 ); \
4616 else \
4617 PerlIO_printf(Perl_debug_log,"%16s",""); \
4618 \
4619 if (SIZE_ONLY) \
4620 num=RExC_size; \
4621 else \
4622 num=REG_NODE_NUM(RExC_emit); \
4623 if (RExC_lastnum!=num) \
0a4db386 4624 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4625 else \
0a4db386 4626 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4627 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4628 (int)((depth*2)), "", \
3dab1dad
YO
4629 (funcname) \
4630 ); \
4631 RExC_lastnum=num; \
4632 RExC_lastparse=RExC_parse; \
4633})
4634
07be1b83
YO
4635
4636
3dab1dad
YO
4637#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4638 DEBUG_PARSE_MSG((funcname)); \
4639 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4640})
6bda09f9
YO
4641#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4642 DEBUG_PARSE_MSG((funcname)); \
4643 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4644})
a687059c
LW
4645/*
4646 - reg - regular expression, i.e. main body or parenthesized thing
4647 *
4648 * Caller must absorb opening parenthesis.
4649 *
4650 * Combining parenthesis handling with the base level of regular expression
4651 * is a trifle forced, but the need to tie the tails of the branches to what
4652 * follows makes it hard to avoid.
4653 */
07be1b83
YO
4654#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4655#ifdef DEBUGGING
4656#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4657#else
4658#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4659#endif
3dab1dad 4660
e2e6a0f1
YO
4661/* this idea is borrowed from STR_WITH_LEN in handy.h */
4662#define CHECK_WORD(s,v,l) \
4663 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4664
76e3520e 4665STATIC regnode *
3dab1dad 4666S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4667 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4668{
27da23d5 4669 dVAR;
c277df42
IZ
4670 register regnode *ret; /* Will be the head of the group. */
4671 register regnode *br;
4672 register regnode *lastbr;
cbbf8932 4673 register regnode *ender = NULL;
a0d0e21e 4674 register I32 parno = 0;
cbbf8932
AL
4675 I32 flags;
4676 const I32 oregflags = RExC_flags;
6136c704
AL
4677 bool have_branch = 0;
4678 bool is_open = 0;
9d1d55b5
JP
4679
4680 /* for (?g), (?gc), and (?o) warnings; warning
4681 about (?c) will warn about (?g) -- japhy */
4682
6136c704
AL
4683#define WASTED_O 0x01
4684#define WASTED_G 0x02
4685#define WASTED_C 0x04
4686#define WASTED_GC (0x02|0x04)
cbbf8932 4687 I32 wastedflags = 0x00;
9d1d55b5 4688
fac92740 4689 char * parse_start = RExC_parse; /* MJD */
a28509cc 4690 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4691
3dab1dad
YO
4692 GET_RE_DEBUG_FLAGS_DECL;
4693 DEBUG_PARSE("reg ");
4694
4695
821b33a5 4696 *flagp = 0; /* Tentatively. */
a0d0e21e 4697
9d1d55b5 4698
a0d0e21e
LW
4699 /* Make an OPEN node, if parenthesized. */
4700 if (paren) {
e2e6a0f1
YO
4701 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4702 char *start_verb = RExC_parse;
4703 STRLEN verb_len = 0;
4704 char *start_arg = NULL;
4705 unsigned char op = 0;
4706 int argok = 1;
4707 int internal_argval = 0; /* internal_argval is only useful if !argok */
4708 while ( *RExC_parse && *RExC_parse != ')' ) {
4709 if ( *RExC_parse == ':' ) {
4710 start_arg = RExC_parse + 1;
4711 break;
4712 }
4713 RExC_parse++;
4714 }
4715 ++start_verb;
4716 verb_len = RExC_parse - start_verb;
4717 if ( start_arg ) {
4718 RExC_parse++;
4719 while ( *RExC_parse && *RExC_parse != ')' )
4720 RExC_parse++;
4721 if ( *RExC_parse != ')' )
4722 vFAIL("Unterminated verb pattern argument");
4723 if ( RExC_parse == start_arg )
4724 start_arg = NULL;
4725 } else {
4726 if ( *RExC_parse != ')' )
4727 vFAIL("Unterminated verb pattern");
4728 }
5d458dd8 4729
e2e6a0f1
YO
4730 switch ( *start_verb ) {
4731 case 'A': /* (*ACCEPT) */
4732 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4733 op = ACCEPT;
4734 internal_argval = RExC_nestroot;
4735 }
4736 break;
4737 case 'C': /* (*COMMIT) */
4738 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4739 op = COMMIT;
e2e6a0f1
YO
4740 break;
4741 case 'F': /* (*FAIL) */
4742 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4743 op = OPFAIL;
4744 argok = 0;
4745 }
4746 break;
5d458dd8
YO
4747 case ':': /* (*:NAME) */
4748 case 'M': /* (*MARK:NAME) */
4749 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4750 op = MARKPOINT;
5d458dd8
YO
4751 argok = -1;
4752 }
4753 break;
4754 case 'P': /* (*PRUNE) */
4755 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4756 op = PRUNE;
e2e6a0f1 4757 break;
5d458dd8
YO
4758 case 'S': /* (*SKIP) */
4759 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4760 op = SKIP;
4761 break;
4762 case 'T': /* (*THEN) */
4763 /* [19:06] <TimToady> :: is then */
4764 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4765 op = CUTGROUP;
4766 RExC_seen |= REG_SEEN_CUTGROUP;
4767 }
e2e6a0f1
YO
4768 break;
4769 }
4770 if ( ! op ) {
4771 RExC_parse++;
4772 vFAIL3("Unknown verb pattern '%.*s'",
4773 verb_len, start_verb);
4774 }
4775 if ( argok ) {
4776 if ( start_arg && internal_argval ) {
4777 vFAIL3("Verb pattern '%.*s' may not have an argument",
4778 verb_len, start_verb);
4779 } else if ( argok < 0 && !start_arg ) {
4780 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
4781 verb_len, start_verb);
4782 } else {
4783 ret = reganode(pRExC_state, op, internal_argval);
4784 if ( ! internal_argval && ! SIZE_ONLY ) {
4785 if (start_arg) {
4786 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
4787 ARG(ret) = add_data( pRExC_state, 1, "S" );
4788 RExC_rx->data->data[ARG(ret)]=(void*)sv;
4789 ret->flags = 0;
4790 } else {
4791 ret->flags = 1;
4792 }
4793 }
4794 }
4795 if (!internal_argval)
4796 RExC_seen |= REG_SEEN_VERBARG;
4797 } else if ( start_arg ) {
4798 vFAIL3("Verb pattern '%.*s' may not have an argument",
4799 verb_len, start_verb);
4800 } else {
4801 ret = reg_node(pRExC_state, op);
4802 }
4803 nextchar(pRExC_state);
4804 return ret;
4805 } else
fac92740 4806 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4807 U32 posflags = 0, negflags = 0;
4808 U32 *flagsp = &posflags;
6136c704 4809 bool is_logical = 0;
a28509cc 4810 const char * const seqstart = RExC_parse;
ca9dfc88 4811
830247a4
IZ
4812 RExC_parse++;
4813 paren = *RExC_parse++;
c277df42 4814 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4815 switch (paren) {
894be9b7 4816
fac92740 4817 case '<': /* (?<...) */
b81d288d 4818 if (*RExC_parse == '!')
c277df42 4819 paren = ',';
0a4db386
YO
4820 else if (*RExC_parse != '=')
4821 { /* (?<...>) */
81714fb9 4822 char *name_start;
894be9b7 4823 SV *svname;
81714fb9
YO
4824 paren= '>';
4825 case '\'': /* (?'...') */
4826 name_start= RExC_parse;
0a4db386
YO
4827 svname = reg_scan_name(pRExC_state,
4828 SIZE_ONLY ? /* reverse test from the others */
4829 REG_RSN_RETURN_NAME :
4830 REG_RSN_RETURN_NULL);
81714fb9
YO
4831 if (RExC_parse == name_start)
4832 goto unknown;
4833 if (*RExC_parse != paren)
4834 vFAIL2("Sequence (?%c... not terminated",
4835 paren=='>' ? '<' : paren);
4836 if (SIZE_ONLY) {
e62cc96a
YO
4837 HE *he_str;
4838 SV *sv_dat = NULL;
894be9b7
YO
4839 if (!svname) /* shouldnt happen */
4840 Perl_croak(aTHX_
4841 "panic: reg_scan_name returned NULL");
81714fb9
YO
4842 if (!RExC_paren_names) {
4843 RExC_paren_names= newHV();
4844 sv_2mortal((SV*)RExC_paren_names);
4845 }
4846 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4847 if ( he_str )
81714fb9 4848 sv_dat = HeVAL(he_str);
e62cc96a 4849 if ( ! sv_dat ) {
81714fb9 4850 /* croak baby croak */
e62cc96a
YO
4851 Perl_croak(aTHX_
4852 "panic: paren_name hash element allocation failed");
4853 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4854 IV count=SvIV(sv_dat);
4855 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4856 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4857 pv[count]=RExC_npar;
4858 SvIVX(sv_dat)++;
4859 } else {
4860 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4861 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4862 SvIOK_on(sv_dat);
4863 SvIVX(sv_dat)= 1;
e62cc96a
YO
4864 }
4865
81714fb9
YO
4866 /*sv_dump(sv_dat);*/
4867 }
4868 nextchar(pRExC_state);
4869 paren = 1;
4870 goto capturing_parens;
4871 }
4872 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4873 RExC_parse++;
fac92740
MJD
4874 case '=': /* (?=...) */
4875 case '!': /* (?!...) */
830247a4 4876 RExC_seen_zerolen++;
e2e6a0f1
YO
4877 if (*RExC_parse == ')') {
4878 ret=reg_node(pRExC_state, OPFAIL);
4879 nextchar(pRExC_state);
4880 return ret;
4881 }
fac92740
MJD
4882 case ':': /* (?:...) */
4883 case '>': /* (?>...) */
a0d0e21e 4884 break;
fac92740
MJD
4885 case '$': /* (?$...) */
4886 case '@': /* (?@...) */
8615cb43 4887 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4888 break;
fac92740 4889 case '#': /* (?#...) */
830247a4
IZ
4890 while (*RExC_parse && *RExC_parse != ')')
4891 RExC_parse++;
4892 if (*RExC_parse != ')')
c277df42 4893 FAIL("Sequence (?#... not terminated");
830247a4 4894 nextchar(pRExC_state);
a0d0e21e
LW
4895 *flagp = TRYAGAIN;
4896 return NULL;
894be9b7
YO
4897 case '0' : /* (?0) */
4898 case 'R' : /* (?R) */
4899 if (*RExC_parse != ')')
6bda09f9 4900 FAIL("Sequence (?R) not terminated");
1a147d38 4901 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4902 nextchar(pRExC_state);
4903 return ret;
4904 /*notreached*/
894be9b7
YO
4905 { /* named and numeric backreferences */
4906 I32 num;
4907 char * parse_start;
4908 case '&': /* (?&NAME) */
4909 parse_start = RExC_parse - 1;
4910 {
0a4db386
YO
4911 SV *sv_dat = reg_scan_name(pRExC_state,
4912 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4913 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4914 }
4915 goto gen_recurse_regop;
4916 /* NOT REACHED */
542fa716
YO
4917 case '+':
4918 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4919 RExC_parse++;
4920 vFAIL("Illegal pattern");
4921 }
4922 goto parse_recursion;
4923 /* NOT REACHED*/
4924 case '-': /* (?-1) */
4925 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
4926 RExC_parse--; /* rewind to let it be handled later */
4927 goto parse_flags;
4928 }
4929 /*FALLTHROUGH */
6bda09f9
YO
4930 case '1': case '2': case '3': case '4': /* (?1) */
4931 case '5': case '6': case '7': case '8': case '9':
4932 RExC_parse--;
542fa716 4933 parse_recursion:
894be9b7
YO
4934 num = atoi(RExC_parse);
4935 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
4936 if (*RExC_parse == '-')
4937 RExC_parse++;
6bda09f9
YO
4938 while (isDIGIT(*RExC_parse))
4939 RExC_parse++;
4940 if (*RExC_parse!=')')
4941 vFAIL("Expecting close bracket");
894be9b7
YO
4942
4943 gen_recurse_regop:
542fa716
YO
4944 if ( paren == '-' ) {
4945 /*
4946 Diagram of capture buffer numbering.
4947 Top line is the normal capture buffer numbers
4948 Botton line is the negative indexing as from
4949 the X (the (?-2))
4950
4951 + 1 2 3 4 5 X 6 7
4952 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
4953 - 5 4 3 2 1 X x x
4954
4955 */
4956 num = RExC_npar + num;
4957 if (num < 1) {
4958 RExC_parse++;
4959 vFAIL("Reference to nonexistent group");
4960 }
4961 } else if ( paren == '+' ) {
4962 num = RExC_npar + num - 1;
4963 }
4964
1a147d38 4965 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4966 if (!SIZE_ONLY) {
4967 if (num > (I32)RExC_rx->nparens) {
4968 RExC_parse++;
4969 vFAIL("Reference to nonexistent group");
4970 }
40d049e4 4971 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4972 RExC_emit++;
226de585 4973 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4974 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4975 } else {
6bda09f9 4976 RExC_size++;
6bda09f9 4977 }
0a4db386 4978 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4979 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4980 Set_Node_Offset(ret, parse_start); /* MJD */
4981
6bda09f9
YO
4982 nextchar(pRExC_state);
4983 return ret;
894be9b7
YO
4984 } /* named and numeric backreferences */
4985 /* NOT REACHED */
4986
fac92740 4987 case 'p': /* (?p...) */
9014280d 4988 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4989 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4990 /* FALL THROUGH*/
fac92740 4991 case '?': /* (??...) */
6136c704 4992 is_logical = 1;
438a3801
YST
4993 if (*RExC_parse != '{')
4994 goto unknown;
830247a4 4995 paren = *RExC_parse++;
0f5d15d6 4996 /* FALL THROUGH */
fac92740 4997 case '{': /* (?{...}) */
c277df42 4998 {
c277df42
IZ
4999 I32 count = 1, n = 0;
5000 char c;
830247a4 5001 char *s = RExC_parse;
c277df42 5002
830247a4
IZ
5003 RExC_seen_zerolen++;
5004 RExC_seen |= REG_SEEN_EVAL;
5005 while (count && (c = *RExC_parse)) {
6136c704
AL
5006 if (c == '\\') {
5007 if (RExC_parse[1])
5008 RExC_parse++;
5009 }
b81d288d 5010 else if (c == '{')
c277df42 5011 count++;
b81d288d 5012 else if (c == '}')
c277df42 5013 count--;
830247a4 5014 RExC_parse++;
c277df42 5015 }
6136c704 5016 if (*RExC_parse != ')') {
b81d288d 5017 RExC_parse = s;
b45f050a
JF
5018 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5019 }
c277df42 5020 if (!SIZE_ONLY) {
f3548bdc 5021 PAD *pad;
6136c704
AL
5022 OP_4tree *sop, *rop;
5023 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5024
569233ed
SB
5025 ENTER;
5026 Perl_save_re_context(aTHX);
f3548bdc 5027 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5028 sop->op_private |= OPpREFCOUNTED;
5029 /* re_dup will OpREFCNT_inc */
5030 OpREFCNT_set(sop, 1);
569233ed 5031 LEAVE;
c277df42 5032
830247a4
IZ
5033 n = add_data(pRExC_state, 3, "nop");
5034 RExC_rx->data->data[n] = (void*)rop;
5035 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 5036 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 5037 SvREFCNT_dec(sv);
a0ed51b3 5038 }
e24b16f9 5039 else { /* First pass */
830247a4 5040 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5041 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5042 /* No compiled RE interpolated, has runtime
5043 components ===> unsafe. */
5044 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5045 if (PL_tainting && PL_tainted)
cc6b7395 5046 FAIL("Eval-group in insecure regular expression");
54df2634 5047#if PERL_VERSION > 8
923e4eb5 5048 if (IN_PERL_COMPILETIME)
b5c19bd7 5049 PL_cv_has_eval = 1;
54df2634 5050#endif
c277df42 5051 }
b5c19bd7 5052
830247a4 5053 nextchar(pRExC_state);
6136c704 5054 if (is_logical) {
830247a4 5055 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5056 if (!SIZE_ONLY)
5057 ret->flags = 2;
3dab1dad 5058 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5059 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5060 return ret;
5061 }
ccb2c380
MP
5062 ret = reganode(pRExC_state, EVAL, n);
5063 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5064 Set_Node_Offset(ret, parse_start);
5065 return ret;
c277df42 5066 }
fac92740 5067 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5068 {
0a4db386 5069 int is_define= 0;
fac92740 5070 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5071 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5072 || RExC_parse[1] == '<'
830247a4 5073 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5074 I32 flag;
5075
830247a4 5076 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5077 if (!SIZE_ONLY)
5078 ret->flags = 1;
3dab1dad 5079 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5080 goto insert_if;
b81d288d 5081 }
a0ed51b3 5082 }
0a4db386
YO
5083 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5084 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5085 {
5086 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5087 char *name_start= RExC_parse++;
5088 I32 num = 0;
5089 SV *sv_dat=reg_scan_name(pRExC_state,
5090 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5091 if (RExC_parse == name_start || *RExC_parse != ch)
5092 vFAIL2("Sequence (?(%c... not terminated",
5093 (ch == '>' ? '<' : ch));
5094 RExC_parse++;
5095 if (!SIZE_ONLY) {
5096 num = add_data( pRExC_state, 1, "S" );
5097 RExC_rx->data->data[num]=(void*)sv_dat;
5098 SvREFCNT_inc(sv_dat);
5099 }
5100 ret = reganode(pRExC_state,NGROUPP,num);
5101 goto insert_if_check_paren;
5102 }
5103 else if (RExC_parse[0] == 'D' &&
5104 RExC_parse[1] == 'E' &&
5105 RExC_parse[2] == 'F' &&
5106 RExC_parse[3] == 'I' &&
5107 RExC_parse[4] == 'N' &&
5108 RExC_parse[5] == 'E')
5109 {
5110 ret = reganode(pRExC_state,DEFINEP,0);
5111 RExC_parse +=6 ;
5112 is_define = 1;
5113 goto insert_if_check_paren;
5114 }
5115 else if (RExC_parse[0] == 'R') {
5116 RExC_parse++;
5117 parno = 0;
5118 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5119 parno = atoi(RExC_parse++);
5120 while (isDIGIT(*RExC_parse))
5121 RExC_parse++;
5122 } else if (RExC_parse[0] == '&') {
5123 SV *sv_dat;
5124 RExC_parse++;
5125 sv_dat = reg_scan_name(pRExC_state,
5126 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5127 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5128 }
1a147d38 5129 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5130 goto insert_if_check_paren;
5131 }
830247a4 5132 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5133 /* (?(1)...) */
6136c704 5134 char c;
830247a4 5135 parno = atoi(RExC_parse++);
c277df42 5136
830247a4
IZ
5137 while (isDIGIT(*RExC_parse))
5138 RExC_parse++;
fac92740 5139 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5140
0a4db386 5141 insert_if_check_paren:
830247a4 5142 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5143 vFAIL("Switch condition not recognized");
c277df42 5144 insert_if:
3dab1dad
YO
5145 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5146 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5147 if (br == NULL)
830247a4 5148 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5149 else
3dab1dad 5150 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5151 c = *nextchar(pRExC_state);
d1b80229
IZ
5152 if (flags&HASWIDTH)
5153 *flagp |= HASWIDTH;
c277df42 5154 if (c == '|') {
0a4db386
YO
5155 if (is_define)
5156 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5157 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5158 regbranch(pRExC_state, &flags, 1,depth+1);
5159 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5160 if (flags&HASWIDTH)
5161 *flagp |= HASWIDTH;
830247a4 5162 c = *nextchar(pRExC_state);
a0ed51b3
LW
5163 }
5164 else
c277df42
IZ
5165 lastbr = NULL;
5166 if (c != ')')
8615cb43 5167 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5168 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5169 REGTAIL(pRExC_state, br, ender);
c277df42 5170 if (lastbr) {
3dab1dad
YO
5171 REGTAIL(pRExC_state, lastbr, ender);
5172 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5173 }
5174 else
3dab1dad 5175 REGTAIL(pRExC_state, ret, ender);
c277df42 5176 return ret;
a0ed51b3
LW
5177 }
5178 else {
830247a4 5179 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5180 }
5181 }
1b1626e4 5182 case 0:
830247a4 5183 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5184 vFAIL("Sequence (? incomplete");
1b1626e4 5185 break;
a0d0e21e 5186 default:
830247a4 5187 --RExC_parse;
fac92740 5188 parse_flags: /* (?i) */
830247a4 5189 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5190 /* (?g), (?gc) and (?o) are useless here
5191 and must be globally applied -- japhy */
5192
5193 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5194 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5195 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5196 if (! (wastedflags & wflagbit) ) {
5197 wastedflags |= wflagbit;
5198 vWARN5(
5199 RExC_parse + 1,
5200 "Useless (%s%c) - %suse /%c modifier",
5201 flagsp == &negflags ? "?-" : "?",
5202 *RExC_parse,
5203 flagsp == &negflags ? "don't " : "",
5204 *RExC_parse
5205 );
5206 }
5207 }
5208 }
5209 else if (*RExC_parse == 'c') {
5210 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5211 if (! (wastedflags & WASTED_C) ) {
5212 wastedflags |= WASTED_GC;
9d1d55b5
JP
5213 vWARN3(
5214 RExC_parse + 1,
5215 "Useless (%sc) - %suse /gc modifier",
5216 flagsp == &negflags ? "?-" : "?",
5217 flagsp == &negflags ? "don't " : ""
5218 );
5219 }
5220 }
5221 }
5222 else { pmflag(flagsp, *RExC_parse); }
5223
830247a4 5224 ++RExC_parse;
ca9dfc88 5225 }
830247a4 5226 if (*RExC_parse == '-') {
ca9dfc88 5227 flagsp = &negflags;
9d1d55b5 5228 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5229 ++RExC_parse;
ca9dfc88 5230 goto parse_flags;
48c036b1 5231 }
e2509266
JH
5232 RExC_flags |= posflags;
5233 RExC_flags &= ~negflags;
830247a4
IZ
5234 if (*RExC_parse == ':') {
5235 RExC_parse++;
ca9dfc88
IZ
5236 paren = ':';
5237 break;
5238 }
c277df42 5239 unknown:
830247a4
IZ
5240 if (*RExC_parse != ')') {
5241 RExC_parse++;
5242 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5243 }
830247a4 5244 nextchar(pRExC_state);
a0d0e21e
LW
5245 *flagp = TRYAGAIN;
5246 return NULL;
5247 }
5248 }
fac92740 5249 else { /* (...) */
81714fb9 5250 capturing_parens:
830247a4
IZ
5251 parno = RExC_npar;
5252 RExC_npar++;
e2e6a0f1 5253
830247a4 5254 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5255 if (!SIZE_ONLY ){
5256 if (!RExC_nestroot)
5257 RExC_nestroot = parno;
5258 if (RExC_seen & REG_SEEN_RECURSE) {
5259 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5260 "Setting open paren #%"IVdf" to %d\n",
5261 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5262 RExC_open_parens[parno-1]= ret;
5263 }
6bda09f9 5264 }
fac92740
MJD
5265 Set_Node_Length(ret, 1); /* MJD */
5266 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5267 is_open = 1;
a0d0e21e 5268 }
a0ed51b3 5269 }
fac92740 5270 else /* ! paren */
a0d0e21e
LW
5271 ret = NULL;
5272
5273 /* Pick up the branches, linking them together. */
fac92740 5274 parse_start = RExC_parse; /* MJD */
3dab1dad 5275 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5276 /* branch_len = (paren != 0); */
2af232bd 5277
a0d0e21e
LW
5278 if (br == NULL)
5279 return(NULL);
830247a4
IZ
5280 if (*RExC_parse == '|') {
5281 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5282 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5283 }
fac92740 5284 else { /* MJD */
6bda09f9 5285 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5286 Set_Node_Length(br, paren != 0);
5287 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5288 }
c277df42
IZ
5289 have_branch = 1;
5290 if (SIZE_ONLY)
830247a4 5291 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5292 }
5293 else if (paren == ':') {
c277df42
IZ
5294 *flagp |= flags&SIMPLE;
5295 }
6136c704 5296 if (is_open) { /* Starts with OPEN. */
3dab1dad 5297 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5298 }
5299 else if (paren != '?') /* Not Conditional */
a0d0e21e 5300 ret = br;
32a0ca98 5301 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5302 lastbr = br;
830247a4
IZ
5303 while (*RExC_parse == '|') {
5304 if (!SIZE_ONLY && RExC_extralen) {
5305 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5306 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5307 }
5308 if (SIZE_ONLY)
830247a4
IZ
5309 RExC_extralen += 2; /* Account for LONGJMP. */
5310 nextchar(pRExC_state);
3dab1dad 5311 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5312
a687059c 5313 if (br == NULL)
a0d0e21e 5314 return(NULL);
3dab1dad 5315 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5316 lastbr = br;
821b33a5
IZ
5317 if (flags&HASWIDTH)
5318 *flagp |= HASWIDTH;
a687059c 5319 *flagp |= flags&SPSTART;
a0d0e21e
LW
5320 }
5321
c277df42
IZ
5322 if (have_branch || paren != ':') {
5323 /* Make a closing node, and hook it on the end. */
5324 switch (paren) {
5325 case ':':
830247a4 5326 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5327 break;
5328 case 1:
830247a4 5329 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5330 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5331 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5332 "Setting close paren #%"IVdf" to %d\n",
5333 (IV)parno, REG_NODE_NUM(ender)));
5334 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5335 if (RExC_nestroot == parno)
5336 RExC_nestroot = 0;
40d049e4 5337 }
fac92740
MJD
5338 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5339 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5340 break;
5341 case '<':
c277df42
IZ
5342 case ',':
5343 case '=':
5344 case '!':
c277df42 5345 *flagp &= ~HASWIDTH;
821b33a5
IZ
5346 /* FALL THROUGH */
5347 case '>':
830247a4 5348 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5349 break;
5350 case 0:
830247a4 5351 ender = reg_node(pRExC_state, END);
40d049e4
YO
5352 if (!SIZE_ONLY) {
5353 assert(!RExC_opend); /* there can only be one! */
5354 RExC_opend = ender;
5355 }
c277df42
IZ
5356 break;
5357 }
eaf3ca90 5358 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5359
9674d46a 5360 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5361 if (depth==1)
5362 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5363
c277df42 5364 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5365 for (br = ret; br; br = regnext(br)) {
5366 const U8 op = PL_regkind[OP(br)];
5367 if (op == BRANCH) {
07be1b83 5368 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5369 }
5370 else if (op == BRANCHJ) {
07be1b83 5371 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5372 }
c277df42
IZ
5373 }
5374 }
a0d0e21e 5375 }
c277df42
IZ
5376
5377 {
e1ec3a88
AL
5378 const char *p;
5379 static const char parens[] = "=!<,>";
c277df42
IZ
5380
5381 if (paren && (p = strchr(parens, paren))) {
eb160463 5382 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5383 int flag = (p - parens) > 1;
5384
5385 if (paren == '>')
5386 node = SUSPEND, flag = 0;
6bda09f9 5387 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5388 Set_Node_Cur_Length(ret);
5389 Set_Node_Offset(ret, parse_start + 1);
c277df42 5390 ret->flags = flag;
07be1b83 5391 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5392 }
a0d0e21e
LW
5393 }
5394
5395 /* Check for proper termination. */
ce3e6498 5396 if (paren) {
e2509266 5397 RExC_flags = oregflags;
830247a4
IZ
5398 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5399 RExC_parse = oregcomp_parse;
380a0633 5400 vFAIL("Unmatched (");
ce3e6498 5401 }
a0ed51b3 5402 }
830247a4
IZ
5403 else if (!paren && RExC_parse < RExC_end) {
5404 if (*RExC_parse == ')') {
5405 RExC_parse++;
380a0633 5406 vFAIL("Unmatched )");
a0ed51b3
LW
5407 }
5408 else
b45f050a 5409 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5410 /* NOTREACHED */
5411 }
a687059c 5412
a0d0e21e 5413 return(ret);
a687059c
LW
5414}
5415
5416/*
5417 - regbranch - one alternative of an | operator
5418 *
5419 * Implements the concatenation operator.
5420 */
76e3520e 5421STATIC regnode *
3dab1dad 5422S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5423{
97aff369 5424 dVAR;
c277df42
IZ
5425 register regnode *ret;
5426 register regnode *chain = NULL;
5427 register regnode *latest;
5428 I32 flags = 0, c = 0;
3dab1dad
YO
5429 GET_RE_DEBUG_FLAGS_DECL;
5430 DEBUG_PARSE("brnc");
b81d288d 5431 if (first)
c277df42
IZ
5432 ret = NULL;
5433 else {
b81d288d 5434 if (!SIZE_ONLY && RExC_extralen)
830247a4 5435 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5436 else {
830247a4 5437 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5438 Set_Node_Length(ret, 1);
5439 }
c277df42
IZ
5440 }
5441
b81d288d 5442 if (!first && SIZE_ONLY)
830247a4 5443 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5444
c277df42 5445 *flagp = WORST; /* Tentatively. */
a0d0e21e 5446
830247a4
IZ
5447 RExC_parse--;
5448 nextchar(pRExC_state);
5449 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5450 flags &= ~TRYAGAIN;
3dab1dad 5451 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5452 if (latest == NULL) {
5453 if (flags & TRYAGAIN)
5454 continue;
5455 return(NULL);
a0ed51b3
LW
5456 }
5457 else if (ret == NULL)
c277df42 5458 ret = latest;
a0d0e21e 5459 *flagp |= flags&HASWIDTH;
c277df42 5460 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5461 *flagp |= flags&SPSTART;
5462 else {
830247a4 5463 RExC_naughty++;
3dab1dad 5464 REGTAIL(pRExC_state, chain, latest);
a687059c 5465 }
a0d0e21e 5466 chain = latest;
c277df42
IZ
5467 c++;
5468 }
5469 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5470 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5471 if (ret == NULL)
5472 ret = chain;
5473 }
5474 if (c == 1) {
5475 *flagp |= flags&SIMPLE;
a0d0e21e 5476 }
a687059c 5477
d4c19fe8 5478 return ret;
a687059c
LW
5479}
5480
5481/*
5482 - regpiece - something followed by possible [*+?]
5483 *
5484 * Note that the branching code sequences used for ? and the general cases
5485 * of * and + are somewhat optimized: they use the same NOTHING node as
5486 * both the endmarker for their branch list and the body of the last branch.
5487 * It might seem that this node could be dispensed with entirely, but the
5488 * endmarker role is not redundant.
5489 */
76e3520e 5490STATIC regnode *
3dab1dad 5491S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5492{
97aff369 5493 dVAR;
c277df42 5494 register regnode *ret;
a0d0e21e
LW
5495 register char op;
5496 register char *next;
5497 I32 flags;
1df70142 5498 const char * const origparse = RExC_parse;
a0d0e21e 5499 I32 min;
c277df42 5500 I32 max = REG_INFTY;
fac92740 5501 char *parse_start;
10edeb5d 5502 const char *maxpos = NULL;
3dab1dad
YO
5503 GET_RE_DEBUG_FLAGS_DECL;
5504 DEBUG_PARSE("piec");
a0d0e21e 5505
3dab1dad 5506 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5507 if (ret == NULL) {
5508 if (flags & TRYAGAIN)
5509 *flagp |= TRYAGAIN;
5510 return(NULL);
5511 }
5512
830247a4 5513 op = *RExC_parse;
a0d0e21e 5514
830247a4 5515 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5516 maxpos = NULL;
fac92740 5517 parse_start = RExC_parse; /* MJD */
830247a4 5518 next = RExC_parse + 1;
a0d0e21e
LW
5519 while (isDIGIT(*next) || *next == ',') {
5520 if (*next == ',') {
5521 if (maxpos)
5522 break;
5523 else
5524 maxpos = next;
a687059c 5525 }
a0d0e21e
LW
5526 next++;
5527 }
5528 if (*next == '}') { /* got one */
5529 if (!maxpos)
5530 maxpos = next;
830247a4
IZ
5531 RExC_parse++;
5532 min = atoi(RExC_parse);
a0d0e21e
LW
5533 if (*maxpos == ',')
5534 maxpos++;
5535 else
830247a4 5536 maxpos = RExC_parse;
a0d0e21e
LW
5537 max = atoi(maxpos);
5538 if (!max && *maxpos != '0')
c277df42
IZ
5539 max = REG_INFTY; /* meaning "infinity" */
5540 else if (max >= REG_INFTY)
8615cb43 5541 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5542 RExC_parse = next;
5543 nextchar(pRExC_state);
a0d0e21e
LW
5544
5545 do_curly:
5546 if ((flags&SIMPLE)) {
830247a4 5547 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5548 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5549 Set_Node_Offset(ret, parse_start+1); /* MJD */
5550 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5551 }
5552 else {
3dab1dad 5553 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5554
5555 w->flags = 0;
3dab1dad 5556 REGTAIL(pRExC_state, ret, w);
830247a4 5557 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5558 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5559 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5560 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5561 }
6bda09f9 5562 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5563 /* MJD hk */
5564 Set_Node_Offset(ret, parse_start+1);
2af232bd 5565 Set_Node_Length(ret,
fac92740 5566 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5567
830247a4 5568 if (!SIZE_ONLY && RExC_extralen)
c277df42 5569 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5570 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5571 if (SIZE_ONLY)
830247a4
IZ
5572 RExC_whilem_seen++, RExC_extralen += 3;
5573 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5574 }
c277df42 5575 ret->flags = 0;
a0d0e21e
LW
5576
5577 if (min > 0)
821b33a5
IZ
5578 *flagp = WORST;
5579 if (max > 0)
5580 *flagp |= HASWIDTH;
a0d0e21e 5581 if (max && max < min)
8615cb43 5582 vFAIL("Can't do {n,m} with n > m");
c277df42 5583 if (!SIZE_ONLY) {
eb160463
GS
5584 ARG1_SET(ret, (U16)min);
5585 ARG2_SET(ret, (U16)max);
a687059c 5586 }
a687059c 5587
a0d0e21e 5588 goto nest_check;
a687059c 5589 }
a0d0e21e 5590 }
a687059c 5591
a0d0e21e
LW
5592 if (!ISMULT1(op)) {
5593 *flagp = flags;
a687059c 5594 return(ret);
a0d0e21e 5595 }
bb20fd44 5596
c277df42 5597#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5598
5599 /* if this is reinstated, don't forget to put this back into perldiag:
5600
5601 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5602
5603 (F) The part of the regexp subject to either the * or + quantifier
5604 could match an empty string. The {#} shows in the regular
5605 expression about where the problem was discovered.
5606
5607 */
5608
bb20fd44 5609 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5610 vFAIL("Regexp *+ operand could be empty");
b81d288d 5611#endif
bb20fd44 5612
fac92740 5613 parse_start = RExC_parse;
830247a4 5614 nextchar(pRExC_state);
a0d0e21e 5615
821b33a5 5616 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5617
5618 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5619 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5620 ret->flags = 0;
830247a4 5621 RExC_naughty += 4;
a0d0e21e
LW
5622 }
5623 else if (op == '*') {
5624 min = 0;
5625 goto do_curly;
a0ed51b3
LW
5626 }
5627 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5628 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5629 ret->flags = 0;
830247a4 5630 RExC_naughty += 3;
a0d0e21e
LW
5631 }
5632 else if (op == '+') {
5633 min = 1;
5634 goto do_curly;
a0ed51b3
LW
5635 }
5636 else if (op == '?') {
a0d0e21e
LW
5637 min = 0; max = 1;
5638 goto do_curly;
5639 }
5640 nest_check:
041457d9 5641 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5642 vWARN3(RExC_parse,
b45f050a 5643 "%.*s matches null string many times",
afd78fd5 5644 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5645 origparse);
a0d0e21e
LW
5646 }
5647
b9b4dddf 5648 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5649 nextchar(pRExC_state);
6bda09f9 5650 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5651 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5652 }
b9b4dddf
YO
5653#ifndef REG_ALLOW_MINMOD_SUSPEND
5654 else
5655#endif
5656 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5657 regnode *ender;
5658 nextchar(pRExC_state);
5659 ender = reg_node(pRExC_state, SUCCEED);
5660 REGTAIL(pRExC_state, ret, ender);
5661 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5662 ret->flags = 0;
5663 ender = reg_node(pRExC_state, TAIL);
5664 REGTAIL(pRExC_state, ret, ender);
5665 /*ret= ender;*/
5666 }
5667
5668 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5669 RExC_parse++;
b45f050a
JF
5670 vFAIL("Nested quantifiers");
5671 }
a0d0e21e
LW
5672
5673 return(ret);
a687059c
LW
5674}
5675
fc8cd66c
YO
5676
5677/* reg_namedseq(pRExC_state,UVp)
5678
5679 This is expected to be called by a parser routine that has
5680 recognized'\N' and needs to handle the rest. RExC_parse is
5681 expected to point at the first char following the N at the time
5682 of the call.
5683
5684 If valuep is non-null then it is assumed that we are parsing inside
5685 of a charclass definition and the first codepoint in the resolved
5686 string is returned via *valuep and the routine will return NULL.
5687 In this mode if a multichar string is returned from the charnames
5688 handler a warning will be issued, and only the first char in the
5689 sequence will be examined. If the string returned is zero length
5690 then the value of *valuep is undefined and NON-NULL will
5691 be returned to indicate failure. (This will NOT be a valid pointer
5692 to a regnode.)
5693
5694 If value is null then it is assumed that we are parsing normal text
5695 and inserts a new EXACT node into the program containing the resolved
5696 string and returns a pointer to the new node. If the string is
5697 zerolength a NOTHING node is emitted.
5698
5699 On success RExC_parse is set to the char following the endbrace.
5700 Parsing failures will generate a fatal errorvia vFAIL(...)
5701
5702 NOTE: We cache all results from the charnames handler locally in
5703 the RExC_charnames hash (created on first use) to prevent a charnames
5704 handler from playing silly-buggers and returning a short string and
5705 then a long string for a given pattern. Since the regexp program
5706 size is calculated during an initial parse this would result
5707 in a buffer overrun so we cache to prevent the charname result from
5708 changing during the course of the parse.
5709
5710 */
5711STATIC regnode *
5712S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5713{
5714 char * name; /* start of the content of the name */
5715 char * endbrace; /* endbrace following the name */
5716 SV *sv_str = NULL;
5717 SV *sv_name = NULL;
5718 STRLEN len; /* this has various purposes throughout the code */
5719 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5720 regnode *ret = NULL;
5721
5722 if (*RExC_parse != '{') {
5723 vFAIL("Missing braces on \\N{}");
5724 }
5725 name = RExC_parse+1;
5726 endbrace = strchr(RExC_parse, '}');
5727 if ( ! endbrace ) {
5728 RExC_parse++;
5729 vFAIL("Missing right brace on \\N{}");
5730 }
5731 RExC_parse = endbrace + 1;
5732
5733
5734 /* RExC_parse points at the beginning brace,
5735 endbrace points at the last */
5736 if ( name[0]=='U' && name[1]=='+' ) {
5737 /* its a "unicode hex" notation {U+89AB} */
5738 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5739 | PERL_SCAN_DISALLOW_PREFIX
5740 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5741 UV cp;
196f1508 5742 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5743 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5744 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5745 cp = 0xFFFD;
5746 }
5747 if (cp > 0xff)
5748 RExC_utf8 = 1;
5749 if ( valuep ) {
5750 *valuep = cp;
5751 return NULL;
5752 }
5753 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5754 } else {
5755 /* fetch the charnames handler for this scope */
5756 HV * const table = GvHV(PL_hintgv);
5757 SV **cvp= table ?
5758 hv_fetchs(table, "charnames", FALSE) :
5759 NULL;
5760 SV *cv= cvp ? *cvp : NULL;
5761 HE *he_str;
5762 int count;
5763 /* create an SV with the name as argument */
5764 sv_name = newSVpvn(name, endbrace - name);
5765
5766 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5767 vFAIL2("Constant(\\N{%s}) unknown: "
5768 "(possibly a missing \"use charnames ...\")",
5769 SvPVX(sv_name));
5770 }
5771 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5772 vFAIL2("Constant(\\N{%s}): "
5773 "$^H{charnames} is not defined",SvPVX(sv_name));
5774 }
5775
5776
5777
5778 if (!RExC_charnames) {
5779 /* make sure our cache is allocated */
5780 RExC_charnames = newHV();
6bda09f9 5781 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5782 }
5783 /* see if we have looked this one up before */
5784 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5785 if ( he_str ) {
5786 sv_str = HeVAL(he_str);
5787 cached = 1;
5788 } else {
5789 dSP ;
5790
5791 ENTER ;
5792 SAVETMPS ;
5793 PUSHMARK(SP) ;
5794
5795 XPUSHs(sv_name);
5796
5797 PUTBACK ;
5798
5799 count= call_sv(cv, G_SCALAR);
5800
5801 if (count == 1) { /* XXXX is this right? dmq */
5802 sv_str = POPs;
5803 SvREFCNT_inc_simple_void(sv_str);
5804 }
5805
5806 SPAGAIN ;
5807 PUTBACK ;
5808 FREETMPS ;
5809 LEAVE ;
5810
5811 if ( !sv_str || !SvOK(sv_str) ) {
5812 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5813 "did not return a defined value",SvPVX(sv_name));
5814 }
5815 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5816 cached = 1;
5817 }
5818 }
5819 if (valuep) {
5820 char *p = SvPV(sv_str, len);
5821 if (len) {
5822 STRLEN numlen = 1;
5823 if ( SvUTF8(sv_str) ) {
196f1508 5824 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5825 if (*valuep > 0x7F)
5826 RExC_utf8 = 1;
5827 /* XXXX
5828 We have to turn on utf8 for high bit chars otherwise
5829 we get failures with
5830
5831 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5832 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5833
5834 This is different from what \x{} would do with the same
5835 codepoint, where the condition is > 0xFF.
5836 - dmq
5837 */
5838
5839
5840 } else {
5841 *valuep = (UV)*p;
5842 /* warn if we havent used the whole string? */
5843 }
5844 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5845 vWARN2(RExC_parse,
5846 "Ignoring excess chars from \\N{%s} in character class",
5847 SvPVX(sv_name)
5848 );
5849 }
5850 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5851 vWARN2(RExC_parse,
5852 "Ignoring zero length \\N{%s} in character class",
5853 SvPVX(sv_name)
5854 );
5855 }
5856 if (sv_name)
5857 SvREFCNT_dec(sv_name);
5858 if (!cached)
5859 SvREFCNT_dec(sv_str);
5860 return len ? NULL : (regnode *)&len;
5861 } else if(SvCUR(sv_str)) {
5862
5863 char *s;
5864 char *p, *pend;
5865 STRLEN charlen = 1;
5866 char * parse_start = name-3; /* needed for the offsets */
5867 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5868
5869 ret = reg_node(pRExC_state,
5870 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5871 s= STRING(ret);
5872
5873 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5874 sv_utf8_upgrade(sv_str);
5875 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5876 RExC_utf8= 1;
5877 }
5878
5879 p = SvPV(sv_str, len);
5880 pend = p + len;
5881 /* len is the length written, charlen is the size the char read */
5882 for ( len = 0; p < pend; p += charlen ) {
5883 if (UTF) {
196f1508 5884 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5885 if (FOLD) {
5886 STRLEN foldlen,numlen;
5887 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5888 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5889 /* Emit all the Unicode characters. */
5890
5891 for (foldbuf = tmpbuf;
5892 foldlen;
5893 foldlen -= numlen)
5894 {
5895 uvc = utf8_to_uvchr(foldbuf, &numlen);
5896 if (numlen > 0) {
5897 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5898 s += unilen;
5899 len += unilen;
5900 /* In EBCDIC the numlen
5901 * and unilen can differ. */
5902 foldbuf += numlen;
5903 if (numlen >= foldlen)
5904 break;
5905 }
5906 else
5907 break; /* "Can't happen." */
5908 }
5909 } else {
5910 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5911 if (unilen > 0) {
5912 s += unilen;
5913 len += unilen;
5914 }
5915 }
5916 } else {
5917 len++;
5918 REGC(*p, s++);
5919 }
5920 }
5921 if (SIZE_ONLY) {
5922 RExC_size += STR_SZ(len);
5923 } else {
5924 STR_LEN(ret) = len;
5925 RExC_emit += STR_SZ(len);
5926 }
5927 Set_Node_Cur_Length(ret); /* MJD */
5928 RExC_parse--;
5929 nextchar(pRExC_state);
5930 } else {
5931 ret = reg_node(pRExC_state,NOTHING);
5932 }
5933 if (!cached) {
5934 SvREFCNT_dec(sv_str);
5935 }
5936 if (sv_name) {
5937 SvREFCNT_dec(sv_name);
5938 }
5939 return ret;
5940
5941}
5942
5943
9e08bc66
TS
5944/*
5945 * reg_recode
5946 *
5947 * It returns the code point in utf8 for the value in *encp.
5948 * value: a code value in the source encoding
5949 * encp: a pointer to an Encode object
5950 *
5951 * If the result from Encode is not a single character,
5952 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5953 */
5954STATIC UV
5955S_reg_recode(pTHX_ const char value, SV **encp)
5956{
5957 STRLEN numlen = 1;
5958 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5959 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5960 : SvPVX(sv);
5961 const STRLEN newlen = SvCUR(sv);
5962 UV uv = UNICODE_REPLACEMENT;
5963
5964 if (newlen)
5965 uv = SvUTF8(sv)
5966 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5967 : *(U8*)s;
5968
5969 if (!newlen || numlen != newlen) {
5970 uv = UNICODE_REPLACEMENT;
5971 if (encp)
5972 *encp = NULL;
5973 }
5974 return uv;
5975}
5976
fc8cd66c 5977
a687059c
LW
5978/*
5979 - regatom - the lowest level
5980 *
5981 * Optimization: gobbles an entire sequence of ordinary characters so that
5982 * it can turn them into a single node, which is smaller to store and
5983 * faster to run. Backslashed characters are exceptions, each becoming a
5984 * separate node; the code is simpler that way and it's not worth fixing.
5985 *
7f6f358c
YO
5986 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5987 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5988 */
76e3520e 5989STATIC regnode *
3dab1dad 5990S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5991{
97aff369 5992 dVAR;
cbbf8932 5993 register regnode *ret = NULL;
a0d0e21e 5994 I32 flags;
45948336 5995 char *parse_start = RExC_parse;
3dab1dad
YO
5996 GET_RE_DEBUG_FLAGS_DECL;
5997 DEBUG_PARSE("atom");
a0d0e21e
LW
5998 *flagp = WORST; /* Tentatively. */
5999
6000tryagain:
830247a4 6001 switch (*RExC_parse) {
a0d0e21e 6002 case '^':
830247a4
IZ
6003 RExC_seen_zerolen++;
6004 nextchar(pRExC_state);
e2509266 6005 if (RExC_flags & PMf_MULTILINE)
830247a4 6006 ret = reg_node(pRExC_state, MBOL);
e2509266 6007 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6008 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6009 else
830247a4 6010 ret = reg_node(pRExC_state, BOL);
fac92740 6011 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6012 break;
6013 case '$':
830247a4 6014 nextchar(pRExC_state);
b81d288d 6015 if (*RExC_parse)
830247a4 6016 RExC_seen_zerolen++;
e2509266 6017 if (RExC_flags & PMf_MULTILINE)
830247a4 6018 ret = reg_node(pRExC_state, MEOL);
e2509266 6019 else if (RExC_flags & PMf_SINGLELINE)
830247a4 6020 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6021 else
830247a4 6022 ret = reg_node(pRExC_state, EOL);
fac92740 6023 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6024 break;
6025 case '.':
830247a4 6026 nextchar(pRExC_state);
e2509266 6027 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
6028 ret = reg_node(pRExC_state, SANY);
6029 else
6030 ret = reg_node(pRExC_state, REG_ANY);
6031 *flagp |= HASWIDTH|SIMPLE;
830247a4 6032 RExC_naughty++;
fac92740 6033 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6034 break;
6035 case '[':
b45f050a 6036 {
3dab1dad
YO
6037 char * const oregcomp_parse = ++RExC_parse;
6038 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6039 if (*RExC_parse != ']') {
6040 RExC_parse = oregcomp_parse;
b45f050a
JF
6041 vFAIL("Unmatched [");
6042 }
830247a4 6043 nextchar(pRExC_state);
a0d0e21e 6044 *flagp |= HASWIDTH|SIMPLE;
fac92740 6045 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6046 break;
b45f050a 6047 }
a0d0e21e 6048 case '(':
830247a4 6049 nextchar(pRExC_state);
3dab1dad 6050 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6051 if (ret == NULL) {
bf93d4cc 6052 if (flags & TRYAGAIN) {
830247a4 6053 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6054 /* Make parent create an empty node if needed. */
6055 *flagp |= TRYAGAIN;
6056 return(NULL);
6057 }
a0d0e21e 6058 goto tryagain;
bf93d4cc 6059 }
a0d0e21e
LW
6060 return(NULL);
6061 }
c277df42 6062 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6063 break;
6064 case '|':
6065 case ')':
6066 if (flags & TRYAGAIN) {
6067 *flagp |= TRYAGAIN;
6068 return NULL;
6069 }
b45f050a 6070 vFAIL("Internal urp");
a0d0e21e
LW
6071 /* Supposed to be caught earlier. */
6072 break;
85afd4ae 6073 case '{':
830247a4
IZ
6074 if (!regcurly(RExC_parse)) {
6075 RExC_parse++;
85afd4ae
CS
6076 goto defchar;
6077 }
6078 /* FALL THROUGH */
a0d0e21e
LW
6079 case '?':
6080 case '+':
6081 case '*':
830247a4 6082 RExC_parse++;
b45f050a 6083 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6084 break;
6085 case '\\':
830247a4 6086 switch (*++RExC_parse) {
a0d0e21e 6087 case 'A':
830247a4
IZ
6088 RExC_seen_zerolen++;
6089 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6090 *flagp |= SIMPLE;
830247a4 6091 nextchar(pRExC_state);
fac92740 6092 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6093 break;
6094 case 'G':
830247a4
IZ
6095 ret = reg_node(pRExC_state, GPOS);
6096 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6097 *flagp |= SIMPLE;
830247a4 6098 nextchar(pRExC_state);
fac92740 6099 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6100 break;
6101 case 'Z':
830247a4 6102 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6103 *flagp |= SIMPLE;
a1917ab9 6104 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 6105 nextchar(pRExC_state);
a0d0e21e 6106 break;
b85d18e9 6107 case 'z':
830247a4 6108 ret = reg_node(pRExC_state, EOS);
b85d18e9 6109 *flagp |= SIMPLE;
830247a4
IZ
6110 RExC_seen_zerolen++; /* Do not optimize RE away */
6111 nextchar(pRExC_state);
fac92740 6112 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 6113 break;
4a2d328f 6114 case 'C':
f33976b4
DB
6115 ret = reg_node(pRExC_state, CANY);
6116 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6117 *flagp |= HASWIDTH|SIMPLE;
830247a4 6118 nextchar(pRExC_state);
fac92740 6119 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
6120 break;
6121 case 'X':
830247a4 6122 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6123 *flagp |= HASWIDTH;
830247a4 6124 nextchar(pRExC_state);
fac92740 6125 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 6126 break;
a0d0e21e 6127 case 'w':
eb160463 6128 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6129 *flagp |= HASWIDTH|SIMPLE;
830247a4 6130 nextchar(pRExC_state);
fac92740 6131 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6132 break;
6133 case 'W':
eb160463 6134 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6135 *flagp |= HASWIDTH|SIMPLE;
830247a4 6136 nextchar(pRExC_state);
fac92740 6137 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6138 break;
6139 case 'b':
830247a4
IZ
6140 RExC_seen_zerolen++;
6141 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6142 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6143 *flagp |= SIMPLE;
830247a4 6144 nextchar(pRExC_state);
fac92740 6145 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6146 break;
6147 case 'B':
830247a4
IZ
6148 RExC_seen_zerolen++;
6149 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6150 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6151 *flagp |= SIMPLE;
830247a4 6152 nextchar(pRExC_state);
fac92740 6153 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6154 break;
6155 case 's':
eb160463 6156 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6157 *flagp |= HASWIDTH|SIMPLE;
830247a4 6158 nextchar(pRExC_state);
fac92740 6159 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6160 break;
6161 case 'S':
eb160463 6162 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6163 *flagp |= HASWIDTH|SIMPLE;
830247a4 6164 nextchar(pRExC_state);
fac92740 6165 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6166 break;
6167 case 'd':
ffc61ed2 6168 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6169 *flagp |= HASWIDTH|SIMPLE;
830247a4 6170 nextchar(pRExC_state);
fac92740 6171 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6172 break;
6173 case 'D':
ffc61ed2 6174 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6175 *flagp |= HASWIDTH|SIMPLE;
830247a4 6176 nextchar(pRExC_state);
fac92740 6177 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6178 break;
a14b48bc
LW
6179 case 'p':
6180 case 'P':
3568d838 6181 {
3dab1dad 6182 char* const oldregxend = RExC_end;
ccb2c380 6183 char* parse_start = RExC_parse - 2;
a14b48bc 6184
830247a4 6185 if (RExC_parse[1] == '{') {
3568d838 6186 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6187 RExC_end = strchr(RExC_parse, '}');
6188 if (!RExC_end) {
3dab1dad 6189 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6190 RExC_parse += 2;
6191 RExC_end = oldregxend;
0da60cf5 6192 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6193 }
830247a4 6194 RExC_end++;
a14b48bc 6195 }
af6f566e 6196 else {
830247a4 6197 RExC_end = RExC_parse + 2;
af6f566e
HS
6198 if (RExC_end > oldregxend)
6199 RExC_end = oldregxend;
6200 }
830247a4 6201 RExC_parse--;
a14b48bc 6202
3dab1dad 6203 ret = regclass(pRExC_state,depth+1);
a14b48bc 6204
830247a4
IZ
6205 RExC_end = oldregxend;
6206 RExC_parse--;
ccb2c380
MP
6207
6208 Set_Node_Offset(ret, parse_start + 2);
6209 Set_Node_Cur_Length(ret);
830247a4 6210 nextchar(pRExC_state);
a14b48bc
LW
6211 *flagp |= HASWIDTH|SIMPLE;
6212 }
6213 break;
fc8cd66c
YO
6214 case 'N':
6215 /* Handle \N{NAME} here and not below because it can be
6216 multicharacter. join_exact() will join them up later on.
6217 Also this makes sure that things like /\N{BLAH}+/ and
6218 \N{BLAH} being multi char Just Happen. dmq*/
6219 ++RExC_parse;
6220 ret= reg_namedseq(pRExC_state, NULL);
6221 break;
0a4db386 6222 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6223 {
6224 char ch= RExC_parse[1];
6225 if (ch != '<' && ch != '\'') {
6226 if (SIZE_ONLY)
6227 vWARN( RExC_parse + 1,
6228 "Possible broken named back reference treated as literal k");
6229 parse_start--;
6230 goto defchar;
6231 } else {
6232 char* name_start = (RExC_parse += 2);
6233 I32 num = 0;
0a4db386
YO
6234 SV *sv_dat = reg_scan_name(pRExC_state,
6235 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6236 ch= (ch == '<') ? '>' : '\'';
6237
81714fb9
YO
6238 if (RExC_parse == name_start || *RExC_parse != ch)
6239 vFAIL2("Sequence \\k%c... not terminated",
6240 (ch == '>' ? '<' : ch));
6241
6242 RExC_sawback = 1;
6243 ret = reganode(pRExC_state,
6244 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6245 num);
6246 *flagp |= HASWIDTH;
6247
6248
6249 if (!SIZE_ONLY) {
81714fb9
YO
6250 num = add_data( pRExC_state, 1, "S" );
6251 ARG_SET(ret,num);
6252 RExC_rx->data->data[num]=(void*)sv_dat;
6253 SvREFCNT_inc(sv_dat);
6254 }
6255 /* override incorrect value set in reganode MJD */
6256 Set_Node_Offset(ret, parse_start+1);
6257 Set_Node_Cur_Length(ret); /* MJD */
6258 nextchar(pRExC_state);
6259
6260 }
6261 break;
6262 }
a0d0e21e
LW
6263 case 'n':
6264 case 'r':
6265 case 't':
6266 case 'f':
6267 case 'e':
6268 case 'a':
6269 case 'x':
6270 case 'c':
6271 case '0':
6272 goto defchar;
6273 case '1': case '2': case '3': case '4':
6274 case '5': case '6': case '7': case '8': case '9':
6275 {
1df70142 6276 const I32 num = atoi(RExC_parse);
a0d0e21e 6277
830247a4 6278 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6279 goto defchar;
6280 else {
3dab1dad 6281 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6282 while (isDIGIT(*RExC_parse))
6283 RExC_parse++;
b45f050a 6284
eb160463 6285 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 6286 vFAIL("Reference to nonexistent group");
830247a4 6287 RExC_sawback = 1;
eb160463
GS
6288 ret = reganode(pRExC_state,
6289 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6290 num);
a0d0e21e 6291 *flagp |= HASWIDTH;
2af232bd 6292
fac92740 6293 /* override incorrect value set in reganode MJD */
2af232bd 6294 Set_Node_Offset(ret, parse_start+1);
fac92740 6295 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6296 RExC_parse--;
6297 nextchar(pRExC_state);
a0d0e21e
LW
6298 }
6299 }
6300 break;
6301 case '\0':
830247a4 6302 if (RExC_parse >= RExC_end)
b45f050a 6303 FAIL("Trailing \\");
a0d0e21e
LW
6304 /* FALL THROUGH */
6305 default:
a0288114 6306 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6307 back into the quick-grab loop below */
45948336 6308 parse_start--;
a0d0e21e
LW
6309 goto defchar;
6310 }
6311 break;
4633a7c4
LW
6312
6313 case '#':
e2509266 6314 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6315 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6316 RExC_parse++;
830247a4 6317 if (RExC_parse < RExC_end)
4633a7c4
LW
6318 goto tryagain;
6319 }
6320 /* FALL THROUGH */
6321
a0d0e21e 6322 default: {
ba210ebe 6323 register STRLEN len;
58ae7d3f 6324 register UV ender;
a0d0e21e 6325 register char *p;
3dab1dad 6326 char *s;
80aecb99 6327 STRLEN foldlen;
89ebb4a3 6328 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6329
6330 parse_start = RExC_parse - 1;
a0d0e21e 6331
830247a4 6332 RExC_parse++;
a0d0e21e
LW
6333
6334 defchar:
58ae7d3f 6335 ender = 0;
eb160463
GS
6336 ret = reg_node(pRExC_state,
6337 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6338 s = STRING(ret);
830247a4
IZ
6339 for (len = 0, p = RExC_parse - 1;
6340 len < 127 && p < RExC_end;
a0d0e21e
LW
6341 len++)
6342 {
3dab1dad 6343 char * const oldp = p;
5b5a24f7 6344
e2509266 6345 if (RExC_flags & PMf_EXTENDED)
830247a4 6346 p = regwhite(p, RExC_end);
a0d0e21e
LW
6347 switch (*p) {
6348 case '^':
6349 case '$':
6350 case '.':
6351 case '[':
6352 case '(':
6353 case ')':
6354 case '|':
6355 goto loopdone;
6356 case '\\':
6357 switch (*++p) {
6358 case 'A':
1ed8eac0
JF
6359 case 'C':
6360 case 'X':
a0d0e21e
LW
6361 case 'G':
6362 case 'Z':
b85d18e9 6363 case 'z':
a0d0e21e
LW
6364 case 'w':
6365 case 'W':
6366 case 'b':
6367 case 'B':
6368 case 's':
6369 case 'S':
6370 case 'd':
6371 case 'D':
a14b48bc
LW
6372 case 'p':
6373 case 'P':
fc8cd66c 6374 case 'N':
a0d0e21e
LW
6375 --p;
6376 goto loopdone;
6377 case 'n':
6378 ender = '\n';
6379 p++;
a687059c 6380 break;
a0d0e21e
LW
6381 case 'r':
6382 ender = '\r';
6383 p++;
a687059c 6384 break;
a0d0e21e
LW
6385 case 't':
6386 ender = '\t';
6387 p++;
a687059c 6388 break;
a0d0e21e
LW
6389 case 'f':
6390 ender = '\f';
6391 p++;
a687059c 6392 break;
a0d0e21e 6393 case 'e':
c7f1f016 6394 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6395 p++;
a687059c 6396 break;
a0d0e21e 6397 case 'a':
c7f1f016 6398 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6399 p++;
a687059c 6400 break;
a0d0e21e 6401 case 'x':
a0ed51b3 6402 if (*++p == '{') {
1df70142 6403 char* const e = strchr(p, '}');
b81d288d 6404
b45f050a 6405 if (!e) {
830247a4 6406 RExC_parse = p + 1;
b45f050a
JF
6407 vFAIL("Missing right brace on \\x{}");
6408 }
de5f0749 6409 else {
a4c04bdc
NC
6410 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6411 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6412 STRLEN numlen = e - p - 1;
53305cf1 6413 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6414 if (ender > 0xff)
6415 RExC_utf8 = 1;
a0ed51b3
LW
6416 p = e + 1;
6417 }
a0ed51b3
LW
6418 }
6419 else {
a4c04bdc 6420 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6421 STRLEN numlen = 2;
53305cf1 6422 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6423 p += numlen;
6424 }
9e08bc66
TS
6425 if (PL_encoding && ender < 0x100)
6426 goto recode_encoding;
a687059c 6427 break;
a0d0e21e
LW
6428 case 'c':
6429 p++;
bbce6d69 6430 ender = UCHARAT(p++);
6431 ender = toCTRL(ender);
a687059c 6432 break;
a0d0e21e
LW
6433 case '0': case '1': case '2': case '3':case '4':
6434 case '5': case '6': case '7': case '8':case '9':
6435 if (*p == '0' ||
830247a4 6436 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6437 I32 flags = 0;
1df70142 6438 STRLEN numlen = 3;
53305cf1 6439 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6440 p += numlen;
6441 }
6442 else {
6443 --p;
6444 goto loopdone;
a687059c 6445 }
9e08bc66
TS
6446 if (PL_encoding && ender < 0x100)
6447 goto recode_encoding;
6448 break;
6449 recode_encoding:
6450 {
6451 SV* enc = PL_encoding;
6452 ender = reg_recode((const char)(U8)ender, &enc);
6453 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6454 vWARN(p, "Invalid escape in the specified encoding");
6455 RExC_utf8 = 1;
6456 }
a687059c 6457 break;
a0d0e21e 6458 case '\0':
830247a4 6459 if (p >= RExC_end)
b45f050a 6460 FAIL("Trailing \\");
a687059c 6461 /* FALL THROUGH */
a0d0e21e 6462 default:
041457d9 6463 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6464 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6465 goto normal_default;
a0d0e21e
LW
6466 }
6467 break;
a687059c 6468 default:
a0ed51b3 6469 normal_default:
fd400ab9 6470 if (UTF8_IS_START(*p) && UTF) {
1df70142 6471 STRLEN numlen;
5e12f4fb 6472 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6473 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6474 p += numlen;
6475 }
6476 else
6477 ender = *p++;
a0d0e21e 6478 break;
a687059c 6479 }
e2509266 6480 if (RExC_flags & PMf_EXTENDED)
830247a4 6481 p = regwhite(p, RExC_end);
60a8b682
JH
6482 if (UTF && FOLD) {
6483 /* Prime the casefolded buffer. */
ac7e0132 6484 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6485 }
a0d0e21e
LW
6486 if (ISMULT2(p)) { /* Back off on ?+*. */
6487 if (len)
6488 p = oldp;
16ea2a2e 6489 else if (UTF) {
80aecb99 6490 if (FOLD) {
60a8b682 6491 /* Emit all the Unicode characters. */
1df70142 6492 STRLEN numlen;
80aecb99
JH
6493 for (foldbuf = tmpbuf;
6494 foldlen;
6495 foldlen -= numlen) {
6496 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6497 if (numlen > 0) {
71207a34 6498 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6499 s += unilen;
6500 len += unilen;
6501 /* In EBCDIC the numlen
6502 * and unilen can differ. */
9dc45d57 6503 foldbuf += numlen;
47654450
JH
6504 if (numlen >= foldlen)
6505 break;
9dc45d57
JH
6506 }
6507 else
6508 break; /* "Can't happen." */
80aecb99
JH
6509 }
6510 }
6511 else {
71207a34 6512 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6513 if (unilen > 0) {
0ebc6274
JH
6514 s += unilen;
6515 len += unilen;
9dc45d57 6516 }
80aecb99 6517 }
a0ed51b3 6518 }
a0d0e21e
LW
6519 else {
6520 len++;
eb160463 6521 REGC((char)ender, s++);
a0d0e21e
LW
6522 }
6523 break;
a687059c 6524 }
16ea2a2e 6525 if (UTF) {
80aecb99 6526 if (FOLD) {
60a8b682 6527 /* Emit all the Unicode characters. */
1df70142 6528 STRLEN numlen;
80aecb99
JH
6529 for (foldbuf = tmpbuf;
6530 foldlen;
6531 foldlen -= numlen) {
6532 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6533 if (numlen > 0) {
71207a34 6534 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6535 len += unilen;
6536 s += unilen;
6537 /* In EBCDIC the numlen
6538 * and unilen can differ. */
9dc45d57 6539 foldbuf += numlen;
47654450
JH
6540 if (numlen >= foldlen)
6541 break;
9dc45d57
JH
6542 }
6543 else
6544 break;
80aecb99
JH
6545 }
6546 }
6547 else {
71207a34 6548 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6549 if (unilen > 0) {
0ebc6274
JH
6550 s += unilen;
6551 len += unilen;
9dc45d57 6552 }
80aecb99
JH
6553 }
6554 len--;
a0ed51b3
LW
6555 }
6556 else
eb160463 6557 REGC((char)ender, s++);
a0d0e21e
LW
6558 }
6559 loopdone:
830247a4 6560 RExC_parse = p - 1;
fac92740 6561 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6562 nextchar(pRExC_state);
793db0cb
JH
6563 {
6564 /* len is STRLEN which is unsigned, need to copy to signed */
6565 IV iv = len;
6566 if (iv < 0)
6567 vFAIL("Internal disaster");
6568 }
a0d0e21e
LW
6569 if (len > 0)
6570 *flagp |= HASWIDTH;
090f7165 6571 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6572 *flagp |= SIMPLE;
3dab1dad 6573
cd439c50 6574 if (SIZE_ONLY)
830247a4 6575 RExC_size += STR_SZ(len);
3dab1dad
YO
6576 else {
6577 STR_LEN(ret) = len;
830247a4 6578 RExC_emit += STR_SZ(len);
07be1b83 6579 }
3dab1dad 6580 }
a0d0e21e
LW
6581 break;
6582 }
a687059c 6583
a0d0e21e 6584 return(ret);
a687059c
LW
6585}
6586
873ef191 6587STATIC char *
5f66b61c 6588S_regwhite(char *p, const char *e)
5b5a24f7
CS
6589{
6590 while (p < e) {
6591 if (isSPACE(*p))
6592 ++p;
6593 else if (*p == '#') {
6594 do {
6595 p++;
6596 } while (p < e && *p != '\n');
6597 }
6598 else
6599 break;
6600 }
6601 return p;
6602}
6603
b8c5462f
JH
6604/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6605 Character classes ([:foo:]) can also be negated ([:^foo:]).
6606 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6607 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6608 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6609
6610#define POSIXCC_DONE(c) ((c) == ':')
6611#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6612#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6613
b8c5462f 6614STATIC I32
830247a4 6615S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6616{
97aff369 6617 dVAR;
936ed897 6618 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6619
830247a4 6620 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6621 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6622 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6623 const char c = UCHARAT(RExC_parse);
097eb12c 6624 char* const s = RExC_parse++;
b81d288d 6625
9a86a77b 6626 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6627 RExC_parse++;
6628 if (RExC_parse == RExC_end)
620e46c5 6629 /* Grandfather lone [:, [=, [. */
830247a4 6630 RExC_parse = s;
620e46c5 6631 else {
3dab1dad 6632 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6633 assert(*t == c);
6634
9a86a77b 6635 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6636 const char *posixcc = s + 1;
830247a4 6637 RExC_parse++; /* skip over the ending ] */
3dab1dad 6638
b8c5462f 6639 if (*s == ':') {
1df70142
AL
6640 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6641 const I32 skip = t - posixcc;
80916619
NC
6642
6643 /* Initially switch on the length of the name. */
6644 switch (skip) {
6645 case 4:
3dab1dad
YO
6646 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6647 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6648 break;
80916619
NC
6649 case 5:
6650 /* Names all of length 5. */
6651 /* alnum alpha ascii blank cntrl digit graph lower
6652 print punct space upper */
6653 /* Offset 4 gives the best switch position. */
6654 switch (posixcc[4]) {
6655 case 'a':
3dab1dad
YO
6656 if (memEQ(posixcc, "alph", 4)) /* alpha */
6657 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6658 break;
6659 case 'e':
3dab1dad
YO
6660 if (memEQ(posixcc, "spac", 4)) /* space */
6661 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6662 break;
6663 case 'h':
3dab1dad
YO
6664 if (memEQ(posixcc, "grap", 4)) /* graph */
6665 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6666 break;
6667 case 'i':
3dab1dad
YO
6668 if (memEQ(posixcc, "asci", 4)) /* ascii */
6669 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6670 break;
6671 case 'k':
3dab1dad
YO
6672 if (memEQ(posixcc, "blan", 4)) /* blank */
6673 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6674 break;
6675 case 'l':
3dab1dad
YO
6676 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6677 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6678 break;
6679 case 'm':
3dab1dad
YO
6680 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6681 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6682 break;
6683 case 'r':
3dab1dad
YO
6684 if (memEQ(posixcc, "lowe", 4)) /* lower */
6685 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6686 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6687 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6688 break;
6689 case 't':
3dab1dad
YO
6690 if (memEQ(posixcc, "digi", 4)) /* digit */
6691 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6692 else if (memEQ(posixcc, "prin", 4)) /* print */
6693 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6694 else if (memEQ(posixcc, "punc", 4)) /* punct */
6695 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6696 break;
b8c5462f
JH
6697 }
6698 break;
80916619 6699 case 6:
3dab1dad
YO
6700 if (memEQ(posixcc, "xdigit", 6))
6701 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6702 break;
6703 }
80916619
NC
6704
6705 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6706 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6707 t - s - 1, s + 1);
80916619
NC
6708 assert (posixcc[skip] == ':');
6709 assert (posixcc[skip+1] == ']');
b45f050a 6710 } else if (!SIZE_ONLY) {
b8c5462f 6711 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6712
830247a4 6713 /* adjust RExC_parse so the warning shows after
b45f050a 6714 the class closes */
9a86a77b 6715 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6716 RExC_parse++;
b45f050a
JF
6717 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6718 }
b8c5462f
JH
6719 } else {
6720 /* Maternal grandfather:
6721 * "[:" ending in ":" but not in ":]" */
830247a4 6722 RExC_parse = s;
767d463e 6723 }
620e46c5
JH
6724 }
6725 }
6726
b8c5462f
JH
6727 return namedclass;
6728}
6729
6730STATIC void
830247a4 6731S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6732{
97aff369 6733 dVAR;
3dab1dad 6734 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6735 const char *s = RExC_parse;
6736 const char c = *s++;
b8c5462f 6737
3dab1dad 6738 while (isALNUM(*s))
b8c5462f
JH
6739 s++;
6740 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6741 if (ckWARN(WARN_REGEXP))
6742 vWARN3(s+2,
6743 "POSIX syntax [%c %c] belongs inside character classes",
6744 c, c);
b45f050a
JF
6745
6746 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6747 if (POSIXCC_NOTYET(c)) {
830247a4 6748 /* adjust RExC_parse so the error shows after
b45f050a 6749 the class closes */
9a86a77b 6750 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6751 NOOP;
b45f050a
JF
6752 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6753 }
b8c5462f
JH
6754 }
6755 }
620e46c5
JH
6756}
6757
7f6f358c
YO
6758
6759/*
6760 parse a class specification and produce either an ANYOF node that
6761 matches the pattern. If the pattern matches a single char only and
6762 that char is < 256 then we produce an EXACT node instead.
6763*/
76e3520e 6764STATIC regnode *
3dab1dad 6765S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6766{
97aff369 6767 dVAR;
9ef43ace 6768 register UV value = 0;
9a86a77b 6769 register UV nextvalue;
3568d838 6770 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6771 register IV range = 0;
c277df42 6772 register regnode *ret;
ba210ebe 6773 STRLEN numlen;
ffc61ed2 6774 IV namedclass;
cbbf8932 6775 char *rangebegin = NULL;
936ed897 6776 bool need_class = 0;
c445ea15 6777 SV *listsv = NULL;
ffc61ed2 6778 UV n;
9e55ce06 6779 bool optimize_invert = TRUE;
cbbf8932 6780 AV* unicode_alternate = NULL;
1b2d223b
JH
6781#ifdef EBCDIC
6782 UV literal_endpoint = 0;
6783#endif
7f6f358c 6784 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6785
3dab1dad 6786 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6787 case we need to change the emitted regop to an EXACT. */
07be1b83 6788 const char * orig_parse = RExC_parse;
72f13be8 6789 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6790#ifndef DEBUGGING
6791 PERL_UNUSED_ARG(depth);
6792#endif
72f13be8 6793
3dab1dad 6794 DEBUG_PARSE("clas");
7f6f358c
YO
6795
6796 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6797 ret = reganode(pRExC_state, ANYOF, 0);
6798
6799 if (!SIZE_ONLY)
6800 ANYOF_FLAGS(ret) = 0;
6801
9a86a77b 6802 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6803 RExC_naughty++;
6804 RExC_parse++;
6805 if (!SIZE_ONLY)
6806 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6807 }
a0d0e21e 6808
73060fc4 6809 if (SIZE_ONLY) {
830247a4 6810 RExC_size += ANYOF_SKIP;
73060fc4
JH
6811 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6812 }
936ed897 6813 else {
830247a4 6814 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6815 if (FOLD)
6816 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6817 if (LOC)
6818 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6819 ANYOF_BITMAP_ZERO(ret);
396482e1 6820 listsv = newSVpvs("# comment\n");
a0d0e21e 6821 }
b8c5462f 6822
9a86a77b
JH
6823 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6824
b938889d 6825 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6826 checkposixcc(pRExC_state);
b8c5462f 6827
f064b6ad
HS
6828 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6829 if (UCHARAT(RExC_parse) == ']')
6830 goto charclassloop;
ffc61ed2 6831
fc8cd66c 6832parseit:
9a86a77b 6833 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6834
6835 charclassloop:
6836
6837 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6838
73b437c8 6839 if (!range)
830247a4 6840 rangebegin = RExC_parse;
ffc61ed2 6841 if (UTF) {
5e12f4fb 6842 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6843 RExC_end - RExC_parse,
9f7f3913 6844 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6845 RExC_parse += numlen;
6846 }
6847 else
6848 value = UCHARAT(RExC_parse++);
7f6f358c 6849
9a86a77b
JH
6850 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6851 if (value == '[' && POSIXCC(nextvalue))
830247a4 6852 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6853 else if (value == '\\') {
ffc61ed2 6854 if (UTF) {
5e12f4fb 6855 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6856 RExC_end - RExC_parse,
9f7f3913 6857 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6858 RExC_parse += numlen;
6859 }
6860 else
6861 value = UCHARAT(RExC_parse++);
470c3474 6862 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6863 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6864 * be a problem later if we want switch on Unicode.
6865 * A similar issue a little bit later when switching on
6866 * namedclass. --jhi */
ffc61ed2 6867 switch ((I32)value) {
b8c5462f
JH
6868 case 'w': namedclass = ANYOF_ALNUM; break;
6869 case 'W': namedclass = ANYOF_NALNUM; break;
6870 case 's': namedclass = ANYOF_SPACE; break;
6871 case 'S': namedclass = ANYOF_NSPACE; break;
6872 case 'd': namedclass = ANYOF_DIGIT; break;
6873 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6874 case 'N': /* Handle \N{NAME} in class */
6875 {
6876 /* We only pay attention to the first char of
6877 multichar strings being returned. I kinda wonder
6878 if this makes sense as it does change the behaviour
6879 from earlier versions, OTOH that behaviour was broken
6880 as well. */
6881 UV v; /* value is register so we cant & it /grrr */
6882 if (reg_namedseq(pRExC_state, &v)) {
6883 goto parseit;
6884 }
6885 value= v;
6886 }
6887 break;
ffc61ed2
JH
6888 case 'p':
6889 case 'P':
3dab1dad
YO
6890 {
6891 char *e;
af6f566e 6892 if (RExC_parse >= RExC_end)
2a4859cd 6893 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6894 if (*RExC_parse == '{') {
1df70142 6895 const U8 c = (U8)value;
ffc61ed2
JH
6896 e = strchr(RExC_parse++, '}');
6897 if (!e)
0da60cf5 6898 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6899 while (isSPACE(UCHARAT(RExC_parse)))
6900 RExC_parse++;
6901 if (e == RExC_parse)
0da60cf5 6902 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6903 n = e - RExC_parse;
ab13f0c7
JH
6904 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6905 n--;
ffc61ed2
JH
6906 }
6907 else {
6908 e = RExC_parse;
6909 n = 1;
6910 }
6911 if (!SIZE_ONLY) {
ab13f0c7
JH
6912 if (UCHARAT(RExC_parse) == '^') {
6913 RExC_parse++;
6914 n--;
6915 value = value == 'p' ? 'P' : 'p'; /* toggle */
6916 while (isSPACE(UCHARAT(RExC_parse))) {
6917 RExC_parse++;
6918 n--;
6919 }
6920 }
097eb12c
AL
6921 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6922 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6923 }
6924 RExC_parse = e + 1;
6925 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6926 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6927 }
f81125e2 6928 break;
b8c5462f
JH
6929 case 'n': value = '\n'; break;
6930 case 'r': value = '\r'; break;
6931 case 't': value = '\t'; break;
6932 case 'f': value = '\f'; break;
6933 case 'b': value = '\b'; break;
c7f1f016
NIS
6934 case 'e': value = ASCII_TO_NATIVE('\033');break;
6935 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6936 case 'x':
ffc61ed2 6937 if (*RExC_parse == '{') {
a4c04bdc
NC
6938 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6939 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6940 char * const e = strchr(RExC_parse++, '}');
b81d288d 6941 if (!e)
ffc61ed2 6942 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6943
6944 numlen = e - RExC_parse;
6945 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6946 RExC_parse = e + 1;
6947 }
6948 else {
a4c04bdc 6949 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6950 numlen = 2;
6951 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6952 RExC_parse += numlen;
6953 }
9e08bc66
TS
6954 if (PL_encoding && value < 0x100)
6955 goto recode_encoding;
b8c5462f
JH
6956 break;
6957 case 'c':
830247a4 6958 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6959 value = toCTRL(value);
6960 break;
6961 case '0': case '1': case '2': case '3': case '4':
6962 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
6963 {
6964 I32 flags = 0;
6965 numlen = 3;
6966 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6967 RExC_parse += numlen;
6968 if (PL_encoding && value < 0x100)
6969 goto recode_encoding;
6970 break;
6971 }
6972 recode_encoding:
6973 {
6974 SV* enc = PL_encoding;
6975 value = reg_recode((const char)(U8)value, &enc);
6976 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6977 vWARN(RExC_parse,
6978 "Invalid escape in the specified encoding");
6979 break;
6980 }
1028017a 6981 default:
041457d9 6982 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6983 vWARN2(RExC_parse,
6984 "Unrecognized escape \\%c in character class passed through",
6985 (int)value);
1028017a 6986 break;
b8c5462f 6987 }
ffc61ed2 6988 } /* end of \blah */
1b2d223b
JH
6989#ifdef EBCDIC
6990 else
6991 literal_endpoint++;
6992#endif
ffc61ed2
JH
6993
6994 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6995
6996 if (!SIZE_ONLY && !need_class)
936ed897 6997 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6998
936ed897 6999 need_class = 1;
ffc61ed2
JH
7000
7001 /* a bad range like a-\d, a-[:digit:] ? */
7002 if (range) {
73b437c8 7003 if (!SIZE_ONLY) {
afd78fd5 7004 if (ckWARN(WARN_REGEXP)) {
097eb12c 7005 const int w =
afd78fd5
JH
7006 RExC_parse >= rangebegin ?
7007 RExC_parse - rangebegin : 0;
830247a4 7008 vWARN4(RExC_parse,
b45f050a 7009 "False [] range \"%*.*s\"",
097eb12c 7010 w, w, rangebegin);
afd78fd5 7011 }
3568d838
JH
7012 if (prevvalue < 256) {
7013 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7014 ANYOF_BITMAP_SET(ret, '-');
7015 }
7016 else {
7017 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7018 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7019 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7020 }
b8c5462f 7021 }
ffc61ed2
JH
7022
7023 range = 0; /* this was not a true range */
73b437c8 7024 }
ffc61ed2 7025
73b437c8 7026 if (!SIZE_ONLY) {
c49a72a9
NC
7027 const char *what = NULL;
7028 char yesno = 0;
7029
3568d838
JH
7030 if (namedclass > OOB_NAMEDCLASS)
7031 optimize_invert = FALSE;
e2962f66
JH
7032 /* Possible truncation here but in some 64-bit environments
7033 * the compiler gets heartburn about switch on 64-bit values.
7034 * A similar issue a little earlier when switching on value.
98f323fa 7035 * --jhi */
e2962f66 7036 switch ((I32)namedclass) {
73b437c8
JH
7037 case ANYOF_ALNUM:
7038 if (LOC)
936ed897 7039 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
7040 else {
7041 for (value = 0; value < 256; value++)
7042 if (isALNUM(value))
936ed897 7043 ANYOF_BITMAP_SET(ret, value);
73b437c8 7044 }
c49a72a9
NC
7045 yesno = '+';
7046 what = "Word";
73b437c8
JH
7047 break;
7048 case ANYOF_NALNUM:
7049 if (LOC)
936ed897 7050 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
7051 else {
7052 for (value = 0; value < 256; value++)
7053 if (!isALNUM(value))
936ed897 7054 ANYOF_BITMAP_SET(ret, value);
73b437c8 7055 }
c49a72a9
NC
7056 yesno = '!';
7057 what = "Word";
73b437c8 7058 break;
ffc61ed2 7059 case ANYOF_ALNUMC:
73b437c8 7060 if (LOC)
ffc61ed2 7061 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
7062 else {
7063 for (value = 0; value < 256; value++)
ffc61ed2 7064 if (isALNUMC(value))
936ed897 7065 ANYOF_BITMAP_SET(ret, value);
73b437c8 7066 }
c49a72a9
NC
7067 yesno = '+';
7068 what = "Alnum";
73b437c8
JH
7069 break;
7070 case ANYOF_NALNUMC:
7071 if (LOC)
936ed897 7072 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
7073 else {
7074 for (value = 0; value < 256; value++)
7075 if (!isALNUMC(value))
936ed897 7076 ANYOF_BITMAP_SET(ret, value);
73b437c8 7077 }
c49a72a9
NC
7078 yesno = '!';
7079 what = "Alnum";
73b437c8
JH
7080 break;
7081 case ANYOF_ALPHA:
7082 if (LOC)
936ed897 7083 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
7084 else {
7085 for (value = 0; value < 256; value++)
7086 if (isALPHA(value))
936ed897 7087 ANYOF_BITMAP_SET(ret, value);
73b437c8 7088 }
c49a72a9
NC
7089 yesno = '+';
7090 what = "Alpha";
73b437c8
JH
7091 break;
7092 case ANYOF_NALPHA:
7093 if (LOC)
936ed897 7094 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
7095 else {
7096 for (value = 0; value < 256; value++)
7097 if (!isALPHA(value))
936ed897 7098 ANYOF_BITMAP_SET(ret, value);
73b437c8 7099 }
c49a72a9
NC
7100 yesno = '!';
7101 what = "Alpha";
73b437c8
JH
7102 break;
7103 case ANYOF_ASCII:
7104 if (LOC)
936ed897 7105 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7106 else {
c7f1f016 7107#ifndef EBCDIC
1ba5c669
JH
7108 for (value = 0; value < 128; value++)
7109 ANYOF_BITMAP_SET(ret, value);
7110#else /* EBCDIC */
ffbc6a93 7111 for (value = 0; value < 256; value++) {
3a3c4447
JH
7112 if (isASCII(value))
7113 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7114 }
1ba5c669 7115#endif /* EBCDIC */
73b437c8 7116 }
c49a72a9
NC
7117 yesno = '+';
7118 what = "ASCII";
73b437c8
JH
7119 break;
7120 case ANYOF_NASCII:
7121 if (LOC)
936ed897 7122 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7123 else {
c7f1f016 7124#ifndef EBCDIC
1ba5c669
JH
7125 for (value = 128; value < 256; value++)
7126 ANYOF_BITMAP_SET(ret, value);
7127#else /* EBCDIC */
ffbc6a93 7128 for (value = 0; value < 256; value++) {
3a3c4447
JH
7129 if (!isASCII(value))
7130 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7131 }
1ba5c669 7132#endif /* EBCDIC */
73b437c8 7133 }
c49a72a9
NC
7134 yesno = '!';
7135 what = "ASCII";
73b437c8 7136 break;
aaa51d5e
JF
7137 case ANYOF_BLANK:
7138 if (LOC)
7139 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
7140 else {
7141 for (value = 0; value < 256; value++)
7142 if (isBLANK(value))
7143 ANYOF_BITMAP_SET(ret, value);
7144 }
c49a72a9
NC
7145 yesno = '+';
7146 what = "Blank";
aaa51d5e
JF
7147 break;
7148 case ANYOF_NBLANK:
7149 if (LOC)
7150 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
7151 else {
7152 for (value = 0; value < 256; value++)
7153 if (!isBLANK(value))
7154 ANYOF_BITMAP_SET(ret, value);
7155 }
c49a72a9
NC
7156 yesno = '!';
7157 what = "Blank";
aaa51d5e 7158 break;
73b437c8
JH
7159 case ANYOF_CNTRL:
7160 if (LOC)
936ed897 7161 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7162 else {
7163 for (value = 0; value < 256; value++)
7164 if (isCNTRL(value))
936ed897 7165 ANYOF_BITMAP_SET(ret, value);
73b437c8 7166 }
c49a72a9
NC
7167 yesno = '+';
7168 what = "Cntrl";
73b437c8
JH
7169 break;
7170 case ANYOF_NCNTRL:
7171 if (LOC)
936ed897 7172 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7173 else {
7174 for (value = 0; value < 256; value++)
7175 if (!isCNTRL(value))
936ed897 7176 ANYOF_BITMAP_SET(ret, value);
73b437c8 7177 }
c49a72a9
NC
7178 yesno = '!';
7179 what = "Cntrl";
ffc61ed2
JH
7180 break;
7181 case ANYOF_DIGIT:
7182 if (LOC)
7183 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7184 else {
7185 /* consecutive digits assumed */
7186 for (value = '0'; value <= '9'; value++)
7187 ANYOF_BITMAP_SET(ret, value);
7188 }
c49a72a9
NC
7189 yesno = '+';
7190 what = "Digit";
ffc61ed2
JH
7191 break;
7192 case ANYOF_NDIGIT:
7193 if (LOC)
7194 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7195 else {
7196 /* consecutive digits assumed */
7197 for (value = 0; value < '0'; value++)
7198 ANYOF_BITMAP_SET(ret, value);
7199 for (value = '9' + 1; value < 256; value++)
7200 ANYOF_BITMAP_SET(ret, value);
7201 }
c49a72a9
NC
7202 yesno = '!';
7203 what = "Digit";
73b437c8
JH
7204 break;
7205 case ANYOF_GRAPH:
7206 if (LOC)
936ed897 7207 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7208 else {
7209 for (value = 0; value < 256; value++)
7210 if (isGRAPH(value))
936ed897 7211 ANYOF_BITMAP_SET(ret, value);
73b437c8 7212 }
c49a72a9
NC
7213 yesno = '+';
7214 what = "Graph";
73b437c8
JH
7215 break;
7216 case ANYOF_NGRAPH:
7217 if (LOC)
936ed897 7218 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7219 else {
7220 for (value = 0; value < 256; value++)
7221 if (!isGRAPH(value))
936ed897 7222 ANYOF_BITMAP_SET(ret, value);
73b437c8 7223 }
c49a72a9
NC
7224 yesno = '!';
7225 what = "Graph";
73b437c8
JH
7226 break;
7227 case ANYOF_LOWER:
7228 if (LOC)
936ed897 7229 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7230 else {
7231 for (value = 0; value < 256; value++)
7232 if (isLOWER(value))
936ed897 7233 ANYOF_BITMAP_SET(ret, value);
73b437c8 7234 }
c49a72a9
NC
7235 yesno = '+';
7236 what = "Lower";
73b437c8
JH
7237 break;
7238 case ANYOF_NLOWER:
7239 if (LOC)
936ed897 7240 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7241 else {
7242 for (value = 0; value < 256; value++)
7243 if (!isLOWER(value))
936ed897 7244 ANYOF_BITMAP_SET(ret, value);
73b437c8 7245 }
c49a72a9
NC
7246 yesno = '!';
7247 what = "Lower";
73b437c8
JH
7248 break;
7249 case ANYOF_PRINT:
7250 if (LOC)
936ed897 7251 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7252 else {
7253 for (value = 0; value < 256; value++)
7254 if (isPRINT(value))
936ed897 7255 ANYOF_BITMAP_SET(ret, value);
73b437c8 7256 }
c49a72a9
NC
7257 yesno = '+';
7258 what = "Print";
73b437c8
JH
7259 break;
7260 case ANYOF_NPRINT:
7261 if (LOC)
936ed897 7262 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7263 else {
7264 for (value = 0; value < 256; value++)
7265 if (!isPRINT(value))
936ed897 7266 ANYOF_BITMAP_SET(ret, value);
73b437c8 7267 }
c49a72a9
NC
7268 yesno = '!';
7269 what = "Print";
73b437c8 7270 break;
aaa51d5e
JF
7271 case ANYOF_PSXSPC:
7272 if (LOC)
7273 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7274 else {
7275 for (value = 0; value < 256; value++)
7276 if (isPSXSPC(value))
7277 ANYOF_BITMAP_SET(ret, value);
7278 }
c49a72a9
NC
7279 yesno = '+';
7280 what = "Space";
aaa51d5e
JF
7281 break;
7282 case ANYOF_NPSXSPC:
7283 if (LOC)
7284 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7285 else {
7286 for (value = 0; value < 256; value++)
7287 if (!isPSXSPC(value))
7288 ANYOF_BITMAP_SET(ret, value);
7289 }
c49a72a9
NC
7290 yesno = '!';
7291 what = "Space";
aaa51d5e 7292 break;
73b437c8
JH
7293 case ANYOF_PUNCT:
7294 if (LOC)
936ed897 7295 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7296 else {
7297 for (value = 0; value < 256; value++)
7298 if (isPUNCT(value))
936ed897 7299 ANYOF_BITMAP_SET(ret, value);
73b437c8 7300 }
c49a72a9
NC
7301 yesno = '+';
7302 what = "Punct";
73b437c8
JH
7303 break;
7304 case ANYOF_NPUNCT:
7305 if (LOC)
936ed897 7306 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7307 else {
7308 for (value = 0; value < 256; value++)
7309 if (!isPUNCT(value))
936ed897 7310 ANYOF_BITMAP_SET(ret, value);
73b437c8 7311 }
c49a72a9
NC
7312 yesno = '!';
7313 what = "Punct";
ffc61ed2
JH
7314 break;
7315 case ANYOF_SPACE:
7316 if (LOC)
7317 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7318 else {
7319 for (value = 0; value < 256; value++)
7320 if (isSPACE(value))
7321 ANYOF_BITMAP_SET(ret, value);
7322 }
c49a72a9
NC
7323 yesno = '+';
7324 what = "SpacePerl";
ffc61ed2
JH
7325 break;
7326 case ANYOF_NSPACE:
7327 if (LOC)
7328 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7329 else {
7330 for (value = 0; value < 256; value++)
7331 if (!isSPACE(value))
7332 ANYOF_BITMAP_SET(ret, value);
7333 }
c49a72a9
NC
7334 yesno = '!';
7335 what = "SpacePerl";
73b437c8
JH
7336 break;
7337 case ANYOF_UPPER:
7338 if (LOC)
936ed897 7339 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7340 else {
7341 for (value = 0; value < 256; value++)
7342 if (isUPPER(value))
936ed897 7343 ANYOF_BITMAP_SET(ret, value);
73b437c8 7344 }
c49a72a9
NC
7345 yesno = '+';
7346 what = "Upper";
73b437c8
JH
7347 break;
7348 case ANYOF_NUPPER:
7349 if (LOC)
936ed897 7350 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7351 else {
7352 for (value = 0; value < 256; value++)
7353 if (!isUPPER(value))
936ed897 7354 ANYOF_BITMAP_SET(ret, value);
73b437c8 7355 }
c49a72a9
NC
7356 yesno = '!';
7357 what = "Upper";
73b437c8
JH
7358 break;
7359 case ANYOF_XDIGIT:
7360 if (LOC)
936ed897 7361 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7362 else {
7363 for (value = 0; value < 256; value++)
7364 if (isXDIGIT(value))
936ed897 7365 ANYOF_BITMAP_SET(ret, value);
73b437c8 7366 }
c49a72a9
NC
7367 yesno = '+';
7368 what = "XDigit";
73b437c8
JH
7369 break;
7370 case ANYOF_NXDIGIT:
7371 if (LOC)
936ed897 7372 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7373 else {
7374 for (value = 0; value < 256; value++)
7375 if (!isXDIGIT(value))
936ed897 7376 ANYOF_BITMAP_SET(ret, value);
73b437c8 7377 }
c49a72a9
NC
7378 yesno = '!';
7379 what = "XDigit";
73b437c8 7380 break;
f81125e2
JP
7381 case ANYOF_MAX:
7382 /* this is to handle \p and \P */
7383 break;
73b437c8 7384 default:
b45f050a 7385 vFAIL("Invalid [::] class");
73b437c8 7386 break;
b8c5462f 7387 }
c49a72a9
NC
7388 if (what) {
7389 /* Strings such as "+utf8::isWord\n" */
7390 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7391 }
b8c5462f 7392 if (LOC)
936ed897 7393 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7394 continue;
a0d0e21e 7395 }
ffc61ed2
JH
7396 } /* end of namedclass \blah */
7397
a0d0e21e 7398 if (range) {
eb160463 7399 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7400 const int w = RExC_parse - rangebegin;
7401 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7402 range = 0; /* not a valid range */
73b437c8 7403 }
a0d0e21e
LW
7404 }
7405 else {
3568d838 7406 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7407 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7408 RExC_parse[1] != ']') {
7409 RExC_parse++;
ffc61ed2
JH
7410
7411 /* a bad range like \w-, [:word:]- ? */
7412 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7413 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7414 const int w =
afd78fd5
JH
7415 RExC_parse >= rangebegin ?
7416 RExC_parse - rangebegin : 0;
830247a4 7417 vWARN4(RExC_parse,
b45f050a 7418 "False [] range \"%*.*s\"",
097eb12c 7419 w, w, rangebegin);
afd78fd5 7420 }
73b437c8 7421 if (!SIZE_ONLY)
936ed897 7422 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7423 } else
ffc61ed2
JH
7424 range = 1; /* yeah, it's a range! */
7425 continue; /* but do it the next time */
a0d0e21e 7426 }
a687059c 7427 }
ffc61ed2 7428
93733859 7429 /* now is the next time */
07be1b83 7430 /*stored += (value - prevvalue + 1);*/
ae5c130c 7431 if (!SIZE_ONLY) {
3568d838 7432 if (prevvalue < 256) {
1df70142 7433 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7434 IV i;
3568d838 7435#ifdef EBCDIC
1b2d223b
JH
7436 /* In EBCDIC [\x89-\x91] should include
7437 * the \x8e but [i-j] should not. */
7438 if (literal_endpoint == 2 &&
7439 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7440 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7441 {
3568d838
JH
7442 if (isLOWER(prevvalue)) {
7443 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7444 if (isLOWER(i))
7445 ANYOF_BITMAP_SET(ret, i);
7446 } else {
3568d838 7447 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7448 if (isUPPER(i))
7449 ANYOF_BITMAP_SET(ret, i);
7450 }
8ada0baa 7451 }
ffc61ed2 7452 else
8ada0baa 7453#endif
07be1b83
YO
7454 for (i = prevvalue; i <= ceilvalue; i++) {
7455 if (!ANYOF_BITMAP_TEST(ret,i)) {
7456 stored++;
7457 ANYOF_BITMAP_SET(ret, i);
7458 }
7459 }
3568d838 7460 }
a5961de5 7461 if (value > 255 || UTF) {
1df70142
AL
7462 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7463 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7464 stored+=2; /* can't optimize this class */
ffc61ed2 7465 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7466 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7467 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7468 prevnatvalue, natvalue);
7469 }
7470 else if (prevnatvalue == natvalue) {
7471 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7472 if (FOLD) {
89ebb4a3 7473 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7474 STRLEN foldlen;
1df70142 7475 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7476
e294cc5d
JH
7477#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7478 if (RExC_precomp[0] == ':' &&
7479 RExC_precomp[1] == '[' &&
7480 (f == 0xDF || f == 0x92)) {
7481 f = NATIVE_TO_UNI(f);
7482 }
7483#endif
c840d2a2
JH
7484 /* If folding and foldable and a single
7485 * character, insert also the folded version
7486 * to the charclass. */
9e55ce06 7487 if (f != value) {
e294cc5d
JH
7488#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7489 if ((RExC_precomp[0] == ':' &&
7490 RExC_precomp[1] == '[' &&
7491 (f == 0xA2 &&
7492 (value == 0xFB05 || value == 0xFB06))) ?
7493 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7494 foldlen == (STRLEN)UNISKIP(f) )
7495#else
eb160463 7496 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7497#endif
9e55ce06
JH
7498 Perl_sv_catpvf(aTHX_ listsv,
7499 "%04"UVxf"\n", f);
7500 else {
7501 /* Any multicharacter foldings
7502 * require the following transform:
7503 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7504 * where E folds into "pq" and F folds
7505 * into "rst", all other characters
7506 * fold to single characters. We save
7507 * away these multicharacter foldings,
7508 * to be later saved as part of the
7509 * additional "s" data. */
7510 SV *sv;
7511
7512 if (!unicode_alternate)
7513 unicode_alternate = newAV();
7514 sv = newSVpvn((char*)foldbuf, foldlen);
7515 SvUTF8_on(sv);
7516 av_push(unicode_alternate, sv);
7517 }
7518 }
254ba52a 7519
60a8b682
JH
7520 /* If folding and the value is one of the Greek
7521 * sigmas insert a few more sigmas to make the
7522 * folding rules of the sigmas to work right.
7523 * Note that not all the possible combinations
7524 * are handled here: some of them are handled
9e55ce06
JH
7525 * by the standard folding rules, and some of
7526 * them (literal or EXACTF cases) are handled
7527 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7528 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7529 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7530 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7531 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7532 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7533 }
7534 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7535 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7536 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7537 }
7538 }
ffc61ed2 7539 }
1b2d223b
JH
7540#ifdef EBCDIC
7541 literal_endpoint = 0;
7542#endif
8ada0baa 7543 }
ffc61ed2
JH
7544
7545 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7546 }
ffc61ed2 7547
936ed897 7548 if (need_class) {
4f66b38d 7549 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7550 if (SIZE_ONLY)
830247a4 7551 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7552 else
830247a4 7553 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7554 }
ffc61ed2 7555
7f6f358c
YO
7556
7557 if (SIZE_ONLY)
7558 return ret;
7559 /****** !SIZE_ONLY AFTER HERE *********/
7560
7561 if( stored == 1 && value < 256
7562 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7563 ) {
7564 /* optimize single char class to an EXACT node
7565 but *only* when its not a UTF/high char */
07be1b83
YO
7566 const char * cur_parse= RExC_parse;
7567 RExC_emit = (regnode *)orig_emit;
7568 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7569 ret = reg_node(pRExC_state,
7570 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7571 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7572 *STRING(ret)= (char)value;
7573 STR_LEN(ret)= 1;
7574 RExC_emit += STR_SZ(1);
7575 return ret;
7576 }
ae5c130c 7577 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7578 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7579 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7580 ) {
a0ed51b3 7581 for (value = 0; value < 256; ++value) {
936ed897 7582 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7583 UV fold = PL_fold[value];
ffc61ed2
JH
7584
7585 if (fold != value)
7586 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7587 }
7588 }
936ed897 7589 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7590 }
ffc61ed2 7591
ae5c130c 7592 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7593 if (optimize_invert &&
ffc61ed2
JH
7594 /* If the only flag is inversion. */
7595 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7596 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7597 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7598 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7599 }
7f6f358c 7600 {
097eb12c 7601 AV * const av = newAV();
ffc61ed2 7602 SV *rv;
9e55ce06 7603 /* The 0th element stores the character class description
6a0407ee 7604 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7605 * to initialize the appropriate swash (which gets stored in
7606 * the 1st element), and also useful for dumping the regnode.
7607 * The 2nd element stores the multicharacter foldings,
6a0407ee 7608 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7609 av_store(av, 0, listsv);
7610 av_store(av, 1, NULL);
9e55ce06 7611 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7612 rv = newRV_noinc((SV*)av);
19860706 7613 n = add_data(pRExC_state, 1, "s");
830247a4 7614 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7615 ARG_SET(ret, n);
a0ed51b3 7616 }
a0ed51b3
LW
7617 return ret;
7618}
7619
76e3520e 7620STATIC char*
830247a4 7621S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7622{
097eb12c 7623 char* const retval = RExC_parse++;
a0d0e21e 7624
4633a7c4 7625 for (;;) {
830247a4
IZ
7626 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7627 RExC_parse[2] == '#') {
e994fd66
AE
7628 while (*RExC_parse != ')') {
7629 if (RExC_parse == RExC_end)
7630 FAIL("Sequence (?#... not terminated");
830247a4 7631 RExC_parse++;
e994fd66 7632 }
830247a4 7633 RExC_parse++;
4633a7c4
LW
7634 continue;
7635 }
e2509266 7636 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7637 if (isSPACE(*RExC_parse)) {
7638 RExC_parse++;
748a9306
LW
7639 continue;
7640 }
830247a4 7641 else if (*RExC_parse == '#') {
e994fd66
AE
7642 while (RExC_parse < RExC_end)
7643 if (*RExC_parse++ == '\n') break;
748a9306
LW
7644 continue;
7645 }
748a9306 7646 }
4633a7c4 7647 return retval;
a0d0e21e 7648 }
a687059c
LW
7649}
7650
7651/*
c277df42 7652- reg_node - emit a node
a0d0e21e 7653*/
76e3520e 7654STATIC regnode * /* Location. */
830247a4 7655S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7656{
97aff369 7657 dVAR;
c277df42 7658 register regnode *ptr;
504618e9 7659 regnode * const ret = RExC_emit;
07be1b83 7660 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7661
c277df42 7662 if (SIZE_ONLY) {
830247a4
IZ
7663 SIZE_ALIGN(RExC_size);
7664 RExC_size += 1;
a0d0e21e
LW
7665 return(ret);
7666 }
e2e6a0f1
YO
7667#ifdef DEBUGGING
7668 if (OP(RExC_emit) == 255)
7669 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7670 reg_name[op], OP(RExC_emit));
7671#endif
c277df42 7672 NODE_ALIGN_FILL(ret);
a0d0e21e 7673 ptr = ret;
c277df42 7674 FILL_ADVANCE_NODE(ptr, op);
fac92740 7675 if (RExC_offsets) { /* MJD */
07be1b83 7676 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7677 "reg_node", __LINE__,
7678 reg_name[op],
07be1b83
YO
7679 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7680 ? "Overwriting end of array!\n" : "OK",
7681 (UV)(RExC_emit - RExC_emit_start),
7682 (UV)(RExC_parse - RExC_start),
7683 (UV)RExC_offsets[0]));
ccb2c380 7684 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7685 }
07be1b83 7686
830247a4 7687 RExC_emit = ptr;
a0d0e21e 7688 return(ret);
a687059c
LW
7689}
7690
7691/*
a0d0e21e
LW
7692- reganode - emit a node with an argument
7693*/
76e3520e 7694STATIC regnode * /* Location. */
830247a4 7695S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7696{
97aff369 7697 dVAR;
c277df42 7698 register regnode *ptr;
504618e9 7699 regnode * const ret = RExC_emit;
07be1b83 7700 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7701
c277df42 7702 if (SIZE_ONLY) {
830247a4
IZ
7703 SIZE_ALIGN(RExC_size);
7704 RExC_size += 2;
6bda09f9
YO
7705 /*
7706 We can't do this:
7707
7708 assert(2==regarglen[op]+1);
7709
7710 Anything larger than this has to allocate the extra amount.
7711 If we changed this to be:
7712
7713 RExC_size += (1 + regarglen[op]);
7714
7715 then it wouldn't matter. Its not clear what side effect
7716 might come from that so its not done so far.
7717 -- dmq
7718 */
a0d0e21e
LW
7719 return(ret);
7720 }
e2e6a0f1
YO
7721#ifdef DEBUGGING
7722 if (OP(RExC_emit) == 255)
7723 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7724#endif
c277df42 7725 NODE_ALIGN_FILL(ret);
a0d0e21e 7726 ptr = ret;
c277df42 7727 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7728 if (RExC_offsets) { /* MJD */
07be1b83 7729 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7730 "reganode",
ccb2c380
MP
7731 __LINE__,
7732 reg_name[op],
07be1b83 7733 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7734 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7735 (UV)(RExC_emit - RExC_emit_start),
7736 (UV)(RExC_parse - RExC_start),
7737 (UV)RExC_offsets[0]));
ccb2c380 7738 Set_Cur_Node_Offset;
fac92740
MJD
7739 }
7740
830247a4 7741 RExC_emit = ptr;
a0d0e21e 7742 return(ret);
fe14fcc3
LW
7743}
7744
7745/*
cd439c50 7746- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7747*/
71207a34
AL
7748STATIC STRLEN
7749S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7750{
97aff369 7751 dVAR;
71207a34 7752 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7753}
7754
7755/*
a0d0e21e
LW
7756- reginsert - insert an operator in front of already-emitted operand
7757*
7758* Means relocating the operand.
7759*/
76e3520e 7760STATIC void
6bda09f9 7761S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7762{
97aff369 7763 dVAR;
c277df42
IZ
7764 register regnode *src;
7765 register regnode *dst;
7766 register regnode *place;
504618e9 7767 const int offset = regarglen[(U8)op];
6bda09f9 7768 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7769 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7770/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7771 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7772 if (SIZE_ONLY) {
6bda09f9 7773 RExC_size += size;
a0d0e21e
LW
7774 return;
7775 }
a687059c 7776
830247a4 7777 src = RExC_emit;
6bda09f9 7778 RExC_emit += size;
830247a4 7779 dst = RExC_emit;
40d049e4 7780 if (RExC_open_parens) {
6bda09f9 7781 int paren;
6d99fb9b 7782 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7783 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7784 if ( RExC_open_parens[paren] >= opnd ) {
7785 DEBUG_PARSE_FMT("open"," - %d",size);
7786 RExC_open_parens[paren] += size;
7787 } else {
7788 DEBUG_PARSE_FMT("open"," - %s","ok");
7789 }
7790 if ( RExC_close_parens[paren] >= opnd ) {
7791 DEBUG_PARSE_FMT("close"," - %d",size);
7792 RExC_close_parens[paren] += size;
7793 } else {
7794 DEBUG_PARSE_FMT("close"," - %s","ok");
7795 }
7796 }
6bda09f9 7797 }
40d049e4 7798
fac92740 7799 while (src > opnd) {
c277df42 7800 StructCopy(--src, --dst, regnode);
fac92740 7801 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7802 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7803 "reg_insert",
ccb2c380
MP
7804 __LINE__,
7805 reg_name[op],
07be1b83
YO
7806 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7807 ? "Overwriting end of array!\n" : "OK",
7808 (UV)(src - RExC_emit_start),
7809 (UV)(dst - RExC_emit_start),
7810 (UV)RExC_offsets[0]));
ccb2c380
MP
7811 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7812 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7813 }
7814 }
7815
a0d0e21e
LW
7816
7817 place = opnd; /* Op node, where operand used to be. */
fac92740 7818 if (RExC_offsets) { /* MJD */
07be1b83 7819 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7820 "reginsert",
ccb2c380
MP
7821 __LINE__,
7822 reg_name[op],
07be1b83 7823 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7824 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7825 (UV)(place - RExC_emit_start),
7826 (UV)(RExC_parse - RExC_start),
786e8c11 7827 (UV)RExC_offsets[0]));
ccb2c380 7828 Set_Node_Offset(place, RExC_parse);
45948336 7829 Set_Node_Length(place, 1);
fac92740 7830 }
c277df42
IZ
7831 src = NEXTOPER(place);
7832 FILL_ADVANCE_NODE(place, op);
7833 Zero(src, offset, regnode);
a687059c
LW
7834}
7835
7836/*
c277df42 7837- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7838- SEE ALSO: regtail_study
a0d0e21e 7839*/
097eb12c 7840/* TODO: All three parms should be const */
76e3520e 7841STATIC void
3dab1dad 7842S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7843{
97aff369 7844 dVAR;
c277df42 7845 register regnode *scan;
72f13be8 7846 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7847#ifndef DEBUGGING
7848 PERL_UNUSED_ARG(depth);
7849#endif
a0d0e21e 7850
c277df42 7851 if (SIZE_ONLY)
a0d0e21e
LW
7852 return;
7853
7854 /* Find last node. */
7855 scan = p;
7856 for (;;) {
504618e9 7857 regnode * const temp = regnext(scan);
3dab1dad
YO
7858 DEBUG_PARSE_r({
7859 SV * const mysv=sv_newmortal();
7860 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7861 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7862 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7863 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7864 (temp == NULL ? "->" : ""),
7865 (temp == NULL ? reg_name[OP(val)] : "")
7866 );
3dab1dad
YO
7867 });
7868 if (temp == NULL)
7869 break;
7870 scan = temp;
7871 }
7872
7873 if (reg_off_by_arg[OP(scan)]) {
7874 ARG_SET(scan, val - scan);
7875 }
7876 else {
7877 NEXT_OFF(scan) = val - scan;
7878 }
7879}
7880
07be1b83 7881#ifdef DEBUGGING
3dab1dad
YO
7882/*
7883- regtail_study - set the next-pointer at the end of a node chain of p to val.
7884- Look for optimizable sequences at the same time.
7885- currently only looks for EXACT chains.
07be1b83
YO
7886
7887This is expermental code. The idea is to use this routine to perform
7888in place optimizations on branches and groups as they are constructed,
7889with the long term intention of removing optimization from study_chunk so
7890that it is purely analytical.
7891
7892Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7893to control which is which.
7894
3dab1dad
YO
7895*/
7896/* TODO: All four parms should be const */
07be1b83 7897
3dab1dad
YO
7898STATIC U8
7899S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7900{
7901 dVAR;
7902 register regnode *scan;
07be1b83
YO
7903 U8 exact = PSEUDO;
7904#ifdef EXPERIMENTAL_INPLACESCAN
7905 I32 min = 0;
7906#endif
7907
3dab1dad
YO
7908 GET_RE_DEBUG_FLAGS_DECL;
7909
07be1b83 7910
3dab1dad
YO
7911 if (SIZE_ONLY)
7912 return exact;
7913
7914 /* Find last node. */
7915
7916 scan = p;
7917 for (;;) {
7918 regnode * const temp = regnext(scan);
07be1b83
YO
7919#ifdef EXPERIMENTAL_INPLACESCAN
7920 if (PL_regkind[OP(scan)] == EXACT)
7921 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7922 return EXACT;
7923#endif
3dab1dad
YO
7924 if ( exact ) {
7925 switch (OP(scan)) {
7926 case EXACT:
7927 case EXACTF:
7928 case EXACTFL:
7929 if( exact == PSEUDO )
7930 exact= OP(scan);
07be1b83
YO
7931 else if ( exact != OP(scan) )
7932 exact= 0;
3dab1dad
YO
7933 case NOTHING:
7934 break;
7935 default:
7936 exact= 0;
7937 }
7938 }
7939 DEBUG_PARSE_r({
7940 SV * const mysv=sv_newmortal();
7941 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7942 regprop(RExC_rx, mysv, scan);
eaf3ca90 7943 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7944 SvPV_nolen_const(mysv),
eaf3ca90
YO
7945 REG_NODE_NUM(scan),
7946 reg_name[exact]);
3dab1dad 7947 });
a0d0e21e
LW
7948 if (temp == NULL)
7949 break;
7950 scan = temp;
7951 }
07be1b83
YO
7952 DEBUG_PARSE_r({
7953 SV * const mysv_val=sv_newmortal();
7954 DEBUG_PARSE_MSG("");
7955 regprop(RExC_rx, mysv_val, val);
7956 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7957 SvPV_nolen_const(mysv_val),
7958 REG_NODE_NUM(val),
7959 val - scan
7960 );
7961 });
c277df42
IZ
7962 if (reg_off_by_arg[OP(scan)]) {
7963 ARG_SET(scan, val - scan);
a0ed51b3
LW
7964 }
7965 else {
c277df42
IZ
7966 NEXT_OFF(scan) = val - scan;
7967 }
3dab1dad
YO
7968
7969 return exact;
a687059c 7970}
07be1b83 7971#endif
a687059c
LW
7972
7973/*
a687059c
LW
7974 - regcurly - a little FSA that accepts {\d+,?\d*}
7975 */
79072805 7976STATIC I32
5f66b61c 7977S_regcurly(register const char *s)
a687059c
LW
7978{
7979 if (*s++ != '{')
7980 return FALSE;
f0fcb552 7981 if (!isDIGIT(*s))
a687059c 7982 return FALSE;
f0fcb552 7983 while (isDIGIT(*s))
a687059c
LW
7984 s++;
7985 if (*s == ',')
7986 s++;
f0fcb552 7987 while (isDIGIT(*s))
a687059c
LW
7988 s++;
7989 if (*s != '}')
7990 return FALSE;
7991 return TRUE;
7992}
7993
a687059c
LW
7994
7995/*
fd181c75 7996 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7997 */
7998void
097eb12c 7999Perl_regdump(pTHX_ const regexp *r)
a687059c 8000{
35ff7856 8001#ifdef DEBUGGING
97aff369 8002 dVAR;
c445ea15 8003 SV * const sv = sv_newmortal();
ab3bbdeb 8004 SV *dsv= sv_newmortal();
a687059c 8005
786e8c11 8006 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8007
8008 /* Header fields of interest. */
ab3bbdeb
YO
8009 if (r->anchored_substr) {
8010 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8011 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8012 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8013 "anchored %s%s at %"IVdf" ",
8014 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8015 (IV)r->anchored_offset);
ab3bbdeb
YO
8016 } else if (r->anchored_utf8) {
8017 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8018 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8019 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8020 "anchored utf8 %s%s at %"IVdf" ",
8021 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8022 (IV)r->anchored_offset);
ab3bbdeb
YO
8023 }
8024 if (r->float_substr) {
8025 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8026 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8027 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8028 "floating %s%s at %"IVdf"..%"UVuf" ",
8029 s, RE_SV_TAIL(r->float_substr),
7b0972df 8030 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8031 } else if (r->float_utf8) {
8032 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8033 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8034 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8035 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8036 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8037 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8038 }
33b8afdf 8039 if (r->check_substr || r->check_utf8)
b81d288d 8040 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8041 (const char *)
8042 (r->check_substr == r->float_substr
8043 && r->check_utf8 == r->float_utf8
8044 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
8045 if (r->reganch & ROPT_NOSCAN)
8046 PerlIO_printf(Perl_debug_log, " noscan");
8047 if (r->reganch & ROPT_CHECK_ALL)
8048 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8049 if (r->check_substr || r->check_utf8)
c277df42
IZ
8050 PerlIO_printf(Perl_debug_log, ") ");
8051
46fc3d4c 8052 if (r->regstclass) {
32fc9b6a 8053 regprop(r, sv, r->regstclass);
1de06328 8054 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8055 }
774d564b 8056 if (r->reganch & ROPT_ANCH) {
8057 PerlIO_printf(Perl_debug_log, "anchored");
8058 if (r->reganch & ROPT_ANCH_BOL)
8059 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
8060 if (r->reganch & ROPT_ANCH_MBOL)
8061 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
8062 if (r->reganch & ROPT_ANCH_SBOL)
8063 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 8064 if (r->reganch & ROPT_ANCH_GPOS)
8065 PerlIO_printf(Perl_debug_log, "(GPOS)");
8066 PerlIO_putc(Perl_debug_log, ' ');
8067 }
c277df42
IZ
8068 if (r->reganch & ROPT_GPOS_SEEN)
8069 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 8070 if (r->reganch & ROPT_SKIP)
760ac839 8071 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 8072 if (r->reganch & ROPT_IMPLICIT)
760ac839 8073 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 8074 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
8075 if (r->reganch & ROPT_EVAL_SEEN)
8076 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8077 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8078#else
96a5add6 8079 PERL_UNUSED_CONTEXT;
65e66c80 8080 PERL_UNUSED_ARG(r);
17c3b450 8081#endif /* DEBUGGING */
a687059c
LW
8082}
8083
8084/*
a0d0e21e
LW
8085- regprop - printable representation of opcode
8086*/
46fc3d4c 8087void
32fc9b6a 8088Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8089{
35ff7856 8090#ifdef DEBUGGING
97aff369 8091 dVAR;
9b155405 8092 register int k;
1de06328 8093 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 8094
54dc92de 8095 sv_setpvn(sv, "", 0);
03363afd 8096 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8097 /* It would be nice to FAIL() here, but this may be called from
8098 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8099 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8100 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8101
3dab1dad 8102 k = PL_regkind[OP(o)];
9b155405 8103
2a782b5b 8104 if (k == EXACT) {
396482e1 8105 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8106 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8107 * is a crude hack but it may be the best for now since
8108 * we have no flag "this EXACTish node was UTF-8"
8109 * --jhi */
8110 const char * const s =
ddc5bc0f 8111 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8112 PL_colors[0], PL_colors[1],
8113 PERL_PV_ESCAPE_UNI_DETECT |
8114 PERL_PV_PRETTY_ELIPSES |
8115 PERL_PV_PRETTY_LTGT
8116 );
8117 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8118 } else if (k == TRIE) {
3dab1dad 8119 /* print the details of the trie in dumpuntil instead, as
4f639d21 8120 * prog->data isn't available here */
1de06328
YO
8121 const char op = OP(o);
8122 const I32 n = ARG(o);
8123 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
8124 (reg_ac_data *)prog->data->data[n] :
8125 NULL;
8126 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
8127 (reg_trie_data*)prog->data->data[n] :
8128 ac->trie;
8129
8130 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8131 DEBUG_TRIE_COMPILE_r(
8132 Perl_sv_catpvf(aTHX_ sv,
8133 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8134 (UV)trie->startstate,
1e2e3d02 8135 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8136 (UV)trie->wordcount,
8137 (UV)trie->minlen,
8138 (UV)trie->maxlen,
8139 (UV)TRIE_CHARCOUNT(trie),
8140 (UV)trie->uniquecharcount
8141 )
8142 );
8143 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8144 int i;
8145 int rangestart = -1;
f46cb337 8146 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8147 Perl_sv_catpvf(aTHX_ sv, "[");
8148 for (i = 0; i <= 256; i++) {
8149 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8150 if (rangestart == -1)
8151 rangestart = i;
8152 } else if (rangestart != -1) {
8153 if (i <= rangestart + 3)
8154 for (; rangestart < i; rangestart++)
8155 put_byte(sv, rangestart);
8156 else {
8157 put_byte(sv, rangestart);
8158 sv_catpvs(sv, "-");
8159 put_byte(sv, i - 1);
8160 }
8161 rangestart = -1;
8162 }
8163 }
8164 Perl_sv_catpvf(aTHX_ sv, "]");
8165 }
8166
a3621e74 8167 } else if (k == CURLY) {
cb434fcc 8168 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8169 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8170 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8171 }
2c2d71f5
JH
8172 else if (k == WHILEM && o->flags) /* Ordinal/of */
8173 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
e2e6a0f1 8174 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT)
894356b3 8175 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8176 else if (k == GOSUB)
6bda09f9 8177 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8178 else if (k == VERB) {
8179 if (!o->flags)
8180 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
8181 (SV*)prog->data->data[ ARG( o ) ]);
8182 } else if (k == LOGICAL)
04ebc1ab 8183 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8184 else if (k == ANYOF) {
8185 int i, rangestart = -1;
2d03de9c 8186 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8187
8188 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8189 static const char * const anyofs[] = {
653099ff
GS
8190 "\\w",
8191 "\\W",
8192 "\\s",
8193 "\\S",
8194 "\\d",
8195 "\\D",
8196 "[:alnum:]",
8197 "[:^alnum:]",
8198 "[:alpha:]",
8199 "[:^alpha:]",
8200 "[:ascii:]",
8201 "[:^ascii:]",
8202 "[:ctrl:]",
8203 "[:^ctrl:]",
8204 "[:graph:]",
8205 "[:^graph:]",
8206 "[:lower:]",
8207 "[:^lower:]",
8208 "[:print:]",
8209 "[:^print:]",
8210 "[:punct:]",
8211 "[:^punct:]",
8212 "[:upper:]",
aaa51d5e 8213 "[:^upper:]",
653099ff 8214 "[:xdigit:]",
aaa51d5e
JF
8215 "[:^xdigit:]",
8216 "[:space:]",
8217 "[:^space:]",
8218 "[:blank:]",
8219 "[:^blank:]"
653099ff
GS
8220 };
8221
19860706 8222 if (flags & ANYOF_LOCALE)
396482e1 8223 sv_catpvs(sv, "{loc}");
19860706 8224 if (flags & ANYOF_FOLD)
396482e1 8225 sv_catpvs(sv, "{i}");
653099ff 8226 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8227 if (flags & ANYOF_INVERT)
396482e1 8228 sv_catpvs(sv, "^");
ffc61ed2
JH
8229 for (i = 0; i <= 256; i++) {
8230 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8231 if (rangestart == -1)
8232 rangestart = i;
8233 } else if (rangestart != -1) {
8234 if (i <= rangestart + 3)
8235 for (; rangestart < i; rangestart++)
653099ff 8236 put_byte(sv, rangestart);
ffc61ed2
JH
8237 else {
8238 put_byte(sv, rangestart);
396482e1 8239 sv_catpvs(sv, "-");
ffc61ed2 8240 put_byte(sv, i - 1);
653099ff 8241 }
ffc61ed2 8242 rangestart = -1;
653099ff 8243 }
847a199f 8244 }
ffc61ed2
JH
8245
8246 if (o->flags & ANYOF_CLASS)
bb7a0f54 8247 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8248 if (ANYOF_CLASS_TEST(o,i))
8249 sv_catpv(sv, anyofs[i]);
8250
8251 if (flags & ANYOF_UNICODE)
396482e1 8252 sv_catpvs(sv, "{unicode}");
1aa99e6b 8253 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8254 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8255
8256 {
8257 SV *lv;
32fc9b6a 8258 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8259
ffc61ed2
JH
8260 if (lv) {
8261 if (sw) {
89ebb4a3 8262 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8263
ffc61ed2 8264 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8265 uvchr_to_utf8(s, i);
ffc61ed2 8266
3568d838 8267 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8268 if (rangestart == -1)
8269 rangestart = i;
8270 } else if (rangestart != -1) {
ffc61ed2
JH
8271 if (i <= rangestart + 3)
8272 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8273 const U8 * const e = uvchr_to_utf8(s,rangestart);
8274 U8 *p;
8275 for(p = s; p < e; p++)
ffc61ed2
JH
8276 put_byte(sv, *p);
8277 }
8278 else {
2d03de9c
AL
8279 const U8 *e = uvchr_to_utf8(s,rangestart);
8280 U8 *p;
8281 for (p = s; p < e; p++)
ffc61ed2 8282 put_byte(sv, *p);
396482e1 8283 sv_catpvs(sv, "-");
2d03de9c
AL
8284 e = uvchr_to_utf8(s, i-1);
8285 for (p = s; p < e; p++)
1df70142 8286 put_byte(sv, *p);
ffc61ed2
JH
8287 }
8288 rangestart = -1;
8289 }
19860706 8290 }
ffc61ed2 8291
396482e1 8292 sv_catpvs(sv, "..."); /* et cetera */
19860706 8293 }
fde631ed 8294
ffc61ed2 8295 {
2e0de35c 8296 char *s = savesvpv(lv);
c445ea15 8297 char * const origs = s;
b81d288d 8298
3dab1dad
YO
8299 while (*s && *s != '\n')
8300 s++;
b81d288d 8301
ffc61ed2 8302 if (*s == '\n') {
2d03de9c 8303 const char * const t = ++s;
ffc61ed2
JH
8304
8305 while (*s) {
8306 if (*s == '\n')
8307 *s = ' ';
8308 s++;
8309 }
8310 if (s[-1] == ' ')
8311 s[-1] = 0;
8312
8313 sv_catpv(sv, t);
fde631ed 8314 }
b81d288d 8315
ffc61ed2 8316 Safefree(origs);
fde631ed
JH
8317 }
8318 }
653099ff 8319 }
ffc61ed2 8320
653099ff
GS
8321 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8322 }
9b155405 8323 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8324 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8325#else
96a5add6 8326 PERL_UNUSED_CONTEXT;
65e66c80
SP
8327 PERL_UNUSED_ARG(sv);
8328 PERL_UNUSED_ARG(o);
f9049ba1 8329 PERL_UNUSED_ARG(prog);
17c3b450 8330#endif /* DEBUGGING */
35ff7856 8331}
a687059c 8332
cad2e5aa
JH
8333SV *
8334Perl_re_intuit_string(pTHX_ regexp *prog)
8335{ /* Assume that RE_INTUIT is set */
97aff369 8336 dVAR;
a3621e74 8337 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8338 PERL_UNUSED_CONTEXT;
8339
a3621e74 8340 DEBUG_COMPILE_r(
cfd0369c 8341 {
2d03de9c 8342 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8343 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8344
8345 if (!PL_colorset) reginitcolors();
8346 PerlIO_printf(Perl_debug_log,
a0288114 8347 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8348 PL_colors[4],
8349 prog->check_substr ? "" : "utf8 ",
8350 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8351 s,
8352 PL_colors[1],
8353 (strlen(s) > 60 ? "..." : ""));
8354 } );
8355
33b8afdf 8356 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8357}
8358
84da74a7
YO
8359/*
8360 pregfree - free a regexp
8361
8362 See regdupe below if you change anything here.
8363*/
8364
2b69d0c2 8365void
864dbfa3 8366Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8367{
27da23d5 8368 dVAR;
0df25f3d 8369
fc32ee4a 8370 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8371
7821416a
IZ
8372 if (!r || (--r->refcnt > 0))
8373 return;
ab3bbdeb 8374 DEBUG_COMPILE_r({
0df25f3d
YO
8375 if (!PL_colorset)
8376 reginitcolors();
5c9f2f80
YO
8377 {
8378 SV *dsv= sv_newmortal();
ab3bbdeb
YO
8379 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8380 dsv, r->precomp, r->prelen, 60);
8381 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8382 PL_colors[4],PL_colors[5],s);
8383 }
9e55ce06 8384 });
cad2e5aa 8385
43c5f42d
NC
8386 /* gcov results gave these as non-null 100% of the time, so there's no
8387 optimisation in checking them before calling Safefree */
8388 Safefree(r->precomp);
8389 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8390 RX_MATCH_COPY_FREE(r);
f8c7b90f 8391#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8392 if (r->saved_copy)
8393 SvREFCNT_dec(r->saved_copy);
8394#endif
a193d654
GS
8395 if (r->substrs) {
8396 if (r->anchored_substr)
8397 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8398 if (r->anchored_utf8)
8399 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8400 if (r->float_substr)
8401 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8402 if (r->float_utf8)
8403 SvREFCNT_dec(r->float_utf8);
2779dcf1 8404 Safefree(r->substrs);
a193d654 8405 }
81714fb9
YO
8406 if (r->paren_names)
8407 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8408 if (r->data) {
8409 int n = r->data->count;
f3548bdc
DM
8410 PAD* new_comppad = NULL;
8411 PAD* old_comppad;
4026c95a 8412 PADOFFSET refcnt;
dfad63ad 8413
c277df42 8414 while (--n >= 0) {
261faec3 8415 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8416 switch (r->data->what[n]) {
8417 case 's':
81714fb9 8418 case 'S':
c277df42
IZ
8419 SvREFCNT_dec((SV*)r->data->data[n]);
8420 break;
653099ff
GS
8421 case 'f':
8422 Safefree(r->data->data[n]);
8423 break;
dfad63ad
HS
8424 case 'p':
8425 new_comppad = (AV*)r->data->data[n];
8426 break;
c277df42 8427 case 'o':
dfad63ad 8428 if (new_comppad == NULL)
cea2e8a9 8429 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8430 PAD_SAVE_LOCAL(old_comppad,
8431 /* Watch out for global destruction's random ordering. */
c445ea15 8432 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8433 );
b34c0dd4 8434 OP_REFCNT_LOCK;
4026c95a
SH
8435 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8436 OP_REFCNT_UNLOCK;
8437 if (!refcnt)
9b978d73 8438 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8439
f3548bdc 8440 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8441 SvREFCNT_dec((SV*)new_comppad);
8442 new_comppad = NULL;
c277df42
IZ
8443 break;
8444 case 'n':
9e55ce06 8445 break;
07be1b83 8446 case 'T':
be8e71aa
YO
8447 { /* Aho Corasick add-on structure for a trie node.
8448 Used in stclass optimization only */
07be1b83
YO
8449 U32 refcount;
8450 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8451 OP_REFCNT_LOCK;
8452 refcount = --aho->refcount;
8453 OP_REFCNT_UNLOCK;
8454 if ( !refcount ) {
8455 Safefree(aho->states);
8456 Safefree(aho->fail);
8457 aho->trie=NULL; /* not necessary to free this as it is
8458 handled by the 't' case */
8459 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8460 Safefree(r->regstclass);
07be1b83
YO
8461 }
8462 }
8463 break;
a3621e74 8464 case 't':
07be1b83 8465 {
be8e71aa 8466 /* trie structure. */
07be1b83
YO
8467 U32 refcount;
8468 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8469 OP_REFCNT_LOCK;
8470 refcount = --trie->refcount;
8471 OP_REFCNT_UNLOCK;
8472 if ( !refcount ) {
8473 Safefree(trie->charmap);
8474 if (trie->widecharmap)
8475 SvREFCNT_dec((SV*)trie->widecharmap);
8476 Safefree(trie->states);
8477 Safefree(trie->trans);
8478 if (trie->bitmap)
8479 Safefree(trie->bitmap);
8480 if (trie->wordlen)
8481 Safefree(trie->wordlen);
786e8c11
YO
8482 if (trie->jump)
8483 Safefree(trie->jump);
8484 if (trie->nextword)
8485 Safefree(trie->nextword);
a3621e74 8486#ifdef DEBUGGING
5c9f2f80
YO
8487 if (trie->words)
8488 SvREFCNT_dec((SV*)trie->words);
8489 if (trie->revcharmap)
8490 SvREFCNT_dec((SV*)trie->revcharmap);
a3621e74 8491#endif
07be1b83 8492 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8493 }
07be1b83
YO
8494 }
8495 break;
c277df42 8496 default:
830247a4 8497 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8498 }
8499 }
8500 Safefree(r->data->what);
8501 Safefree(r->data);
a0d0e21e
LW
8502 }
8503 Safefree(r->startp);
8504 Safefree(r->endp);
8505 Safefree(r);
a687059c 8506}
c277df42 8507
84da74a7
YO
8508#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8509#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8510#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8511#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8512
8513/*
8514 regdupe - duplicate a regexp.
8515
8516 This routine is called by sv.c's re_dup and is expected to clone a
8517 given regexp structure. It is a no-op when not under USE_ITHREADS.
8518 (Originally this *was* re_dup() for change history see sv.c)
8519
8520 See pregfree() above if you change anything here.
8521*/
a3c0e9ca 8522#if defined(USE_ITHREADS)
84da74a7
YO
8523regexp *
8524Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8525{
84da74a7
YO
8526 dVAR;
8527 REGEXP *ret;
8528 int i, len, npar;
8529 struct reg_substr_datum *s;
8530
8531 if (!r)
8532 return (REGEXP *)NULL;
8533
8534 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8535 return ret;
8536
8537 len = r->offsets[0];
8538 npar = r->nparens+1;
8539
8540 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8541 Copy(r->program, ret->program, len+1, regnode);
8542
8543 Newx(ret->startp, npar, I32);
8544 Copy(r->startp, ret->startp, npar, I32);
8545 Newx(ret->endp, npar, I32);
8546 Copy(r->startp, ret->startp, npar, I32);
8547
8548 Newx(ret->substrs, 1, struct reg_substr_data);
8549 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8550 s->min_offset = r->substrs->data[i].min_offset;
8551 s->max_offset = r->substrs->data[i].max_offset;
8552 s->end_shift = r->substrs->data[i].end_shift;
8553 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8554 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8555 }
8556
8557 ret->regstclass = NULL;
8558 if (r->data) {
8559 struct reg_data *d;
8560 const int count = r->data->count;
8561 int i;
8562
8563 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8564 char, struct reg_data);
8565 Newx(d->what, count, U8);
8566
8567 d->count = count;
8568 for (i = 0; i < count; i++) {
8569 d->what[i] = r->data->what[i];
8570 switch (d->what[i]) {
e2e6a0f1 8571 /* legal options are one of: sSfpont
84da74a7
YO
8572 see also regcomp.h and pregfree() */
8573 case 's':
81714fb9 8574 case 'S':
84da74a7
YO
8575 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8576 break;
8577 case 'p':
8578 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8579 break;
8580 case 'f':
8581 /* This is cheating. */
8582 Newx(d->data[i], 1, struct regnode_charclass_class);
8583 StructCopy(r->data->data[i], d->data[i],
8584 struct regnode_charclass_class);
8585 ret->regstclass = (regnode*)d->data[i];
8586 break;
8587 case 'o':
8588 /* Compiled op trees are readonly, and can thus be
8589 shared without duplication. */
8590 OP_REFCNT_LOCK;
8591 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8592 OP_REFCNT_UNLOCK;
8593 break;
8594 case 'n':
8595 d->data[i] = r->data->data[i];
8596 break;
8597 case 't':
8598 d->data[i] = r->data->data[i];
8599 OP_REFCNT_LOCK;
8600 ((reg_trie_data*)d->data[i])->refcount++;
8601 OP_REFCNT_UNLOCK;
8602 break;
8603 case 'T':
8604 d->data[i] = r->data->data[i];
8605 OP_REFCNT_LOCK;
8606 ((reg_ac_data*)d->data[i])->refcount++;
8607 OP_REFCNT_UNLOCK;
8608 /* Trie stclasses are readonly and can thus be shared
8609 * without duplication. We free the stclass in pregfree
8610 * when the corresponding reg_ac_data struct is freed.
8611 */
8612 ret->regstclass= r->regstclass;
8613 break;
8614 default:
8615 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8616 }
8617 }
8618
8619 ret->data = d;
8620 }
8621 else
8622 ret->data = NULL;
8623
8624 Newx(ret->offsets, 2*len+1, U32);
8625 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8626
8627 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8628 ret->refcnt = r->refcnt;
8629 ret->minlen = r->minlen;
de8c5301 8630 ret->minlenret = r->minlenret;
84da74a7
YO
8631 ret->prelen = r->prelen;
8632 ret->nparens = r->nparens;
8633 ret->lastparen = r->lastparen;
8634 ret->lastcloseparen = r->lastcloseparen;
8635 ret->reganch = r->reganch;
8636
8637 ret->sublen = r->sublen;
8638
f9f4320a 8639 ret->engine = r->engine;
81714fb9
YO
8640
8641 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8642
84da74a7
YO
8643 if (RX_MATCH_COPIED(ret))
8644 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8645 else
8646 ret->subbeg = NULL;
8647#ifdef PERL_OLD_COPY_ON_WRITE
8648 ret->saved_copy = NULL;
8649#endif
8650
8651 ptr_table_store(PL_ptr_table, r, ret);
8652 return ret;
84da74a7 8653}
a3c0e9ca 8654#endif
84da74a7 8655
de8c5301
YO
8656/*
8657 reg_stringify()
8658
8659 converts a regexp embedded in a MAGIC struct to its stringified form,
8660 caching the converted form in the struct and returns the cached
8661 string.
8662
8663 If lp is nonnull then it is used to return the length of the
8664 resulting string
8665
8666 If flags is nonnull and the returned string contains UTF8 then
8667 (flags & 1) will be true.
8668
8669 If haseval is nonnull then it is used to return whether the pattern
8670 contains evals.
8671
8672 Normally called via macro:
8673
8674 CALLREG_STRINGIFY(mg,0,0);
8675
8676 And internally with
8677
8678 CALLREG_AS_STR(mg,lp,flags,haseval)
8679
8680 See sv_2pv_flags() in sv.c for an example of internal usage.
8681
8682 */
8683
8684char *
8685Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8686 dVAR;
8687 const regexp * const re = (regexp *)mg->mg_obj;
8688
8689 if (!mg->mg_ptr) {
8690 const char *fptr = "msix";
8691 char reflags[6];
8692 char ch;
8693 int left = 0;
8694 int right = 4;
8695 bool need_newline = 0;
8696 U16 reganch = (U16)((re->reganch & PMf_COMPILETIME) >> 12);
8697
8698 while((ch = *fptr++)) {
8699 if(reganch & 1) {
8700 reflags[left++] = ch;
8701 }
8702 else {
8703 reflags[right--] = ch;
8704 }
8705 reganch >>= 1;
8706 }
8707 if(left != 4) {
8708 reflags[left] = '-';
8709 left = 5;
8710 }
8711
8712 mg->mg_len = re->prelen + 4 + left;
8713 /*
8714 * If /x was used, we have to worry about a regex ending with a
8715 * comment later being embedded within another regex. If so, we don't
8716 * want this regex's "commentization" to leak out to the right part of
8717 * the enclosing regex, we must cap it with a newline.
8718 *
8719 * So, if /x was used, we scan backwards from the end of the regex. If
8720 * we find a '#' before we find a newline, we need to add a newline
8721 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8722 * we don't need to add anything. -jfriedl
8723 */
8724 if (PMf_EXTENDED & re->reganch) {
8725 const char *endptr = re->precomp + re->prelen;
8726 while (endptr >= re->precomp) {
8727 const char c = *(endptr--);
8728 if (c == '\n')
8729 break; /* don't need another */
8730 if (c == '#') {
8731 /* we end while in a comment, so we need a newline */
8732 mg->mg_len++; /* save space for it */
8733 need_newline = 1; /* note to add it */
8734 break;
8735 }
8736 }
8737 }
8738
8739 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8740 mg->mg_ptr[0] = '(';
8741 mg->mg_ptr[1] = '?';
8742 Copy(reflags, mg->mg_ptr+2, left, char);
8743 *(mg->mg_ptr+left+2) = ':';
8744 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8745 if (need_newline)
8746 mg->mg_ptr[mg->mg_len - 2] = '\n';
8747 mg->mg_ptr[mg->mg_len - 1] = ')';
8748 mg->mg_ptr[mg->mg_len] = 0;
8749 }
8750 if (haseval)
8751 *haseval = re->program[0].next_off;
8752 if (flags)
8753 *flags = ((re->reganch & ROPT_UTF8) ? 1 : 0);
8754
8755 if (lp)
8756 *lp = mg->mg_len;
8757 return mg->mg_ptr;
8758}
8759
8760
76234dfb 8761#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8762/*
8763 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8764 */
8765regnode *
864dbfa3 8766Perl_regnext(pTHX_ register regnode *p)
c277df42 8767{
97aff369 8768 dVAR;
c277df42
IZ
8769 register I32 offset;
8770
3280af22 8771 if (p == &PL_regdummy)
c277df42
IZ
8772 return(NULL);
8773
8774 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8775 if (offset == 0)
8776 return(NULL);
8777
c277df42 8778 return(p+offset);
c277df42 8779}
76234dfb 8780#endif
c277df42 8781
01f988be 8782STATIC void
cea2e8a9 8783S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8784{
8785 va_list args;
8786 STRLEN l1 = strlen(pat1);
8787 STRLEN l2 = strlen(pat2);
8788 char buf[512];
06bf62c7 8789 SV *msv;
73d840c0 8790 const char *message;
c277df42
IZ
8791
8792 if (l1 > 510)
8793 l1 = 510;
8794 if (l1 + l2 > 510)
8795 l2 = 510 - l1;
8796 Copy(pat1, buf, l1 , char);
8797 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8798 buf[l1 + l2] = '\n';
8799 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8800#ifdef I_STDARG
8801 /* ANSI variant takes additional second argument */
c277df42 8802 va_start(args, pat2);
8736538c
AS
8803#else
8804 va_start(args);
8805#endif
5a844595 8806 msv = vmess(buf, &args);
c277df42 8807 va_end(args);
cfd0369c 8808 message = SvPV_const(msv,l1);
c277df42
IZ
8809 if (l1 > 512)
8810 l1 = 512;
8811 Copy(message, buf, l1 , char);
197cf9b9 8812 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8813 Perl_croak(aTHX_ "%s", buf);
c277df42 8814}
a0ed51b3
LW
8815
8816/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8817
76234dfb 8818#ifndef PERL_IN_XSUB_RE
a0ed51b3 8819void
864dbfa3 8820Perl_save_re_context(pTHX)
b81d288d 8821{
97aff369 8822 dVAR;
1ade1aa1
NC
8823
8824 struct re_save_state *state;
8825
8826 SAVEVPTR(PL_curcop);
8827 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8828
8829 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8830 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8831 SSPUSHINT(SAVEt_RE_STATE);
8832
46ab3289 8833 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8834
a0ed51b3 8835 PL_reg_start_tmp = 0;
a0ed51b3 8836 PL_reg_start_tmpl = 0;
c445ea15 8837 PL_reg_oldsaved = NULL;
a5db57d6 8838 PL_reg_oldsavedlen = 0;
a5db57d6 8839 PL_reg_maxiter = 0;
a5db57d6 8840 PL_reg_leftiter = 0;
c445ea15 8841 PL_reg_poscache = NULL;
a5db57d6 8842 PL_reg_poscache_size = 0;
1ade1aa1
NC
8843#ifdef PERL_OLD_COPY_ON_WRITE
8844 PL_nrs = NULL;
8845#endif
ada6e8a9 8846
c445ea15
AL
8847 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8848 if (PL_curpm) {
8849 const REGEXP * const rx = PM_GETRE(PL_curpm);
8850 if (rx) {
1df70142 8851 U32 i;
ada6e8a9 8852 for (i = 1; i <= rx->nparens; i++) {
1df70142 8853 char digits[TYPE_CHARS(long)];
d9fad198 8854 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8855 GV *const *const gvp
8856 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8857
b37c2d43
AL
8858 if (gvp) {
8859 GV * const gv = *gvp;
8860 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8861 save_scalar(gv);
49f27e4b 8862 }
ada6e8a9
AMS
8863 }
8864 }
8865 }
a0ed51b3 8866}
76234dfb 8867#endif
51371543 8868
51371543 8869static void
acfe0abc 8870clear_re(pTHX_ void *r)
51371543 8871{
97aff369 8872 dVAR;
51371543
GS
8873 ReREFCNT_dec((regexp *)r);
8874}
ffbc6a93 8875
a28509cc
AL
8876#ifdef DEBUGGING
8877
8878STATIC void
8879S_put_byte(pTHX_ SV *sv, int c)
8880{
8881 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8882 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8883 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8884 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8885 else
8886 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8887}
8888
786e8c11 8889
3dab1dad
YO
8890#define CLEAR_OPTSTART \
8891 if (optstart) STMT_START { \
07be1b83 8892 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8893 optstart=NULL; \
8894 } STMT_END
8895
786e8c11 8896#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8897
b5a2f8d8
NC
8898STATIC const regnode *
8899S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8900 const regnode *last, const regnode *plast,
8901 SV* sv, I32 indent, U32 depth)
a28509cc 8902{
97aff369 8903 dVAR;
786e8c11 8904 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8905 register const regnode *next;
3dab1dad
YO
8906 const regnode *optstart= NULL;
8907 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8908
786e8c11
YO
8909#ifdef DEBUG_DUMPUNTIL
8910 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8911 last ? last-start : 0,plast ? plast-start : 0);
8912#endif
8913
8914 if (plast && plast < last)
8915 last= plast;
8916
8917 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8918 /* While that wasn't END last time... */
8919
8920 NODE_ALIGN(node);
8921 op = OP(node);
8922 if (op == CLOSE)
786e8c11 8923 indent--;
b5a2f8d8 8924 next = regnext((regnode *)node);
07be1b83 8925
a28509cc 8926 /* Where, what. */
8e11feef 8927 if (OP(node) == OPTIMIZED) {
e68ec53f 8928 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8929 optstart = node;
3dab1dad 8930 else
8e11feef 8931 goto after_print;
3dab1dad
YO
8932 } else
8933 CLEAR_OPTSTART;
07be1b83 8934
32fc9b6a 8935 regprop(r, sv, node);
a28509cc 8936 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8937 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8938
8939 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8940 if (next == NULL) /* Next ptr. */
8941 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8942 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8943 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8944 else
8945 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8946
1de06328 8947 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8948 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8949 }
8950
a28509cc
AL
8951 after_print:
8952 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8953 assert(next);
8954 {
8955 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8956 ? regnext((regnode *)next)
8957 : next);
be8e71aa
YO
8958 if (last && nnode > last)
8959 nnode = last;
786e8c11 8960 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8961 }
a28509cc
AL
8962 }
8963 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8964 assert(next);
786e8c11 8965 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8966 }
8967 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 8968 const regnode *this_trie = node;
1de06328 8969 const char op = OP(node);
a28509cc 8970 const I32 n = ARG(node);
1de06328
YO
8971 const reg_ac_data * const ac = op>=AHOCORASICK ?
8972 (reg_ac_data *)r->data->data[n] :
8973 NULL;
8974 const reg_trie_data * const trie = op<AHOCORASICK ?
8975 (reg_trie_data*)r->data->data[n] :
8976 ac->trie;
786e8c11 8977 const regnode *nextbranch= NULL;
a28509cc 8978 I32 word_idx;
1de06328 8979 sv_setpvn(sv, "", 0);
786e8c11 8980 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8981 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8982
8983 PerlIO_printf(Perl_debug_log, "%*s%s ",
8984 (int)(2*(indent+3)), "",
8985 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8986 PL_colors[0], PL_colors[1],
8987 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8988 PERL_PV_PRETTY_ELIPSES |
7f69552c 8989 PERL_PV_PRETTY_LTGT
786e8c11
YO
8990 )
8991 : "???"
8992 );
8993 if (trie->jump) {
40d049e4 8994 U16 dist= trie->jump[word_idx+1];
7f69552c 8995 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 8996 (dist ? this_trie + dist : next) - start);
786e8c11
YO
8997 if (dist) {
8998 if (!nextbranch)
24b23f37 8999 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9000 DUMPUNTIL(this_trie + dist, nextbranch);
9001 }
786e8c11
YO
9002 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9003 nextbranch= regnext((regnode *)nextbranch);
9004 } else {
9005 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9006 }
786e8c11
YO
9007 }
9008 if (last && next > last)
9009 node= last;
9010 else
9011 node= next;
a28509cc 9012 }
786e8c11
YO
9013 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9014 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9015 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9016 }
9017 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9018 assert(next);
786e8c11 9019 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9020 }
9021 else if ( op == PLUS || op == STAR) {
786e8c11 9022 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9023 }
9024 else if (op == ANYOF) {
9025 /* arglen 1 + class block */
9026 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9027 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9028 node = NEXTOPER(node);
9029 }
9030 else if (PL_regkind[(U8)op] == EXACT) {
9031 /* Literal string, where present. */
9032 node += NODE_SZ_STR(node) - 1;
9033 node = NEXTOPER(node);
9034 }
9035 else {
9036 node = NEXTOPER(node);
9037 node += regarglen[(U8)op];
9038 }
9039 if (op == CURLYX || op == OPEN)
786e8c11 9040 indent++;
a28509cc 9041 else if (op == WHILEM)
786e8c11 9042 indent--;
a28509cc 9043 }
3dab1dad 9044 CLEAR_OPTSTART;
786e8c11
YO
9045#ifdef DEBUG_DUMPUNTIL
9046 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
9047#endif
1de06328 9048 return node;
a28509cc
AL
9049}
9050
9051#endif /* DEBUGGING */
9052
241d1a3b
NC
9053/*
9054 * Local variables:
9055 * c-indentation-style: bsd
9056 * c-basic-offset: 4
9057 * indent-tabs-mode: t
9058 * End:
9059 *
37442d52
RGS
9060 * ex: set ts=8 sts=4 sw=4 noet:
9061 */