This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
assert() that SvPV_set() isn't used on arrays.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
40d049e4
YO
120 regnode **open_parens; /* pointers to open parens */
121 regnode **close_parens; /* pointers to close parens */
122 regnode *opend; /* END node in program */
1aa99e6b 123 I32 utf8;
6bda09f9 124 HV *charnames; /* cache of named sequences */
81714fb9 125 HV *paren_names; /* Paren names */
40d049e4
YO
126 regnode **recurse; /* Recurse regops */
127 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
128#if ADD_TO_REGEXEC
129 char *starttry; /* -Dr: where regtry was called. */
130#define RExC_starttry (pRExC_state->starttry)
131#endif
3dab1dad 132#ifdef DEBUGGING
be8e71aa 133 const char *lastparse;
3dab1dad
YO
134 I32 lastnum;
135#define RExC_lastparse (pRExC_state->lastparse)
136#define RExC_lastnum (pRExC_state->lastnum)
137#endif
830247a4
IZ
138} RExC_state_t;
139
e2509266 140#define RExC_flags (pRExC_state->flags)
830247a4
IZ
141#define RExC_precomp (pRExC_state->precomp)
142#define RExC_rx (pRExC_state->rx)
fac92740 143#define RExC_start (pRExC_state->start)
830247a4
IZ
144#define RExC_end (pRExC_state->end)
145#define RExC_parse (pRExC_state->parse)
146#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 147#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 148#define RExC_emit (pRExC_state->emit)
fac92740 149#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
150#define RExC_naughty (pRExC_state->naughty)
151#define RExC_sawback (pRExC_state->sawback)
152#define RExC_seen (pRExC_state->seen)
153#define RExC_size (pRExC_state->size)
154#define RExC_npar (pRExC_state->npar)
155#define RExC_extralen (pRExC_state->extralen)
156#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
157#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 158#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 159#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
160#define RExC_open_parens (pRExC_state->open_parens)
161#define RExC_close_parens (pRExC_state->close_parens)
162#define RExC_opend (pRExC_state->opend)
81714fb9 163#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
164#define RExC_recurse (pRExC_state->recurse)
165#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 166
a687059c
LW
167#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
168#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
169 ((*s) == '{' && regcurly(s)))
a687059c 170
35c8bce7
LW
171#ifdef SPSTART
172#undef SPSTART /* dratted cpp namespace... */
173#endif
a687059c
LW
174/*
175 * Flags to be passed up and down.
176 */
a687059c 177#define WORST 0 /* Worst case. */
821b33a5 178#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
179#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
180#define SPSTART 0x4 /* Starts with * or +. */
181#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 182
3dab1dad
YO
183#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
184
07be1b83
YO
185/* whether trie related optimizations are enabled */
186#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
187#define TRIE_STUDY_OPT
786e8c11 188#define FULL_TRIE_STUDY
07be1b83
YO
189#define TRIE_STCLASS
190#endif
1de06328
YO
191
192
40d049e4
YO
193
194#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
195#define PBITVAL(paren) (1 << ((paren) & 7))
196#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
197#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
198#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
199
200
1de06328
YO
201/* About scan_data_t.
202
203 During optimisation we recurse through the regexp program performing
204 various inplace (keyhole style) optimisations. In addition study_chunk
205 and scan_commit populate this data structure with information about
206 what strings MUST appear in the pattern. We look for the longest
207 string that must appear for at a fixed location, and we look for the
208 longest string that may appear at a floating location. So for instance
209 in the pattern:
210
211 /FOO[xX]A.*B[xX]BAR/
212
213 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
214 strings (because they follow a .* construct). study_chunk will identify
215 both FOO and BAR as being the longest fixed and floating strings respectively.
216
217 The strings can be composites, for instance
218
219 /(f)(o)(o)/
220
221 will result in a composite fixed substring 'foo'.
222
223 For each string some basic information is maintained:
224
225 - offset or min_offset
226 This is the position the string must appear at, or not before.
227 It also implicitly (when combined with minlenp) tells us how many
228 character must match before the string we are searching.
229 Likewise when combined with minlenp and the length of the string
230 tells us how many characters must appear after the string we have
231 found.
232
233 - max_offset
234 Only used for floating strings. This is the rightmost point that
235 the string can appear at. Ifset to I32 max it indicates that the
236 string can occur infinitely far to the right.
237
238 - minlenp
239 A pointer to the minimum length of the pattern that the string
240 was found inside. This is important as in the case of positive
241 lookahead or positive lookbehind we can have multiple patterns
242 involved. Consider
243
244 /(?=FOO).*F/
245
246 The minimum length of the pattern overall is 3, the minimum length
247 of the lookahead part is 3, but the minimum length of the part that
248 will actually match is 1. So 'FOO's minimum length is 3, but the
249 minimum length for the F is 1. This is important as the minimum length
250 is used to determine offsets in front of and behind the string being
251 looked for. Since strings can be composites this is the length of the
252 pattern at the time it was commited with a scan_commit. Note that
253 the length is calculated by study_chunk, so that the minimum lengths
254 are not known until the full pattern has been compiled, thus the
255 pointer to the value.
256
257 - lookbehind
258
259 In the case of lookbehind the string being searched for can be
260 offset past the start point of the final matching string.
261 If this value was just blithely removed from the min_offset it would
262 invalidate some of the calculations for how many chars must match
263 before or after (as they are derived from min_offset and minlen and
264 the length of the string being searched for).
265 When the final pattern is compiled and the data is moved from the
266 scan_data_t structure into the regexp structure the information
267 about lookbehind is factored in, with the information that would
268 have been lost precalculated in the end_shift field for the
269 associated string.
270
271 The fields pos_min and pos_delta are used to store the minimum offset
272 and the delta to the maximum offset at the current point in the pattern.
273
274*/
2c2d71f5
JH
275
276typedef struct scan_data_t {
1de06328
YO
277 /*I32 len_min; unused */
278 /*I32 len_delta; unused */
2c2d71f5
JH
279 I32 pos_min;
280 I32 pos_delta;
281 SV *last_found;
1de06328 282 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
283 I32 last_start_min;
284 I32 last_start_max;
1de06328
YO
285 SV **longest; /* Either &l_fixed, or &l_float. */
286 SV *longest_fixed; /* longest fixed string found in pattern */
287 I32 offset_fixed; /* offset where it starts */
288 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
289 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
290 SV *longest_float; /* longest floating string found in pattern */
291 I32 offset_float_min; /* earliest point in string it can appear */
292 I32 offset_float_max; /* latest point in string it can appear */
293 I32 *minlen_float; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
295 I32 flags;
296 I32 whilem_c;
cb434fcc 297 I32 *last_closep;
653099ff 298 struct regnode_charclass_class *start_class;
2c2d71f5
JH
299} scan_data_t;
300
a687059c 301/*
e50aee73 302 * Forward declarations for pregcomp()'s friends.
a687059c 303 */
a0d0e21e 304
27da23d5 305static const scan_data_t zero_scan_data =
1de06328 306 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
307
308#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
309#define SF_BEFORE_SEOL 0x0001
310#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
311#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
312#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
313
09b7f37c
CB
314#ifdef NO_UNARY_PLUS
315# define SF_FIX_SHIFT_EOL (0+2)
316# define SF_FL_SHIFT_EOL (0+4)
317#else
318# define SF_FIX_SHIFT_EOL (+2)
319# define SF_FL_SHIFT_EOL (+4)
320#endif
c277df42
IZ
321
322#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
323#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
324
325#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
326#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
327#define SF_IS_INF 0x0040
328#define SF_HAS_PAR 0x0080
329#define SF_IN_PAR 0x0100
330#define SF_HAS_EVAL 0x0200
331#define SCF_DO_SUBSTR 0x0400
653099ff
GS
332#define SCF_DO_STCLASS_AND 0x0800
333#define SCF_DO_STCLASS_OR 0x1000
334#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 335#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 336
786e8c11
YO
337#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
338
07be1b83 339
eb160463 340#define UTF (RExC_utf8 != 0)
e2509266
JH
341#define LOC ((RExC_flags & PMf_LOCALE) != 0)
342#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 343
ffc61ed2 344#define OOB_UNICODE 12345678
93733859 345#define OOB_NAMEDCLASS -1
b8c5462f 346
a0ed51b3
LW
347#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
348#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
349
8615cb43 350
b45f050a
JF
351/* length of regex to show in messages that don't mark a position within */
352#define RegexLengthToShowInErrorMessages 127
353
354/*
355 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
356 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
357 * op/pragma/warn/regcomp.
358 */
7253e4e3
RK
359#define MARKER1 "<-- HERE" /* marker as it appears in the description */
360#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 361
7253e4e3 362#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
363
364/*
365 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
366 * arg. Show regex, up to a maximum length. If it's too long, chop and add
367 * "...".
368 */
ccb2c380 369#define FAIL(msg) STMT_START { \
bfed75c6 370 const char *ellipses = ""; \
ccb2c380
MP
371 IV len = RExC_end - RExC_precomp; \
372 \
373 if (!SIZE_ONLY) \
374 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
375 if (len > RegexLengthToShowInErrorMessages) { \
376 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
377 len = RegexLengthToShowInErrorMessages - 10; \
378 ellipses = "..."; \
379 } \
380 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
381 msg, (int)len, RExC_precomp, ellipses); \
382} STMT_END
8615cb43 383
b45f050a 384/*
b45f050a
JF
385 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
386 */
ccb2c380 387#define Simple_vFAIL(m) STMT_START { \
a28509cc 388 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
389 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
b45f050a
JF
392
393/*
394 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
395 */
ccb2c380
MP
396#define vFAIL(m) STMT_START { \
397 if (!SIZE_ONLY) \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 Simple_vFAIL(m); \
400} STMT_END
b45f050a
JF
401
402/*
403 * Like Simple_vFAIL(), but accepts two arguments.
404 */
ccb2c380 405#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
408 (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
413 */
ccb2c380
MP
414#define vFAIL2(m,a1) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL2(m, a1); \
418} STMT_END
b45f050a
JF
419
420
421/*
422 * Like Simple_vFAIL(), but accepts three arguments.
423 */
ccb2c380 424#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
432 */
ccb2c380
MP
433#define vFAIL3(m,a1,a2) STMT_START { \
434 if (!SIZE_ONLY) \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
436 Simple_vFAIL3(m, a1, a2); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Like Simple_vFAIL(), but accepts four arguments.
441 */
ccb2c380 442#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a 447
ccb2c380 448#define vWARN(loc,m) STMT_START { \
a28509cc 449 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
450 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
451 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454#define vWARNdep(loc,m) STMT_START { \
a28509cc 455 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
456 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
457 "%s" REPORT_LOCATION, \
458 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
460
461
462#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 463 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
464 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
465 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
466} STMT_END
467
468#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 469 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
470 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
471 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
9d1d55b5 485
8615cb43 486
cd439c50 487/* Allow for side effects in s */
ccb2c380
MP
488#define REGC(c,s) STMT_START { \
489 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
490} STMT_END
cd439c50 491
fac92740
MJD
492/* Macros for recording node offsets. 20001227 mjd@plover.com
493 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
494 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
495 * Element 0 holds the number n.
07be1b83 496 * Position is 1 indexed.
fac92740
MJD
497 */
498
ccb2c380
MP
499#define Set_Node_Offset_To_R(node,byte) STMT_START { \
500 if (! SIZE_ONLY) { \
501 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 502 __LINE__, (node), (int)(byte))); \
ccb2c380 503 if((node) < 0) { \
551405c4 504 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
505 } else { \
506 RExC_offsets[2*(node)-1] = (byte); \
507 } \
508 } \
509} STMT_END
510
511#define Set_Node_Offset(node,byte) \
512 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
513#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
514
515#define Set_Node_Length_To_R(node,len) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 518 __LINE__, (int)(node), (int)(len))); \
ccb2c380 519 if((node) < 0) { \
551405c4 520 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
521 } else { \
522 RExC_offsets[2*(node)] = (len); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Length(node,len) \
528 Set_Node_Length_To_R((node)-RExC_emit_start, len)
529#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
530#define Set_Node_Cur_Length(node) \
531 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
532
533/* Get offsets and lengths */
534#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
535#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
536
07be1b83
YO
537#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
538 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
539 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
540} STMT_END
541
542
543#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
544#define EXPERIMENTAL_INPLACESCAN
545#endif
546
1de06328 547#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 548DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
549 PerlIO_printf(Perl_debug_log, \
550 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
551 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
552 (int)(depth)*2, "", \
553 (IV)((data)->pos_min), \
554 (IV)((data)->pos_delta), \
555 (IV)((data)->flags), \
556 (IV)((data)->whilem_c), \
557 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
558 ); \
559 if ((data)->last_found) \
560 PerlIO_printf(Perl_debug_log, \
561 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
562 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
563 SvPVX_const((data)->last_found), \
564 (IV)((data)->last_end), \
565 (IV)((data)->last_start_min), \
566 (IV)((data)->last_start_max), \
567 ((data)->longest && \
568 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
569 SvPVX_const((data)->longest_fixed), \
570 (IV)((data)->offset_fixed), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_float)) ? "*" : "", \
573 SvPVX_const((data)->longest_float), \
574 (IV)((data)->offset_float_min), \
575 (IV)((data)->offset_float_max) \
576 ); \
577 PerlIO_printf(Perl_debug_log,"\n"); \
578});
579
acfe0abc 580static void clear_re(pTHX_ void *r);
4327152a 581
653099ff 582/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 583 Update the longest found anchored substring and the longest found
653099ff
GS
584 floating substrings if needed. */
585
4327152a 586STATIC void
1de06328 587S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 588{
e1ec3a88
AL
589 const STRLEN l = CHR_SVLEN(data->last_found);
590 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 591 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 592
c277df42 593 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 594 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
595 if (*data->longest == data->longest_fixed) {
596 data->offset_fixed = l ? data->last_start_min : data->pos_min;
597 if (data->flags & SF_BEFORE_EOL)
b81d288d 598 data->flags
c277df42
IZ
599 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
600 else
601 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
602 data->minlen_fixed=minlenp;
603 data->lookbehind_fixed=0;
a0ed51b3
LW
604 }
605 else {
c277df42 606 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
607 data->offset_float_max = (l
608 ? data->last_start_max
c277df42 609 : data->pos_min + data->pos_delta);
9051bda5
HS
610 if ((U32)data->offset_float_max > (U32)I32_MAX)
611 data->offset_float_max = I32_MAX;
c277df42 612 if (data->flags & SF_BEFORE_EOL)
b81d288d 613 data->flags
c277df42
IZ
614 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
615 else
616 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
617 data->minlen_float=minlenp;
618 data->lookbehind_float=0;
c277df42
IZ
619 }
620 }
621 SvCUR_set(data->last_found, 0);
0eda9292 622 {
a28509cc 623 SV * const sv = data->last_found;
097eb12c
AL
624 if (SvUTF8(sv) && SvMAGICAL(sv)) {
625 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
626 if (mg)
627 mg->mg_len = 0;
628 }
0eda9292 629 }
c277df42
IZ
630 data->last_end = -1;
631 data->flags &= ~SF_BEFORE_EOL;
1de06328 632 DEBUG_STUDYDATA(data,0);
c277df42
IZ
633}
634
653099ff
GS
635/* Can match anything (initialization) */
636STATIC void
097eb12c 637S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 638{
653099ff 639 ANYOF_CLASS_ZERO(cl);
f8bef550 640 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 641 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
642 if (LOC)
643 cl->flags |= ANYOF_LOCALE;
644}
645
646/* Can match anything (initialization) */
647STATIC int
5f66b61c 648S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
649{
650 int value;
651
aaa51d5e 652 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
653 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
654 return 1;
1aa99e6b
IH
655 if (!(cl->flags & ANYOF_UNICODE_ALL))
656 return 0;
10edeb5d 657 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 658 return 0;
653099ff
GS
659 return 1;
660}
661
662/* Can match anything (initialization) */
663STATIC void
097eb12c 664S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 665{
8ecf7187 666 Zero(cl, 1, struct regnode_charclass_class);
653099ff 667 cl->type = ANYOF;
830247a4 668 cl_anything(pRExC_state, cl);
653099ff
GS
669}
670
671STATIC void
097eb12c 672S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 673{
8ecf7187 674 Zero(cl, 1, struct regnode_charclass_class);
653099ff 675 cl->type = ANYOF;
830247a4 676 cl_anything(pRExC_state, cl);
653099ff
GS
677 if (LOC)
678 cl->flags |= ANYOF_LOCALE;
679}
680
681/* 'And' a given class with another one. Can create false positives */
682/* We assume that cl is not inverted */
683STATIC void
5f66b61c 684S_cl_and(struct regnode_charclass_class *cl,
a28509cc 685 const struct regnode_charclass_class *and_with)
653099ff 686{
40d049e4
YO
687
688 assert(and_with->type == ANYOF);
653099ff
GS
689 if (!(and_with->flags & ANYOF_CLASS)
690 && !(cl->flags & ANYOF_CLASS)
691 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
692 && !(and_with->flags & ANYOF_FOLD)
693 && !(cl->flags & ANYOF_FOLD)) {
694 int i;
695
696 if (and_with->flags & ANYOF_INVERT)
697 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
698 cl->bitmap[i] &= ~and_with->bitmap[i];
699 else
700 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
701 cl->bitmap[i] &= and_with->bitmap[i];
702 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
703 if (!(and_with->flags & ANYOF_EOS))
704 cl->flags &= ~ANYOF_EOS;
1aa99e6b 705
14ebb1a2
JH
706 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
707 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
708 cl->flags &= ~ANYOF_UNICODE_ALL;
709 cl->flags |= ANYOF_UNICODE;
710 ARG_SET(cl, ARG(and_with));
711 }
14ebb1a2
JH
712 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
713 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 714 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
715 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
716 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 717 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
718}
719
720/* 'OR' a given class with another one. Can create false positives */
721/* We assume that cl is not inverted */
722STATIC void
097eb12c 723S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 724{
653099ff
GS
725 if (or_with->flags & ANYOF_INVERT) {
726 /* We do not use
727 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
728 * <= (B1 | !B2) | (CL1 | !CL2)
729 * which is wasteful if CL2 is small, but we ignore CL2:
730 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
731 * XXXX Can we handle case-fold? Unclear:
732 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
733 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
734 */
735 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
736 && !(or_with->flags & ANYOF_FOLD)
737 && !(cl->flags & ANYOF_FOLD) ) {
738 int i;
739
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] |= ~or_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise */
743 else {
830247a4 744 cl_anything(pRExC_state, cl);
653099ff
GS
745 }
746 } else {
747 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 749 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
750 || (cl->flags & ANYOF_FOLD)) ) {
751 int i;
752
753 /* OR char bitmap and class bitmap separately */
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] |= or_with->bitmap[i];
756 if (or_with->flags & ANYOF_CLASS) {
757 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
758 cl->classflags[i] |= or_with->classflags[i];
759 cl->flags |= ANYOF_CLASS;
760 }
761 }
762 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 763 cl_anything(pRExC_state, cl);
653099ff
GS
764 }
765 }
766 if (or_with->flags & ANYOF_EOS)
767 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
768
769 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
770 ARG(cl) != ARG(or_with)) {
771 cl->flags |= ANYOF_UNICODE_ALL;
772 cl->flags &= ~ANYOF_UNICODE;
773 }
774 if (or_with->flags & ANYOF_UNICODE_ALL) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
653099ff
GS
778}
779
a3621e74
YO
780#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784
3dab1dad
YO
785
786#ifdef DEBUGGING
07be1b83 787/*
3dab1dad
YO
788 dump_trie(trie)
789 dump_trie_interim_list(trie,next_alloc)
790 dump_trie_interim_table(trie,next_alloc)
791
792 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
793 The _interim_ variants are used for debugging the interim
794 tables that are used to generate the final compressed
795 representation which is what dump_trie expects.
796
3dab1dad
YO
797 Part of the reason for their existance is to provide a form
798 of documentation as to how the different representations function.
07be1b83
YO
799
800*/
3dab1dad
YO
801
802/*
803 dump_trie(trie)
804 Dumps the final compressed table form of the trie to Perl_debug_log.
805 Used for debugging make_trie().
806*/
807
808STATIC void
809S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
810{
811 U32 state;
ab3bbdeb
YO
812 SV *sv=sv_newmortal();
813 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
814 GET_RE_DEBUG_FLAGS_DECL;
815
ab3bbdeb 816
3dab1dad
YO
817 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
818 (int)depth * 2 + 2,"",
819 "Match","Base","Ofs" );
820
821 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 822 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 823 if ( tmp ) {
ab3bbdeb
YO
824 PerlIO_printf( Perl_debug_log, "%*s",
825 colwidth,
ddc5bc0f 826 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
827 PL_colors[0], PL_colors[1],
828 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
829 PERL_PV_ESCAPE_FIRSTCHAR
830 )
831 );
3dab1dad
YO
832 }
833 }
834 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
835 (int)depth * 2 + 2,"");
836
837 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 838 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
839 PerlIO_printf( Perl_debug_log, "\n");
840
1e2e3d02 841 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 842 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
843
844 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
845
846 if ( trie->states[ state ].wordnum ) {
847 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
848 } else {
849 PerlIO_printf( Perl_debug_log, "%6s", "" );
850 }
851
852 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
853
854 if ( base ) {
855 U32 ofs = 0;
856
857 while( ( base + ofs < trie->uniquecharcount ) ||
858 ( base + ofs - trie->uniquecharcount < trie->lasttrans
859 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
860 ofs++;
861
862 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
863
864 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
865 if ( ( base + ofs >= trie->uniquecharcount ) &&
866 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
867 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
868 {
ab3bbdeb
YO
869 PerlIO_printf( Perl_debug_log, "%*"UVXf,
870 colwidth,
3dab1dad
YO
871 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
872 } else {
ab3bbdeb 873 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
874 }
875 }
876
877 PerlIO_printf( Perl_debug_log, "]");
878
879 }
880 PerlIO_printf( Perl_debug_log, "\n" );
881 }
882}
883/*
884 dump_trie_interim_list(trie,next_alloc)
885 Dumps a fully constructed but uncompressed trie in list form.
886 List tries normally only are used for construction when the number of
887 possible chars (trie->uniquecharcount) is very high.
888 Used for debugging make_trie().
889*/
890STATIC void
891S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
892{
893 U32 state;
ab3bbdeb
YO
894 SV *sv=sv_newmortal();
895 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
896 GET_RE_DEBUG_FLAGS_DECL;
897 /* print out the table precompression. */
ab3bbdeb
YO
898 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
899 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
900 "------:-----+-----------------\n" );
3dab1dad
YO
901
902 for( state=1 ; state < next_alloc ; state ++ ) {
903 U16 charid;
904
ab3bbdeb 905 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
906 (int)depth * 2 + 2,"", (UV)state );
907 if ( ! trie->states[ state ].wordnum ) {
908 PerlIO_printf( Perl_debug_log, "%5s| ","");
909 } else {
910 PerlIO_printf( Perl_debug_log, "W%4x| ",
911 trie->states[ state ].wordnum
912 );
913 }
914 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 915 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
916 if ( tmp ) {
917 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
918 colwidth,
ddc5bc0f 919 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
920 PL_colors[0], PL_colors[1],
921 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
922 PERL_PV_ESCAPE_FIRSTCHAR
923 ) ,
1e2e3d02
YO
924 TRIE_LIST_ITEM(state,charid).forid,
925 (UV)TRIE_LIST_ITEM(state,charid).newstate
926 );
927 if (!(charid % 10))
664e119d
RGS
928 PerlIO_printf(Perl_debug_log, "\n%*s| ",
929 (int)((depth * 2) + 14), "");
1e2e3d02 930 }
ab3bbdeb
YO
931 }
932 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
933 }
934}
935
936/*
937 dump_trie_interim_table(trie,next_alloc)
938 Dumps a fully constructed but uncompressed trie in table form.
939 This is the normal DFA style state transition table, with a few
940 twists to facilitate compression later.
941 Used for debugging make_trie().
942*/
943STATIC void
944S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
945{
946 U32 state;
947 U16 charid;
ab3bbdeb
YO
948 SV *sv=sv_newmortal();
949 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
950 GET_RE_DEBUG_FLAGS_DECL;
951
952 /*
953 print out the table precompression so that we can do a visual check
954 that they are identical.
955 */
956
957 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
958
959 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 960 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 961 if ( tmp ) {
ab3bbdeb
YO
962 PerlIO_printf( Perl_debug_log, "%*s",
963 colwidth,
ddc5bc0f 964 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
965 PL_colors[0], PL_colors[1],
966 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
967 PERL_PV_ESCAPE_FIRSTCHAR
968 )
969 );
3dab1dad
YO
970 }
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
974
975 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 976 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
977 }
978
979 PerlIO_printf( Perl_debug_log, "\n" );
980
981 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
982
983 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
984 (int)depth * 2 + 2,"",
985 (UV)TRIE_NODENUM( state ) );
986
987 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
988 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
989 if (v)
990 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
991 else
992 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
993 }
994 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
995 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
996 } else {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
998 trie->states[ TRIE_NODENUM( state ) ].wordnum );
999 }
1000 }
07be1b83 1001}
3dab1dad
YO
1002
1003#endif
1004
786e8c11
YO
1005/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1006 startbranch: the first branch in the whole branch sequence
1007 first : start branch of sequence of branch-exact nodes.
1008 May be the same as startbranch
1009 last : Thing following the last branch.
1010 May be the same as tail.
1011 tail : item following the branch sequence
1012 count : words in the sequence
1013 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1014 depth : indent depth
3dab1dad 1015
786e8c11 1016Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1017
786e8c11
YO
1018A trie is an N'ary tree where the branches are determined by digital
1019decomposition of the key. IE, at the root node you look up the 1st character and
1020follow that branch repeat until you find the end of the branches. Nodes can be
1021marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1022
786e8c11 1023 /he|she|his|hers/
72f13be8 1024
786e8c11
YO
1025would convert into the following structure. Numbers represent states, letters
1026following numbers represent valid transitions on the letter from that state, if
1027the number is in square brackets it represents an accepting state, otherwise it
1028will be in parenthesis.
07be1b83 1029
786e8c11
YO
1030 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1031 | |
1032 | (2)
1033 | |
1034 (1) +-i->(6)-+-s->[7]
1035 |
1036 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1037
786e8c11
YO
1038 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1039
1040This shows that when matching against the string 'hers' we will begin at state 1
1041read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1042then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1043is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1044single traverse. We store a mapping from accepting to state to which word was
1045matched, and then when we have multiple possibilities we try to complete the
1046rest of the regex in the order in which they occured in the alternation.
1047
1048The only prior NFA like behaviour that would be changed by the TRIE support is
1049the silent ignoring of duplicate alternations which are of the form:
1050
1051 / (DUPE|DUPE) X? (?{ ... }) Y /x
1052
1053Thus EVAL blocks follwing a trie may be called a different number of times with
1054and without the optimisation. With the optimisations dupes will be silently
1055ignored. This inconsistant behaviour of EVAL type nodes is well established as
1056the following demonstrates:
1057
1058 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1059
1060which prints out 'word' three times, but
1061
1062 'words'=~/(word|word|word)(?{ print $1 })S/
1063
1064which doesnt print it out at all. This is due to other optimisations kicking in.
1065
1066Example of what happens on a structural level:
1067
1068The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1069
1070 1: CURLYM[1] {1,32767}(18)
1071 5: BRANCH(8)
1072 6: EXACT <ac>(16)
1073 8: BRANCH(11)
1074 9: EXACT <ad>(16)
1075 11: BRANCH(14)
1076 12: EXACT <ab>(16)
1077 16: SUCCEED(0)
1078 17: NOTHING(18)
1079 18: END(0)
1080
1081This would be optimizable with startbranch=5, first=5, last=16, tail=16
1082and should turn into:
1083
1084 1: CURLYM[1] {1,32767}(18)
1085 5: TRIE(16)
1086 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1087 <ac>
1088 <ad>
1089 <ab>
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094Cases where tail != last would be like /(?foo|bar)baz/:
1095
1096 1: BRANCH(4)
1097 2: EXACT <foo>(8)
1098 4: BRANCH(7)
1099 5: EXACT <bar>(8)
1100 7: TAIL(8)
1101 8: EXACT <baz>(10)
1102 10: END(0)
1103
1104which would be optimizable with startbranch=1, first=1, last=7, tail=8
1105and would end up looking like:
1106
1107 1: TRIE(8)
1108 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1109 <foo>
1110 <bar>
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115 d = uvuni_to_utf8_flags(d, uv, 0);
1116
1117is the recommended Unicode-aware way of saying
1118
1119 *(d++) = uv;
1120*/
1121
1e2e3d02 1122#define TRIE_STORE_REVCHAR \
786e8c11 1123 STMT_START { \
1e2e3d02 1124 SV *tmp = newSVpvs(""); \
786e8c11 1125 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1126 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1127 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1128 } STMT_END
1129
1130#define TRIE_READ_CHAR STMT_START { \
1131 wordlen++; \
1132 if ( UTF ) { \
1133 if ( folder ) { \
1134 if ( foldlen > 0 ) { \
1135 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1136 foldlen -= len; \
1137 scan += len; \
1138 len = 0; \
1139 } else { \
1140 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1141 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1142 foldlen -= UNISKIP( uvc ); \
1143 scan = foldbuf + UNISKIP( uvc ); \
1144 } \
1145 } else { \
1146 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1147 } \
1148 } else { \
1149 uvc = (U32)*uc; \
1150 len = 1; \
1151 } \
1152} STMT_END
1153
1154
1155
1156#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1157 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1158 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1159 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1160 } \
1161 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1162 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1163 TRIE_LIST_CUR( state )++; \
1164} STMT_END
07be1b83 1165
786e8c11
YO
1166#define TRIE_LIST_NEW(state) STMT_START { \
1167 Newxz( trie->states[ state ].trans.list, \
1168 4, reg_trie_trans_le ); \
1169 TRIE_LIST_CUR( state ) = 1; \
1170 TRIE_LIST_LEN( state ) = 4; \
1171} STMT_END
07be1b83 1172
786e8c11
YO
1173#define TRIE_HANDLE_WORD(state) STMT_START { \
1174 U16 dupe= trie->states[ state ].wordnum; \
1175 regnode * const noper_next = regnext( noper ); \
1176 \
1177 if (trie->wordlen) \
1178 trie->wordlen[ curword ] = wordlen; \
1179 DEBUG_r({ \
1180 /* store the word for dumping */ \
1181 SV* tmp; \
1182 if (OP(noper) != NOTHING) \
1183 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1184 else \
1185 tmp = newSVpvn( "", 0 ); \
1186 if ( UTF ) SvUTF8_on( tmp ); \
1187 av_push( trie->words, tmp ); \
1188 }); \
1189 \
1190 curword++; \
1191 \
1192 if ( noper_next < tail ) { \
1193 if (!trie->jump) \
1194 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1195 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1196 if (!jumper) \
1197 jumper = noper_next; \
1198 if (!nextbranch) \
1199 nextbranch= regnext(cur); \
1200 } \
1201 \
1202 if ( dupe ) { \
1203 /* So it's a dupe. This means we need to maintain a */\
1204 /* linked-list from the first to the next. */\
1205 /* we only allocate the nextword buffer when there */\
1206 /* a dupe, so first time we have to do the allocation */\
1207 if (!trie->nextword) \
1208 Newxz( trie->nextword, word_count + 1, U16); \
1209 while ( trie->nextword[dupe] ) \
1210 dupe= trie->nextword[dupe]; \
1211 trie->nextword[dupe]= curword; \
1212 } else { \
1213 /* we haven't inserted this word yet. */ \
1214 trie->states[ state ].wordnum = curword; \
1215 } \
1216} STMT_END
07be1b83 1217
3dab1dad 1218
786e8c11
YO
1219#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1220 ( ( base + charid >= ucharcount \
1221 && base + charid < ubound \
1222 && state == trie->trans[ base - ucharcount + charid ].check \
1223 && trie->trans[ base - ucharcount + charid ].next ) \
1224 ? trie->trans[ base - ucharcount + charid ].next \
1225 : ( state==1 ? special : 0 ) \
1226 )
3dab1dad 1227
786e8c11
YO
1228#define MADE_TRIE 1
1229#define MADE_JUMP_TRIE 2
1230#define MADE_EXACT_TRIE 4
3dab1dad 1231
a3621e74 1232STATIC I32
786e8c11 1233S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1234{
27da23d5 1235 dVAR;
a3621e74
YO
1236 /* first pass, loop through and scan words */
1237 reg_trie_data *trie;
1238 regnode *cur;
9f7f3913 1239 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1240 STRLEN len = 0;
1241 UV uvc = 0;
1242 U16 curword = 0;
1243 U32 next_alloc = 0;
786e8c11
YO
1244 regnode *jumper = NULL;
1245 regnode *nextbranch = NULL;
7f69552c 1246 regnode *convert = NULL;
a3621e74 1247 /* we just use folder as a flag in utf8 */
e1ec3a88 1248 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1249 ? PL_fold
1250 : ( flags == EXACTFL
1251 ? PL_fold_locale
1252 : NULL
1253 )
1254 );
1255
e1ec3a88 1256 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1257 SV *re_trie_maxbuff;
3dab1dad
YO
1258#ifndef DEBUGGING
1259 /* these are only used during construction but are useful during
8e11feef 1260 * debugging so we store them in the struct when debugging.
8e11feef 1261 */
3dab1dad 1262 STRLEN trie_charcount=0;
3dab1dad
YO
1263 AV *trie_revcharmap;
1264#endif
a3621e74 1265 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1266#ifndef DEBUGGING
1267 PERL_UNUSED_ARG(depth);
1268#endif
a3621e74 1269
a02a5408 1270 Newxz( trie, 1, reg_trie_data );
a3621e74 1271 trie->refcount = 1;
3dab1dad 1272 trie->startstate = 1;
786e8c11 1273 trie->wordcount = word_count;
a3621e74 1274 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1275 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1276 if (!(UTF && folder))
1277 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1278 DEBUG_r({
1279 trie->words = newAV();
a3621e74 1280 });
3dab1dad 1281 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1282
0111c4fd 1283 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1284 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1285 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1286 }
3dab1dad
YO
1287 DEBUG_OPTIMISE_r({
1288 PerlIO_printf( Perl_debug_log,
786e8c11 1289 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1290 (int)depth * 2 + 2, "",
1291 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1292 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1293 (int)depth);
3dab1dad 1294 });
7f69552c
YO
1295
1296 /* Find the node we are going to overwrite */
1297 if ( first == startbranch && OP( last ) != BRANCH ) {
1298 /* whole branch chain */
1299 convert = first;
1300 } else {
1301 /* branch sub-chain */
1302 convert = NEXTOPER( first );
1303 }
1304
a3621e74
YO
1305 /* -- First loop and Setup --
1306
1307 We first traverse the branches and scan each word to determine if it
1308 contains widechars, and how many unique chars there are, this is
1309 important as we have to build a table with at least as many columns as we
1310 have unique chars.
1311
1312 We use an array of integers to represent the character codes 0..255
1313 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1314 native representation of the character value as the key and IV's for the
1315 coded index.
1316
1317 *TODO* If we keep track of how many times each character is used we can
1318 remap the columns so that the table compression later on is more
1319 efficient in terms of memory by ensuring most common value is in the
1320 middle and the least common are on the outside. IMO this would be better
1321 than a most to least common mapping as theres a decent chance the most
1322 common letter will share a node with the least common, meaning the node
1323 will not be compressable. With a middle is most common approach the worst
1324 case is when we have the least common nodes twice.
1325
1326 */
1327
a3621e74 1328 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1329 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1330 const U8 *uc = (U8*)STRING( noper );
a28509cc 1331 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1332 STRLEN foldlen = 0;
1333 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1334 const U8 *scan = (U8*)NULL;
07be1b83 1335 U32 wordlen = 0; /* required init */
3dab1dad 1336 STRLEN chars=0;
a3621e74 1337
3dab1dad
YO
1338 if (OP(noper) == NOTHING) {
1339 trie->minlen= 0;
1340 continue;
1341 }
1342 if (trie->bitmap) {
1343 TRIE_BITMAP_SET(trie,*uc);
1344 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1345 }
a3621e74 1346 for ( ; uc < e ; uc += len ) {
3dab1dad 1347 TRIE_CHARCOUNT(trie)++;
a3621e74 1348 TRIE_READ_CHAR;
3dab1dad 1349 chars++;
a3621e74
YO
1350 if ( uvc < 256 ) {
1351 if ( !trie->charmap[ uvc ] ) {
1352 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1353 if ( folder )
1354 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1355 TRIE_STORE_REVCHAR;
a3621e74
YO
1356 }
1357 } else {
1358 SV** svpp;
1359 if ( !trie->widecharmap )
1360 trie->widecharmap = newHV();
1361
1362 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1363
1364 if ( !svpp )
e4584336 1365 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1366
1367 if ( !SvTRUE( *svpp ) ) {
1368 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 }
1372 }
3dab1dad
YO
1373 if( cur == first ) {
1374 trie->minlen=chars;
1375 trie->maxlen=chars;
1376 } else if (chars < trie->minlen) {
1377 trie->minlen=chars;
1378 } else if (chars > trie->maxlen) {
1379 trie->maxlen=chars;
1380 }
1381
a3621e74
YO
1382 } /* end first pass */
1383 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1384 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1385 (int)depth * 2 + 2,"",
85c3142d 1386 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1387 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1388 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1389 );
786e8c11 1390 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1391
1392 /*
1393 We now know what we are dealing with in terms of unique chars and
1394 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1395 representation using a flat table will take. If it's over a reasonable
1396 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1397 conservative but potentially much slower representation using an array
1398 of lists.
1399
1400 At the end we convert both representations into the same compressed
1401 form that will be used in regexec.c for matching with. The latter
1402 is a form that cannot be used to construct with but has memory
1403 properties similar to the list form and access properties similar
1404 to the table form making it both suitable for fast searches and
1405 small enough that its feasable to store for the duration of a program.
1406
1407 See the comment in the code where the compressed table is produced
1408 inplace from the flat tabe representation for an explanation of how
1409 the compression works.
1410
1411 */
1412
1413
3dab1dad 1414 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1415 /*
1416 Second Pass -- Array Of Lists Representation
1417
1418 Each state will be represented by a list of charid:state records
1419 (reg_trie_trans_le) the first such element holds the CUR and LEN
1420 points of the allocated array. (See defines above).
1421
1422 We build the initial structure using the lists, and then convert
1423 it into the compressed table form which allows faster lookups
1424 (but cant be modified once converted).
a3621e74
YO
1425 */
1426
a3621e74
YO
1427 STRLEN transcount = 1;
1428
1e2e3d02
YO
1429 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1430 "%*sCompiling trie using list compiler\n",
1431 (int)depth * 2 + 2, ""));
1432
3dab1dad 1433 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1434 TRIE_LIST_NEW(1);
1435 next_alloc = 2;
1436
1437 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1438
c445ea15
AL
1439 regnode * const noper = NEXTOPER( cur );
1440 U8 *uc = (U8*)STRING( noper );
1441 const U8 * const e = uc + STR_LEN( noper );
1442 U32 state = 1; /* required init */
1443 U16 charid = 0; /* sanity init */
1444 U8 *scan = (U8*)NULL; /* sanity init */
1445 STRLEN foldlen = 0; /* required init */
07be1b83 1446 U32 wordlen = 0; /* required init */
c445ea15
AL
1447 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1448
3dab1dad 1449 if (OP(noper) != NOTHING) {
786e8c11 1450 for ( ; uc < e ; uc += len ) {
c445ea15 1451
786e8c11 1452 TRIE_READ_CHAR;
c445ea15 1453
786e8c11
YO
1454 if ( uvc < 256 ) {
1455 charid = trie->charmap[ uvc ];
c445ea15 1456 } else {
786e8c11
YO
1457 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1458 if ( !svpp ) {
1459 charid = 0;
1460 } else {
1461 charid=(U16)SvIV( *svpp );
1462 }
c445ea15 1463 }
786e8c11
YO
1464 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1465 if ( charid ) {
a3621e74 1466
786e8c11
YO
1467 U16 check;
1468 U32 newstate = 0;
a3621e74 1469
786e8c11
YO
1470 charid--;
1471 if ( !trie->states[ state ].trans.list ) {
1472 TRIE_LIST_NEW( state );
c445ea15 1473 }
786e8c11
YO
1474 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1475 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1476 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1477 break;
1478 }
1479 }
1480 if ( ! newstate ) {
1481 newstate = next_alloc++;
1482 TRIE_LIST_PUSH( state, charid, newstate );
1483 transcount++;
1484 }
1485 state = newstate;
1486 } else {
1487 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1488 }
a28509cc 1489 }
c445ea15 1490 }
3dab1dad 1491 TRIE_HANDLE_WORD(state);
a3621e74
YO
1492
1493 } /* end second pass */
1494
1e2e3d02
YO
1495 /* next alloc is the NEXT state to be allocated */
1496 trie->statecount = next_alloc;
a3621e74
YO
1497 Renew( trie->states, next_alloc, reg_trie_state );
1498
3dab1dad
YO
1499 /* and now dump it out before we compress it */
1500 DEBUG_TRIE_COMPILE_MORE_r(
1501 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1502 );
a3621e74 1503
a02a5408 1504 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1505 {
1506 U32 state;
a3621e74
YO
1507 U32 tp = 0;
1508 U32 zp = 0;
1509
1510
1511 for( state=1 ; state < next_alloc ; state ++ ) {
1512 U32 base=0;
1513
1514 /*
1515 DEBUG_TRIE_COMPILE_MORE_r(
1516 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1517 );
1518 */
1519
1520 if (trie->states[state].trans.list) {
1521 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1522 U16 maxid=minid;
a28509cc 1523 U16 idx;
a3621e74
YO
1524
1525 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1526 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1527 if ( forid < minid ) {
1528 minid=forid;
1529 } else if ( forid > maxid ) {
1530 maxid=forid;
1531 }
a3621e74
YO
1532 }
1533 if ( transcount < tp + maxid - minid + 1) {
1534 transcount *= 2;
1535 Renew( trie->trans, transcount, reg_trie_trans );
1536 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1537 }
1538 base = trie->uniquecharcount + tp - minid;
1539 if ( maxid == minid ) {
1540 U32 set = 0;
1541 for ( ; zp < tp ; zp++ ) {
1542 if ( ! trie->trans[ zp ].next ) {
1543 base = trie->uniquecharcount + zp - minid;
1544 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1545 trie->trans[ zp ].check = state;
1546 set = 1;
1547 break;
1548 }
1549 }
1550 if ( !set ) {
1551 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1552 trie->trans[ tp ].check = state;
1553 tp++;
1554 zp = tp;
1555 }
1556 } else {
1557 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1558 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1559 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1560 trie->trans[ tid ].check = state;
1561 }
1562 tp += ( maxid - minid + 1 );
1563 }
1564 Safefree(trie->states[ state ].trans.list);
1565 }
1566 /*
1567 DEBUG_TRIE_COMPILE_MORE_r(
1568 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1569 );
1570 */
1571 trie->states[ state ].trans.base=base;
1572 }
cc601c31 1573 trie->lasttrans = tp + 1;
a3621e74
YO
1574 }
1575 } else {
1576 /*
1577 Second Pass -- Flat Table Representation.
1578
1579 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1580 We know that we will need Charcount+1 trans at most to store the data
1581 (one row per char at worst case) So we preallocate both structures
1582 assuming worst case.
1583
1584 We then construct the trie using only the .next slots of the entry
1585 structs.
1586
1587 We use the .check field of the first entry of the node temporarily to
1588 make compression both faster and easier by keeping track of how many non
1589 zero fields are in the node.
1590
1591 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1592 transition.
1593
1594 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1595 number representing the first entry of the node, and state as a
1596 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1597 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1598 are 2 entrys per node. eg:
1599
1600 A B A B
1601 1. 2 4 1. 3 7
1602 2. 0 3 3. 0 5
1603 3. 0 0 5. 0 0
1604 4. 0 0 7. 0 0
1605
1606 The table is internally in the right hand, idx form. However as we also
1607 have to deal with the states array which is indexed by nodenum we have to
1608 use TRIE_NODENUM() to convert.
1609
1610 */
1e2e3d02
YO
1611 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1612 "%*sCompiling trie using table compiler\n",
1613 (int)depth * 2 + 2, ""));
3dab1dad
YO
1614
1615 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1616 reg_trie_trans );
3dab1dad 1617 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1618 next_alloc = trie->uniquecharcount + 1;
1619
3dab1dad 1620
a3621e74
YO
1621 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622
c445ea15 1623 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1624 const U8 *uc = (U8*)STRING( noper );
1625 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1626
1627 U32 state = 1; /* required init */
1628
1629 U16 charid = 0; /* sanity init */
1630 U32 accept_state = 0; /* sanity init */
1631 U8 *scan = (U8*)NULL; /* sanity init */
1632
1633 STRLEN foldlen = 0; /* required init */
07be1b83 1634 U32 wordlen = 0; /* required init */
a3621e74
YO
1635 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1636
3dab1dad 1637 if ( OP(noper) != NOTHING ) {
786e8c11 1638 for ( ; uc < e ; uc += len ) {
a3621e74 1639
786e8c11 1640 TRIE_READ_CHAR;
a3621e74 1641
786e8c11
YO
1642 if ( uvc < 256 ) {
1643 charid = trie->charmap[ uvc ];
1644 } else {
1645 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1646 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1647 }
786e8c11
YO
1648 if ( charid ) {
1649 charid--;
1650 if ( !trie->trans[ state + charid ].next ) {
1651 trie->trans[ state + charid ].next = next_alloc;
1652 trie->trans[ state ].check++;
1653 next_alloc += trie->uniquecharcount;
1654 }
1655 state = trie->trans[ state + charid ].next;
1656 } else {
1657 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1658 }
1659 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1660 }
a3621e74 1661 }
3dab1dad
YO
1662 accept_state = TRIE_NODENUM( state );
1663 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1664
1665 } /* end second pass */
1666
3dab1dad
YO
1667 /* and now dump it out before we compress it */
1668 DEBUG_TRIE_COMPILE_MORE_r(
1669 dump_trie_interim_table(trie,next_alloc,depth+1)
1670 );
a3621e74 1671
a3621e74
YO
1672 {
1673 /*
1674 * Inplace compress the table.*
1675
1676 For sparse data sets the table constructed by the trie algorithm will
1677 be mostly 0/FAIL transitions or to put it another way mostly empty.
1678 (Note that leaf nodes will not contain any transitions.)
1679
1680 This algorithm compresses the tables by eliminating most such
1681 transitions, at the cost of a modest bit of extra work during lookup:
1682
1683 - Each states[] entry contains a .base field which indicates the
1684 index in the state[] array wheres its transition data is stored.
1685
1686 - If .base is 0 there are no valid transitions from that node.
1687
1688 - If .base is nonzero then charid is added to it to find an entry in
1689 the trans array.
1690
1691 -If trans[states[state].base+charid].check!=state then the
1692 transition is taken to be a 0/Fail transition. Thus if there are fail
1693 transitions at the front of the node then the .base offset will point
1694 somewhere inside the previous nodes data (or maybe even into a node
1695 even earlier), but the .check field determines if the transition is
1696 valid.
1697
786e8c11 1698 XXX - wrong maybe?
a3621e74
YO
1699 The following process inplace converts the table to the compressed
1700 table: We first do not compress the root node 1,and mark its all its
1701 .check pointers as 1 and set its .base pointer as 1 as well. This
1702 allows to do a DFA construction from the compressed table later, and
1703 ensures that any .base pointers we calculate later are greater than
1704 0.
1705
1706 - We set 'pos' to indicate the first entry of the second node.
1707
1708 - We then iterate over the columns of the node, finding the first and
1709 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1710 and set the .check pointers accordingly, and advance pos
1711 appropriately and repreat for the next node. Note that when we copy
1712 the next pointers we have to convert them from the original
1713 NODEIDX form to NODENUM form as the former is not valid post
1714 compression.
1715
1716 - If a node has no transitions used we mark its base as 0 and do not
1717 advance the pos pointer.
1718
1719 - If a node only has one transition we use a second pointer into the
1720 structure to fill in allocated fail transitions from other states.
1721 This pointer is independent of the main pointer and scans forward
1722 looking for null transitions that are allocated to a state. When it
1723 finds one it writes the single transition into the "hole". If the
786e8c11 1724 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1725
1726 - Once compressed we can Renew/realloc the structures to release the
1727 excess space.
1728
1729 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1730 specifically Fig 3.47 and the associated pseudocode.
1731
1732 demq
1733 */
a3b680e6 1734 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1735 U32 state, charid;
a3621e74 1736 U32 pos = 0, zp=0;
1e2e3d02 1737 trie->statecount = laststate;
a3621e74
YO
1738
1739 for ( state = 1 ; state < laststate ; state++ ) {
1740 U8 flag = 0;
a28509cc
AL
1741 const U32 stateidx = TRIE_NODEIDX( state );
1742 const U32 o_used = trie->trans[ stateidx ].check;
1743 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1744 trie->trans[ stateidx ].check = 0;
1745
1746 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1747 if ( flag || trie->trans[ stateidx + charid ].next ) {
1748 if ( trie->trans[ stateidx + charid ].next ) {
1749 if (o_used == 1) {
1750 for ( ; zp < pos ; zp++ ) {
1751 if ( ! trie->trans[ zp ].next ) {
1752 break;
1753 }
1754 }
1755 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1756 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1757 trie->trans[ zp ].check = state;
1758 if ( ++zp > pos ) pos = zp;
1759 break;
1760 }
1761 used--;
1762 }
1763 if ( !flag ) {
1764 flag = 1;
1765 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1766 }
1767 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ pos ].check = state;
1769 pos++;
1770 }
1771 }
1772 }
cc601c31 1773 trie->lasttrans = pos + 1;
1e2e3d02 1774 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1775 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1776 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1777 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1778 (int)depth * 2 + 2,"",
1779 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1780 (IV)next_alloc,
1781 (IV)pos,
a3621e74
YO
1782 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1783 );
1784
1785 } /* end table compress */
1786 }
1e2e3d02
YO
1787 DEBUG_TRIE_COMPILE_MORE_r(
1788 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1789 (int)depth * 2 + 2, "",
1790 (UV)trie->statecount,
1791 (UV)trie->lasttrans)
1792 );
cc601c31
YO
1793 /* resize the trans array to remove unused space */
1794 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1795
3dab1dad
YO
1796 /* and now dump out the compressed format */
1797 DEBUG_TRIE_COMPILE_r(
1798 dump_trie(trie,depth+1)
1799 );
07be1b83 1800
3dab1dad 1801 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1802 U8 nodetype =(U8)(flags & 0xFF);
1803 char *str=NULL;
786e8c11 1804
07be1b83 1805#ifdef DEBUGGING
e62cc96a 1806 regnode *optimize = NULL;
b57a0404
JH
1807 U32 mjd_offset = 0;
1808 U32 mjd_nodelen = 0;
07be1b83 1809#endif
a3621e74 1810 /*
3dab1dad
YO
1811 This means we convert either the first branch or the first Exact,
1812 depending on whether the thing following (in 'last') is a branch
1813 or not and whther first is the startbranch (ie is it a sub part of
1814 the alternation or is it the whole thing.)
1815 Assuming its a sub part we conver the EXACT otherwise we convert
1816 the whole branch sequence, including the first.
a3621e74 1817 */
3dab1dad 1818 /* Find the node we are going to overwrite */
7f69552c 1819 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1820 /* branch sub-chain */
3dab1dad 1821 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1822 DEBUG_r({
1823 mjd_offset= Node_Offset((convert));
1824 mjd_nodelen= Node_Length((convert));
1825 });
7f69552c
YO
1826 /* whole branch chain */
1827 } else {
1828 DEBUG_r({
1829 const regnode *nop = NEXTOPER( convert );
1830 mjd_offset= Node_Offset((nop));
1831 mjd_nodelen= Node_Length((nop));
1832 });
07be1b83 1833 }
7f69552c 1834
07be1b83
YO
1835 DEBUG_OPTIMISE_r(
1836 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1837 (int)depth * 2 + 2, "",
786e8c11 1838 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1839 );
a3621e74 1840
3dab1dad
YO
1841 /* But first we check to see if there is a common prefix we can
1842 split out as an EXACT and put in front of the TRIE node. */
1843 trie->startstate= 1;
786e8c11 1844 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1845 U32 state;
1e2e3d02 1846 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1847 U32 ofs = 0;
8e11feef
RGS
1848 I32 idx = -1;
1849 U32 count = 0;
1850 const U32 base = trie->states[ state ].trans.base;
a3621e74 1851
3dab1dad 1852 if ( trie->states[state].wordnum )
8e11feef 1853 count = 1;
a3621e74 1854
8e11feef 1855 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1856 if ( ( base + ofs >= trie->uniquecharcount ) &&
1857 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1858 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1859 {
3dab1dad 1860 if ( ++count > 1 ) {
8e11feef 1861 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1862 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1863 if ( state == 1 ) break;
3dab1dad
YO
1864 if ( count == 2 ) {
1865 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1866 DEBUG_OPTIMISE_r(
8e11feef
RGS
1867 PerlIO_printf(Perl_debug_log,
1868 "%*sNew Start State=%"UVuf" Class: [",
1869 (int)depth * 2 + 2, "",
786e8c11 1870 (UV)state));
be8e71aa
YO
1871 if (idx >= 0) {
1872 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1873 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874
3dab1dad 1875 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1876 if ( folder )
1877 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1878 DEBUG_OPTIMISE_r(
07be1b83 1879 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1880 );
8e11feef
RGS
1881 }
1882 }
1883 TRIE_BITMAP_SET(trie,*ch);
1884 if ( folder )
1885 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1886 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1887 }
1888 idx = ofs;
1889 }
3dab1dad
YO
1890 }
1891 if ( count == 1 ) {
1892 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1893 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1894 DEBUG_OPTIMISE_r(
8e11feef
RGS
1895 PerlIO_printf( Perl_debug_log,
1896 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1897 (int)depth * 2 + 2, "",
786e8c11 1898 (UV)state, (UV)idx, ch)
3dab1dad
YO
1899 );
1900 if ( state==1 ) {
1901 OP( convert ) = nodetype;
1902 str=STRING(convert);
1903 STR_LEN(convert)=0;
1904 }
1905 *str++=*ch;
1906 STR_LEN(convert)++;
a3621e74 1907
8e11feef 1908 } else {
f9049ba1 1909#ifdef DEBUGGING
8e11feef
RGS
1910 if (state>1)
1911 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1912#endif
8e11feef
RGS
1913 break;
1914 }
1915 }
3dab1dad 1916 if (str) {
8e11feef 1917 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1918 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1919 trie->startstate = state;
07be1b83
YO
1920 trie->minlen -= (state - 1);
1921 trie->maxlen -= (state - 1);
1922 DEBUG_r({
1923 regnode *fix = convert;
1924 mjd_nodelen++;
1925 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1926 while( ++fix < n ) {
1927 Set_Node_Offset_Length(fix, 0, 0);
1928 }
1929 });
8e11feef
RGS
1930 if (trie->maxlen) {
1931 convert = n;
1932 } else {
3dab1dad 1933 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1934 DEBUG_r(optimize= n);
3dab1dad
YO
1935 }
1936 }
1937 }
a5ca303d
YO
1938 if (!jumper)
1939 jumper = last;
3dab1dad 1940 if ( trie->maxlen ) {
8e11feef
RGS
1941 NEXT_OFF( convert ) = (U16)(tail - convert);
1942 ARG_SET( convert, data_slot );
786e8c11
YO
1943 /* Store the offset to the first unabsorbed branch in
1944 jump[0], which is otherwise unused by the jump logic.
1945 We use this when dumping a trie and during optimisation. */
1946 if (trie->jump)
7f69552c 1947 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1948
786e8c11
YO
1949 /* XXXX */
1950 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1951 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1952 {
1953 OP( convert ) = TRIEC;
1954 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1955 Safefree(trie->bitmap);
1956 trie->bitmap= NULL;
1957 } else
1958 OP( convert ) = TRIE;
a3621e74 1959
3dab1dad
YO
1960 /* store the type in the flags */
1961 convert->flags = nodetype;
a5ca303d
YO
1962 DEBUG_r({
1963 optimize = convert
1964 + NODE_STEP_REGNODE
1965 + regarglen[ OP( convert ) ];
1966 });
1967 /* XXX We really should free up the resource in trie now,
1968 as we won't use them - (which resources?) dmq */
3dab1dad 1969 }
a3621e74 1970 /* needed for dumping*/
e62cc96a 1971 DEBUG_r(if (optimize) {
07be1b83 1972 regnode *opt = convert;
e62cc96a 1973 while ( ++opt < optimize) {
07be1b83
YO
1974 Set_Node_Offset_Length(opt,0,0);
1975 }
786e8c11
YO
1976 /*
1977 Try to clean up some of the debris left after the
1978 optimisation.
a3621e74 1979 */
786e8c11 1980 while( optimize < jumper ) {
07be1b83 1981 mjd_nodelen += Node_Length((optimize));
a3621e74 1982 OP( optimize ) = OPTIMIZED;
07be1b83 1983 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1984 optimize++;
1985 }
07be1b83 1986 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1987 });
1988 } /* end node insert */
07be1b83 1989#ifndef DEBUGGING
6e8b4190 1990 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1991#endif
786e8c11
YO
1992 return trie->jump
1993 ? MADE_JUMP_TRIE
1994 : trie->startstate>1
1995 ? MADE_EXACT_TRIE
1996 : MADE_TRIE;
1997}
1998
1999STATIC void
2000S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2001{
2002/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2003
2004 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2005 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2006 ISBN 0-201-10088-6
2007
2008 We find the fail state for each state in the trie, this state is the longest proper
2009 suffix of the current states 'word' that is also a proper prefix of another word in our
2010 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2011 the DFA not to have to restart after its tried and failed a word at a given point, it
2012 simply continues as though it had been matching the other word in the first place.
2013 Consider
2014 'abcdgu'=~/abcdefg|cdgu/
2015 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2016 fail, which would bring use to the state representing 'd' in the second word where we would
2017 try 'g' and succeed, prodceding to match 'cdgu'.
2018 */
2019 /* add a fail transition */
2020 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2021 U32 *q;
2022 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2023 const U32 numstates = trie->statecount;
786e8c11
YO
2024 const U32 ubound = trie->lasttrans + ucharcount;
2025 U32 q_read = 0;
2026 U32 q_write = 0;
2027 U32 charid;
2028 U32 base = trie->states[ 1 ].trans.base;
2029 U32 *fail;
2030 reg_ac_data *aho;
2031 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2032 GET_RE_DEBUG_FLAGS_DECL;
2033#ifndef DEBUGGING
2034 PERL_UNUSED_ARG(depth);
2035#endif
2036
2037
2038 ARG_SET( stclass, data_slot );
2039 Newxz( aho, 1, reg_ac_data );
2040 RExC_rx->data->data[ data_slot ] = (void*)aho;
2041 aho->trie=trie;
2042 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2043 numstates * sizeof(reg_trie_state));
786e8c11
YO
2044 Newxz( q, numstates, U32);
2045 Newxz( aho->fail, numstates, U32 );
2046 aho->refcount = 1;
2047 fail = aho->fail;
2048 /* initialize fail[0..1] to be 1 so that we always have
2049 a valid final fail state */
2050 fail[ 0 ] = fail[ 1 ] = 1;
2051
2052 for ( charid = 0; charid < ucharcount ; charid++ ) {
2053 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2054 if ( newstate ) {
2055 q[ q_write ] = newstate;
2056 /* set to point at the root */
2057 fail[ q[ q_write++ ] ]=1;
2058 }
2059 }
2060 while ( q_read < q_write) {
2061 const U32 cur = q[ q_read++ % numstates ];
2062 base = trie->states[ cur ].trans.base;
2063
2064 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2065 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2066 if (ch_state) {
2067 U32 fail_state = cur;
2068 U32 fail_base;
2069 do {
2070 fail_state = fail[ fail_state ];
2071 fail_base = aho->states[ fail_state ].trans.base;
2072 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2073
2074 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2075 fail[ ch_state ] = fail_state;
2076 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2077 {
2078 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2079 }
2080 q[ q_write++ % numstates] = ch_state;
2081 }
2082 }
2083 }
2084 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2085 when we fail in state 1, this allows us to use the
2086 charclass scan to find a valid start char. This is based on the principle
2087 that theres a good chance the string being searched contains lots of stuff
2088 that cant be a start char.
2089 */
2090 fail[ 0 ] = fail[ 1 ] = 0;
2091 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2092 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2093 (int)(depth * 2), "", numstates
2094 );
786e8c11
YO
2095 for( q_read=1; q_read<numstates; q_read++ ) {
2096 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2097 }
2098 PerlIO_printf(Perl_debug_log, "\n");
2099 });
2100 Safefree(q);
2101 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2102}
2103
786e8c11 2104
a3621e74 2105/*
5d1c421c
JH
2106 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2107 * These need to be revisited when a newer toolchain becomes available.
2108 */
2109#if defined(__sparc64__) && defined(__GNUC__)
2110# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2111# undef SPARC64_GCC_WORKAROUND
2112# define SPARC64_GCC_WORKAROUND 1
2113# endif
2114#endif
2115
07be1b83
YO
2116#define DEBUG_PEEP(str,scan,depth) \
2117 DEBUG_OPTIMISE_r({ \
2118 SV * const mysv=sv_newmortal(); \
2119 regnode *Next = regnext(scan); \
2120 regprop(RExC_rx, mysv, scan); \
7f69552c 2121 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2122 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2123 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2124 });
2125
1de06328
YO
2126
2127
2128
2129
07be1b83
YO
2130#define JOIN_EXACT(scan,min,flags) \
2131 if (PL_regkind[OP(scan)] == EXACT) \
2132 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2133
be8e71aa 2134STATIC U32
07be1b83
YO
2135S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2136 /* Merge several consecutive EXACTish nodes into one. */
2137 regnode *n = regnext(scan);
2138 U32 stringok = 1;
2139 regnode *next = scan + NODE_SZ_STR(scan);
2140 U32 merged = 0;
2141 U32 stopnow = 0;
2142#ifdef DEBUGGING
2143 regnode *stop = scan;
72f13be8 2144 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2145#else
d47053eb
RGS
2146 PERL_UNUSED_ARG(depth);
2147#endif
2148#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2149 PERL_UNUSED_ARG(flags);
2150 PERL_UNUSED_ARG(val);
07be1b83 2151#endif
07be1b83
YO
2152 DEBUG_PEEP("join",scan,depth);
2153
2154 /* Skip NOTHING, merge EXACT*. */
2155 while (n &&
2156 ( PL_regkind[OP(n)] == NOTHING ||
2157 (stringok && (OP(n) == OP(scan))))
2158 && NEXT_OFF(n)
2159 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2160
2161 if (OP(n) == TAIL || n > next)
2162 stringok = 0;
2163 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2164 DEBUG_PEEP("skip:",n,depth);
2165 NEXT_OFF(scan) += NEXT_OFF(n);
2166 next = n + NODE_STEP_REGNODE;
2167#ifdef DEBUGGING
2168 if (stringok)
2169 stop = n;
2170#endif
2171 n = regnext(n);
2172 }
2173 else if (stringok) {
786e8c11 2174 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2175 regnode * const nnext = regnext(n);
2176
2177 DEBUG_PEEP("merg",n,depth);
2178
2179 merged++;
2180 if (oldl + STR_LEN(n) > U8_MAX)
2181 break;
2182 NEXT_OFF(scan) += NEXT_OFF(n);
2183 STR_LEN(scan) += STR_LEN(n);
2184 next = n + NODE_SZ_STR(n);
2185 /* Now we can overwrite *n : */
2186 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2187#ifdef DEBUGGING
2188 stop = next - 1;
2189#endif
2190 n = nnext;
2191 if (stopnow) break;
2192 }
2193
d47053eb
RGS
2194#ifdef EXPERIMENTAL_INPLACESCAN
2195 if (flags && !NEXT_OFF(n)) {
2196 DEBUG_PEEP("atch", val, depth);
2197 if (reg_off_by_arg[OP(n)]) {
2198 ARG_SET(n, val - n);
2199 }
2200 else {
2201 NEXT_OFF(n) = val - n;
2202 }
2203 stopnow = 1;
2204 }
07be1b83
YO
2205#endif
2206 }
2207
2208 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2209 /*
2210 Two problematic code points in Unicode casefolding of EXACT nodes:
2211
2212 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2213 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2214
2215 which casefold to
2216
2217 Unicode UTF-8
2218
2219 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2220 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2221
2222 This means that in case-insensitive matching (or "loose matching",
2223 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2224 length of the above casefolded versions) can match a target string
2225 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2226 This would rather mess up the minimum length computation.
2227
2228 What we'll do is to look for the tail four bytes, and then peek
2229 at the preceding two bytes to see whether we need to decrease
2230 the minimum length by four (six minus two).
2231
2232 Thanks to the design of UTF-8, there cannot be false matches:
2233 A sequence of valid UTF-8 bytes cannot be a subsequence of
2234 another valid sequence of UTF-8 bytes.
2235
2236 */
2237 char * const s0 = STRING(scan), *s, *t;
2238 char * const s1 = s0 + STR_LEN(scan) - 1;
2239 char * const s2 = s1 - 4;
e294cc5d
JH
2240#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2241 const char t0[] = "\xaf\x49\xaf\x42";
2242#else
07be1b83 2243 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2244#endif
07be1b83
YO
2245 const char * const t1 = t0 + 3;
2246
2247 for (s = s0 + 2;
2248 s < s2 && (t = ninstr(s, s1, t0, t1));
2249 s = t + 4) {
e294cc5d
JH
2250#ifdef EBCDIC
2251 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2252 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2253#else
07be1b83
YO
2254 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2255 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2256#endif
07be1b83
YO
2257 *min -= 4;
2258 }
2259 }
2260
2261#ifdef DEBUGGING
2262 /* Allow dumping */
2263 n = scan + NODE_SZ_STR(scan);
2264 while (n <= stop) {
2265 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2266 OP(n) = OPTIMIZED;
2267 NEXT_OFF(n) = 0;
2268 }
2269 n++;
2270 }
2271#endif
2272 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2273 return stopnow;
2274}
2275
653099ff
GS
2276/* REx optimizer. Converts nodes into quickier variants "in place".
2277 Finds fixed substrings. */
2278
a0288114 2279/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2280 to the position after last scanned or to NULL. */
2281
40d049e4
YO
2282#define INIT_AND_WITHP \
2283 assert(!and_withp); \
2284 Newx(and_withp,1,struct regnode_charclass_class); \
2285 SAVEFREEPV(and_withp)
07be1b83 2286
76e3520e 2287STATIC I32
40d049e4 2288S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2289 I32 *minlenp, I32 *deltap,
40d049e4
YO
2290 regnode *last,
2291 scan_data_t *data,
2292 I32 stopparen,
2293 U8* recursed,
2294 struct regnode_charclass_class *and_withp,
2295 U32 flags, U32 depth)
c277df42
IZ
2296 /* scanp: Start here (read-write). */
2297 /* deltap: Write maxlen-minlen here. */
2298 /* last: Stop before this one. */
40d049e4
YO
2299 /* data: string data about the pattern */
2300 /* stopparen: treat close N as END */
2301 /* recursed: which subroutines have we recursed into */
2302 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2303{
97aff369 2304 dVAR;
c277df42
IZ
2305 I32 min = 0, pars = 0, code;
2306 regnode *scan = *scanp, *next;
2307 I32 delta = 0;
2308 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2309 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2310 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2311 scan_data_t data_fake;
a3621e74 2312 SV *re_trie_maxbuff = NULL;
786e8c11 2313 regnode *first_non_open = scan;
a3621e74 2314 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2315#ifdef DEBUGGING
40d049e4 2316 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2317#endif
40d049e4 2318
786e8c11 2319 if ( depth == 0 ) {
40d049e4 2320 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2321 first_non_open=regnext(first_non_open);
2322 }
2323
b81d288d 2324
c277df42
IZ
2325 while (scan && OP(scan) != END && scan < last) {
2326 /* Peephole optimizer: */
1de06328 2327 DEBUG_STUDYDATA(data,depth);
07be1b83 2328 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2329 JOIN_EXACT(scan,&min,0);
a3621e74 2330
653099ff
GS
2331 /* Follow the next-chain of the current node and optimize
2332 away all the NOTHINGs from it. */
c277df42 2333 if (OP(scan) != CURLYX) {
a3b680e6 2334 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2335 ? I32_MAX
2336 /* I32 may be smaller than U16 on CRAYs! */
2337 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2338 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2339 int noff;
2340 regnode *n = scan;
b81d288d 2341
c277df42
IZ
2342 /* Skip NOTHING and LONGJMP. */
2343 while ((n = regnext(n))
3dab1dad 2344 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2345 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2346 && off + noff < max)
2347 off += noff;
2348 if (reg_off_by_arg[OP(scan)])
2349 ARG(scan) = off;
b81d288d 2350 else
c277df42
IZ
2351 NEXT_OFF(scan) = off;
2352 }
a3621e74 2353
07be1b83 2354
3dab1dad 2355
653099ff
GS
2356 /* The principal pseudo-switch. Cannot be a switch, since we
2357 look into several different things. */
b81d288d 2358 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2359 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2360 next = regnext(scan);
2361 code = OP(scan);
a3621e74 2362 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2363
2364 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2365 /* NOTE - There is similar code to this block below for handling
2366 TRIE nodes on a re-study. If you change stuff here check there
2367 too. */
c277df42 2368 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2369 struct regnode_charclass_class accum;
d4c19fe8 2370 regnode * const startbranch=scan;
c277df42 2371
653099ff 2372 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2373 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2374 if (flags & SCF_DO_STCLASS)
830247a4 2375 cl_init_zero(pRExC_state, &accum);
a3621e74 2376
c277df42 2377 while (OP(scan) == code) {
830247a4 2378 I32 deltanext, minnext, f = 0, fake;
653099ff 2379 struct regnode_charclass_class this_class;
c277df42
IZ
2380
2381 num++;
2382 data_fake.flags = 0;
40d049e4 2383 if (data) {
2c2d71f5 2384 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2385 data_fake.last_closep = data->last_closep;
2386 }
2387 else
2388 data_fake.last_closep = &fake;
c277df42
IZ
2389 next = regnext(scan);
2390 scan = NEXTOPER(scan);
2391 if (code != BRANCH)
2392 scan = NEXTOPER(scan);
653099ff 2393 if (flags & SCF_DO_STCLASS) {
830247a4 2394 cl_init(pRExC_state, &this_class);
653099ff
GS
2395 data_fake.start_class = &this_class;
2396 f = SCF_DO_STCLASS_AND;
b81d288d 2397 }
e1901655
IZ
2398 if (flags & SCF_WHILEM_VISITED_POS)
2399 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2400
653099ff 2401 /* we suppose the run is continuous, last=next...*/
1de06328 2402 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2403 next, &data_fake,
2404 stopparen, recursed, NULL, f,depth+1);
b81d288d 2405 if (min1 > minnext)
c277df42
IZ
2406 min1 = minnext;
2407 if (max1 < minnext + deltanext)
2408 max1 = minnext + deltanext;
2409 if (deltanext == I32_MAX)
aca2d497 2410 is_inf = is_inf_internal = 1;
c277df42
IZ
2411 scan = next;
2412 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2413 pars++;
3dab1dad
YO
2414 if (data) {
2415 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2416 data->flags |= SF_HAS_EVAL;
2c2d71f5 2417 data->whilem_c = data_fake.whilem_c;
3dab1dad 2418 }
653099ff 2419 if (flags & SCF_DO_STCLASS)
830247a4 2420 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2421 if (code == SUSPEND)
c277df42
IZ
2422 break;
2423 }
2424 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2425 min1 = 0;
2426 if (flags & SCF_DO_SUBSTR) {
2427 data->pos_min += min1;
2428 data->pos_delta += max1 - min1;
2429 if (max1 != min1 || is_inf)
2430 data->longest = &(data->longest_float);
2431 }
2432 min += min1;
2433 delta += max1 - min1;
653099ff 2434 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2435 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2436 if (min1) {
40d049e4 2437 cl_and(data->start_class, and_withp);
653099ff
GS
2438 flags &= ~SCF_DO_STCLASS;
2439 }
2440 }
2441 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2442 if (min1) {
2443 cl_and(data->start_class, &accum);
653099ff 2444 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2445 }
2446 else {
b81d288d 2447 /* Switch to OR mode: cache the old value of
de0c8cb8 2448 * data->start_class */
40d049e4
YO
2449 INIT_AND_WITHP;
2450 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2451 struct regnode_charclass_class);
2452 flags &= ~SCF_DO_STCLASS_AND;
2453 StructCopy(&accum, data->start_class,
2454 struct regnode_charclass_class);
2455 flags |= SCF_DO_STCLASS_OR;
2456 data->start_class->flags |= ANYOF_EOS;
2457 }
653099ff 2458 }
a3621e74 2459
786e8c11 2460 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2461 /* demq.
2462
2463 Assuming this was/is a branch we are dealing with: 'scan' now
2464 points at the item that follows the branch sequence, whatever
2465 it is. We now start at the beginning of the sequence and look
2466 for subsequences of
2467
786e8c11
YO
2468 BRANCH->EXACT=>x1
2469 BRANCH->EXACT=>x2
2470 tail
a3621e74
YO
2471
2472 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2473
2474 If we can find such a subseqence we need to turn the first
2475 element into a trie and then add the subsequent branch exact
2476 strings to the trie.
2477
2478 We have two cases
2479
786e8c11 2480 1. patterns where the whole set of branch can be converted.
a3621e74 2481
786e8c11 2482 2. patterns where only a subset can be converted.
a3621e74
YO
2483
2484 In case 1 we can replace the whole set with a single regop
2485 for the trie. In case 2 we need to keep the start and end
2486 branchs so
2487
2488 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2489 becomes BRANCH TRIE; BRANCH X;
2490
786e8c11
YO
2491 There is an additional case, that being where there is a
2492 common prefix, which gets split out into an EXACT like node
2493 preceding the TRIE node.
2494
2495 If x(1..n)==tail then we can do a simple trie, if not we make
2496 a "jump" trie, such that when we match the appropriate word
2497 we "jump" to the appopriate tail node. Essentailly we turn
2498 a nested if into a case structure of sorts.
a3621e74
YO
2499
2500 */
786e8c11 2501
3dab1dad 2502 int made=0;
0111c4fd
RGS
2503 if (!re_trie_maxbuff) {
2504 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2505 if (!SvIOK(re_trie_maxbuff))
2506 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2507 }
786e8c11 2508 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2509 regnode *cur;
2510 regnode *first = (regnode *)NULL;
2511 regnode *last = (regnode *)NULL;
2512 regnode *tail = scan;
2513 U8 optype = 0;
2514 U32 count=0;
2515
2516#ifdef DEBUGGING
c445ea15 2517 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2518#endif
2519 /* var tail is used because there may be a TAIL
2520 regop in the way. Ie, the exacts will point to the
2521 thing following the TAIL, but the last branch will
2522 point at the TAIL. So we advance tail. If we
2523 have nested (?:) we may have to move through several
2524 tails.
2525 */
2526
2527 while ( OP( tail ) == TAIL ) {
2528 /* this is the TAIL generated by (?:) */
2529 tail = regnext( tail );
2530 }
2531
3dab1dad 2532
a3621e74 2533 DEBUG_OPTIMISE_r({
32fc9b6a 2534 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2535 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2536 (int)depth * 2 + 2, "",
2537 "Looking for TRIE'able sequences. Tail node is: ",
2538 SvPV_nolen_const( mysv )
a3621e74
YO
2539 );
2540 });
3dab1dad 2541
a3621e74
YO
2542 /*
2543
2544 step through the branches, cur represents each
2545 branch, noper is the first thing to be matched
2546 as part of that branch and noper_next is the
2547 regnext() of that node. if noper is an EXACT
2548 and noper_next is the same as scan (our current
2549 position in the regex) then the EXACT branch is
2550 a possible optimization target. Once we have
2551 two or more consequetive such branches we can
2552 create a trie of the EXACT's contents and stich
2553 it in place. If the sequence represents all of
2554 the branches we eliminate the whole thing and
2555 replace it with a single TRIE. If it is a
2556 subsequence then we need to stitch it in. This
2557 means the first branch has to remain, and needs
2558 to be repointed at the item on the branch chain
2559 following the last branch optimized. This could
2560 be either a BRANCH, in which case the
2561 subsequence is internal, or it could be the
2562 item following the branch sequence in which
2563 case the subsequence is at the end.
2564
2565 */
2566
2567 /* dont use tail as the end marker for this traverse */
2568 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2569 regnode * const noper = NEXTOPER( cur );
be981c67 2570#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2571 regnode * const noper_next = regnext( noper );
be981c67 2572#endif
a3621e74 2573
a3621e74 2574 DEBUG_OPTIMISE_r({
32fc9b6a 2575 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2576 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2577 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2578
32fc9b6a 2579 regprop(RExC_rx, mysv, noper);
a3621e74 2580 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2581 SvPV_nolen_const(mysv));
a3621e74
YO
2582
2583 if ( noper_next ) {
32fc9b6a 2584 regprop(RExC_rx, mysv, noper_next );
a3621e74 2585 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2586 SvPV_nolen_const(mysv));
a3621e74 2587 }
3dab1dad
YO
2588 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2589 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2590 });
3dab1dad
YO
2591 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2592 : PL_regkind[ OP( noper ) ] == EXACT )
2593 || OP(noper) == NOTHING )
786e8c11
YO
2594#ifdef NOJUMPTRIE
2595 && noper_next == tail
2596#endif
2597 && count < U16_MAX)
a3621e74
YO
2598 {
2599 count++;
3dab1dad
YO
2600 if ( !first || optype == NOTHING ) {
2601 if (!first) first = cur;
a3621e74
YO
2602 optype = OP( noper );
2603 } else {
a3621e74 2604 last = cur;
a3621e74
YO
2605 }
2606 } else {
2607 if ( last ) {
786e8c11
YO
2608 make_trie( pRExC_state,
2609 startbranch, first, cur, tail, count,
2610 optype, depth+1 );
a3621e74 2611 }
3dab1dad 2612 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2613#ifdef NOJUMPTRIE
2614 && noper_next == tail
2615#endif
2616 ){
a3621e74
YO
2617 count = 1;
2618 first = cur;
2619 optype = OP( noper );
2620 } else {
2621 count = 0;
2622 first = NULL;
2623 optype = 0;
2624 }
2625 last = NULL;
2626 }
2627 }
2628 DEBUG_OPTIMISE_r({
32fc9b6a 2629 regprop(RExC_rx, mysv, cur);
a3621e74 2630 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2631 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2632 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2633
2634 });
2635 if ( last ) {
786e8c11 2636 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2637#ifdef TRIE_STUDY_OPT
786e8c11
YO
2638 if ( ((made == MADE_EXACT_TRIE &&
2639 startbranch == first)
2640 || ( first_non_open == first )) &&
2641 depth==0 )
2642 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2643#endif
07be1b83 2644 }
a3621e74 2645 }
3dab1dad
YO
2646
2647 } /* do trie */
786e8c11 2648
a0ed51b3 2649 }
a3621e74 2650 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2651 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2652 } else /* single branch is optimized. */
c277df42
IZ
2653 scan = NEXTOPER(scan);
2654 continue;
a0ed51b3
LW
2655 }
2656 else if (OP(scan) == EXACT) {
cd439c50 2657 I32 l = STR_LEN(scan);
c445ea15 2658 UV uc;
a0ed51b3 2659 if (UTF) {
a3b680e6 2660 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2661 l = utf8_length(s, s + l);
9041c2e3 2662 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2663 } else {
2664 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2665 }
2666 min += l;
c277df42 2667 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2668 /* The code below prefers earlier match for fixed
2669 offset, later match for variable offset. */
2670 if (data->last_end == -1) { /* Update the start info. */
2671 data->last_start_min = data->pos_min;
2672 data->last_start_max = is_inf
b81d288d 2673 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2674 }
cd439c50 2675 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2676 if (UTF)
2677 SvUTF8_on(data->last_found);
0eda9292 2678 {
9a957fbc 2679 SV * const sv = data->last_found;
a28509cc 2680 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2681 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2682 if (mg && mg->mg_len >= 0)
5e43f467
JH
2683 mg->mg_len += utf8_length((U8*)STRING(scan),
2684 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2685 }
c277df42
IZ
2686 data->last_end = data->pos_min + l;
2687 data->pos_min += l; /* As in the first entry. */
2688 data->flags &= ~SF_BEFORE_EOL;
2689 }
653099ff
GS
2690 if (flags & SCF_DO_STCLASS_AND) {
2691 /* Check whether it is compatible with what we know already! */
2692 int compat = 1;
2693
1aa99e6b 2694 if (uc >= 0x100 ||
516a5887 2695 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2696 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2697 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2698 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2699 )
653099ff
GS
2700 compat = 0;
2701 ANYOF_CLASS_ZERO(data->start_class);
2702 ANYOF_BITMAP_ZERO(data->start_class);
2703 if (compat)
1aa99e6b 2704 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2705 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2706 if (uc < 0x100)
2707 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2708 }
2709 else if (flags & SCF_DO_STCLASS_OR) {
2710 /* false positive possible if the class is case-folded */
1aa99e6b 2711 if (uc < 0x100)
9b877dbb
IH
2712 ANYOF_BITMAP_SET(data->start_class, uc);
2713 else
2714 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2715 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2716 cl_and(data->start_class, and_withp);
653099ff
GS
2717 }
2718 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2719 }
3dab1dad 2720 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2721 I32 l = STR_LEN(scan);
1aa99e6b 2722 UV uc = *((U8*)STRING(scan));
653099ff
GS
2723
2724 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2725 if (flags & SCF_DO_SUBSTR) {
2726 assert(data);
1de06328 2727 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2728 }
a0ed51b3 2729 if (UTF) {
6136c704 2730 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2731 l = utf8_length(s, s + l);
9041c2e3 2732 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2733 }
2734 min += l;
ecaa9b9c 2735 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2736 data->pos_min += l;
653099ff
GS
2737 if (flags & SCF_DO_STCLASS_AND) {
2738 /* Check whether it is compatible with what we know already! */
2739 int compat = 1;
2740
1aa99e6b 2741 if (uc >= 0x100 ||
516a5887 2742 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2743 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2744 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2745 compat = 0;
2746 ANYOF_CLASS_ZERO(data->start_class);
2747 ANYOF_BITMAP_ZERO(data->start_class);
2748 if (compat) {
1aa99e6b 2749 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2750 data->start_class->flags &= ~ANYOF_EOS;
2751 data->start_class->flags |= ANYOF_FOLD;
2752 if (OP(scan) == EXACTFL)
2753 data->start_class->flags |= ANYOF_LOCALE;
2754 }
2755 }
2756 else if (flags & SCF_DO_STCLASS_OR) {
2757 if (data->start_class->flags & ANYOF_FOLD) {
2758 /* false positive possible if the class is case-folded.
2759 Assume that the locale settings are the same... */
1aa99e6b
IH
2760 if (uc < 0x100)
2761 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2762 data->start_class->flags &= ~ANYOF_EOS;
2763 }
40d049e4 2764 cl_and(data->start_class, and_withp);
653099ff
GS
2765 }
2766 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2767 }
bfed75c6 2768 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2769 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2770 I32 f = flags, pos_before = 0;
d4c19fe8 2771 regnode * const oscan = scan;
653099ff
GS
2772 struct regnode_charclass_class this_class;
2773 struct regnode_charclass_class *oclass = NULL;
727f22e3 2774 I32 next_is_eval = 0;
653099ff 2775
3dab1dad 2776 switch (PL_regkind[OP(scan)]) {
653099ff 2777 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2778 scan = NEXTOPER(scan);
2779 goto finish;
2780 case PLUS:
653099ff 2781 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2782 next = NEXTOPER(scan);
653099ff 2783 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2784 mincount = 1;
2785 maxcount = REG_INFTY;
c277df42
IZ
2786 next = regnext(scan);
2787 scan = NEXTOPER(scan);
2788 goto do_curly;
2789 }
2790 }
2791 if (flags & SCF_DO_SUBSTR)
2792 data->pos_min++;
2793 min++;
2794 /* Fall through. */
2795 case STAR:
653099ff
GS
2796 if (flags & SCF_DO_STCLASS) {
2797 mincount = 0;
b81d288d 2798 maxcount = REG_INFTY;
653099ff
GS
2799 next = regnext(scan);
2800 scan = NEXTOPER(scan);
2801 goto do_curly;
2802 }
b81d288d 2803 is_inf = is_inf_internal = 1;
c277df42
IZ
2804 scan = regnext(scan);
2805 if (flags & SCF_DO_SUBSTR) {
1de06328 2806 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2807 data->longest = &(data->longest_float);
2808 }
2809 goto optimize_curly_tail;
2810 case CURLY:
40d049e4
YO
2811 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2812 && (scan->flags == stopparen))
2813 {
2814 mincount = 1;
2815 maxcount = 1;
2816 } else {
2817 mincount = ARG1(scan);
2818 maxcount = ARG2(scan);
2819 }
c277df42 2820 next = regnext(scan);
cb434fcc
IZ
2821 if (OP(scan) == CURLYX) {
2822 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2823 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2824 }
c277df42 2825 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2826 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2827 do_curly:
2828 if (flags & SCF_DO_SUBSTR) {
1de06328 2829 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2830 pos_before = data->pos_min;
2831 }
2832 if (data) {
2833 fl = data->flags;
2834 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2835 if (is_inf)
2836 data->flags |= SF_IS_INF;
2837 }
653099ff 2838 if (flags & SCF_DO_STCLASS) {
830247a4 2839 cl_init(pRExC_state, &this_class);
653099ff
GS
2840 oclass = data->start_class;
2841 data->start_class = &this_class;
2842 f |= SCF_DO_STCLASS_AND;
2843 f &= ~SCF_DO_STCLASS_OR;
2844 }
e1901655
IZ
2845 /* These are the cases when once a subexpression
2846 fails at a particular position, it cannot succeed
2847 even after backtracking at the enclosing scope.
b81d288d 2848
e1901655
IZ
2849 XXXX what if minimal match and we are at the
2850 initial run of {n,m}? */
2851 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2852 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2853
c277df42 2854 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2855 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2856 last, data, stopparen, recursed, NULL,
a3621e74
YO
2857 (mincount == 0
2858 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2859
2860 if (flags & SCF_DO_STCLASS)
2861 data->start_class = oclass;
2862 if (mincount == 0 || minnext == 0) {
2863 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2864 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2865 }
2866 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2867 /* Switch to OR mode: cache the old value of
653099ff 2868 * data->start_class */
40d049e4
YO
2869 INIT_AND_WITHP;
2870 StructCopy(data->start_class, and_withp,
653099ff
GS
2871 struct regnode_charclass_class);
2872 flags &= ~SCF_DO_STCLASS_AND;
2873 StructCopy(&this_class, data->start_class,
2874 struct regnode_charclass_class);
2875 flags |= SCF_DO_STCLASS_OR;
2876 data->start_class->flags |= ANYOF_EOS;
2877 }
2878 } else { /* Non-zero len */
2879 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2880 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2881 cl_and(data->start_class, and_withp);
653099ff
GS
2882 }
2883 else if (flags & SCF_DO_STCLASS_AND)
2884 cl_and(data->start_class, &this_class);
2885 flags &= ~SCF_DO_STCLASS;
2886 }
c277df42
IZ
2887 if (!scan) /* It was not CURLYX, but CURLY. */
2888 scan = next;
041457d9
DM
2889 if ( /* ? quantifier ok, except for (?{ ... }) */
2890 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2891 && (minnext == 0) && (deltanext == 0)
99799961 2892 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2893 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2894 && ckWARN(WARN_REGEXP))
b45f050a 2895 {
830247a4 2896 vWARN(RExC_parse,
b45f050a
JF
2897 "Quantifier unexpected on zero-length expression");
2898 }
2899
c277df42 2900 min += minnext * mincount;
b81d288d 2901 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2902 && (minnext + deltanext) > 0)
2903 || deltanext == I32_MAX);
aca2d497 2904 is_inf |= is_inf_internal;
c277df42
IZ
2905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2906
2907 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2908 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2909 && data->flags & SF_IN_PAR
2910 && !(data->flags & SF_HAS_EVAL)
2911 && !deltanext && minnext == 1 ) {
2912 /* Try to optimize to CURLYN. */
2913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2914 regnode * const nxt1 = nxt;
497b47a8
JH
2915#ifdef DEBUGGING
2916 regnode *nxt2;
2917#endif
c277df42
IZ
2918
2919 /* Skip open. */
2920 nxt = regnext(nxt);
bfed75c6 2921 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2922 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2923 && STR_LEN(nxt) == 1))
c277df42 2924 goto nogo;
497b47a8 2925#ifdef DEBUGGING
c277df42 2926 nxt2 = nxt;
497b47a8 2927#endif
c277df42 2928 nxt = regnext(nxt);
b81d288d 2929 if (OP(nxt) != CLOSE)
c277df42 2930 goto nogo;
40d049e4
YO
2931 if (RExC_open_parens) {
2932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2934 }
c277df42 2935 /* Now we know that nxt2 is the only contents: */
eb160463 2936 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2937 OP(oscan) = CURLYN;
2938 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2939
c277df42
IZ
2940#ifdef DEBUGGING
2941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2943 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2945 OP(nxt + 1) = OPTIMIZED; /* was count. */
2946 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2947#endif
c277df42 2948 }
c277df42
IZ
2949 nogo:
2950
2951 /* Try optimization CURLYX => CURLYM. */
b81d288d 2952 if ( OP(oscan) == CURLYX && data
c277df42 2953 && !(data->flags & SF_HAS_PAR)
c277df42 2954 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2955 && !deltanext /* atom is fixed width */
2956 && minnext != 0 /* CURLYM can't handle zero width */
2957 ) {
c277df42
IZ
2958 /* XXXX How to optimize if data == 0? */
2959 /* Optimize to a simpler form. */
2960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2961 regnode *nxt2;
2962
2963 OP(oscan) = CURLYM;
2964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2965 && (OP(nxt2) != WHILEM))
c277df42
IZ
2966 nxt = nxt2;
2967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2968 /* Need to optimize away parenths. */
2969 if (data->flags & SF_IN_PAR) {
2970 /* Set the parenth number. */
2971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2972
b81d288d 2973 if (OP(nxt) != CLOSE)
b45f050a 2974 FAIL("Panic opt close");
eb160463 2975 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2976 if (RExC_open_parens) {
2977 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2978 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2979 }
c277df42
IZ
2980 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2981 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2982
c277df42
IZ
2983#ifdef DEBUGGING
2984 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2985 OP(nxt + 1) = OPTIMIZED; /* was count. */
2986 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2987 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2988#endif
c277df42
IZ
2989#if 0
2990 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2991 regnode *nnxt = regnext(nxt1);
b81d288d 2992
c277df42
IZ
2993 if (nnxt == nxt) {
2994 if (reg_off_by_arg[OP(nxt1)])
2995 ARG_SET(nxt1, nxt2 - nxt1);
2996 else if (nxt2 - nxt1 < U16_MAX)
2997 NEXT_OFF(nxt1) = nxt2 - nxt1;
2998 else
2999 OP(nxt) = NOTHING; /* Cannot beautify */
3000 }
3001 nxt1 = nnxt;
3002 }
3003#endif
3004 /* Optimize again: */
1de06328 3005 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3006 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3007 }
3008 else
c277df42 3009 oscan->flags = 0;
c277df42 3010 }
e1901655
IZ
3011 else if ((OP(oscan) == CURLYX)
3012 && (flags & SCF_WHILEM_VISITED_POS)
3013 /* See the comment on a similar expression above.
3014 However, this time it not a subexpression
3015 we care about, but the expression itself. */
3016 && (maxcount == REG_INFTY)
3017 && data && ++data->whilem_c < 16) {
3018 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3019 /* Find WHILEM (as in regexec.c) */
3020 regnode *nxt = oscan + NEXT_OFF(oscan);
3021
3022 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3023 nxt += ARG(nxt);
eb160463
GS
3024 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3025 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3026 }
b81d288d 3027 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3028 pars++;
3029 if (flags & SCF_DO_SUBSTR) {
c445ea15 3030 SV *last_str = NULL;
c277df42
IZ
3031 int counted = mincount != 0;
3032
3033 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3034#if defined(SPARC64_GCC_WORKAROUND)
3035 I32 b = 0;
3036 STRLEN l = 0;
cfd0369c 3037 const char *s = NULL;
5d1c421c
JH
3038 I32 old = 0;
3039
3040 if (pos_before >= data->last_start_min)
3041 b = pos_before;
3042 else
3043 b = data->last_start_min;
3044
3045 l = 0;
cfd0369c 3046 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3047 old = b - data->last_start_min;
3048
3049#else
b81d288d 3050 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3051 ? pos_before : data->last_start_min;
3052 STRLEN l;
d4c19fe8 3053 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3054 I32 old = b - data->last_start_min;
5d1c421c 3055#endif
a0ed51b3
LW
3056
3057 if (UTF)
3058 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3059
a0ed51b3 3060 l -= old;
c277df42 3061 /* Get the added string: */
79cb57f6 3062 last_str = newSVpvn(s + old, l);
0e933229
IH
3063 if (UTF)
3064 SvUTF8_on(last_str);
c277df42
IZ
3065 if (deltanext == 0 && pos_before == b) {
3066 /* What was added is a constant string */
3067 if (mincount > 1) {
3068 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3069 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3070 SvPVX_const(last_str), l, mincount - 1);
b162af07 3071 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3072 /* Add additional parts. */
b81d288d 3073 SvCUR_set(data->last_found,
c277df42
IZ
3074 SvCUR(data->last_found) - l);
3075 sv_catsv(data->last_found, last_str);
0eda9292
JH
3076 {
3077 SV * sv = data->last_found;
3078 MAGIC *mg =
3079 SvUTF8(sv) && SvMAGICAL(sv) ?
3080 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3081 if (mg && mg->mg_len >= 0)
3082 mg->mg_len += CHR_SVLEN(last_str);
3083 }
c277df42
IZ
3084 data->last_end += l * (mincount - 1);
3085 }
2a8d9689
HS
3086 } else {
3087 /* start offset must point into the last copy */
3088 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3089 data->last_start_max += is_inf ? I32_MAX
3090 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3091 }
3092 }
3093 /* It is counted once already... */
3094 data->pos_min += minnext * (mincount - counted);
3095 data->pos_delta += - counted * deltanext +
3096 (minnext + deltanext) * maxcount - minnext * mincount;
3097 if (mincount != maxcount) {
653099ff
GS
3098 /* Cannot extend fixed substrings found inside
3099 the group. */
1de06328 3100 scan_commit(pRExC_state,data,minlenp);
c277df42 3101 if (mincount && last_str) {
d4c19fe8
AL
3102 SV * const sv = data->last_found;
3103 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3104 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3105
3106 if (mg)
3107 mg->mg_len = -1;
3108 sv_setsv(sv, last_str);
c277df42 3109 data->last_end = data->pos_min;
b81d288d 3110 data->last_start_min =
a0ed51b3 3111 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3112 data->last_start_max = is_inf
3113 ? I32_MAX
c277df42 3114 : data->pos_min + data->pos_delta
a0ed51b3 3115 - CHR_SVLEN(last_str);
c277df42
IZ
3116 }
3117 data->longest = &(data->longest_float);
3118 }
aca2d497 3119 SvREFCNT_dec(last_str);
c277df42 3120 }
405ff068 3121 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3122 data->flags |= SF_HAS_EVAL;
3123 optimize_curly_tail:
c277df42 3124 if (OP(oscan) != CURLYX) {
3dab1dad 3125 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3126 && NEXT_OFF(next))
3127 NEXT_OFF(oscan) += NEXT_OFF(next);
3128 }
c277df42 3129 continue;
653099ff 3130 default: /* REF and CLUMP only? */
c277df42 3131 if (flags & SCF_DO_SUBSTR) {
1de06328 3132 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3133 data->longest = &(data->longest_float);
3134 }
aca2d497 3135 is_inf = is_inf_internal = 1;
653099ff 3136 if (flags & SCF_DO_STCLASS_OR)
830247a4 3137 cl_anything(pRExC_state, data->start_class);
653099ff 3138 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3139 break;
3140 }
a0ed51b3 3141 }
bfed75c6 3142 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3143 int value = 0;
653099ff 3144
c277df42 3145 if (flags & SCF_DO_SUBSTR) {
1de06328 3146 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3147 data->pos_min++;
3148 }
3149 min++;
653099ff
GS
3150 if (flags & SCF_DO_STCLASS) {
3151 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3152
3153 /* Some of the logic below assumes that switching
3154 locale on will only add false positives. */
3dab1dad 3155 switch (PL_regkind[OP(scan)]) {
653099ff 3156 case SANY:
653099ff
GS
3157 default:
3158 do_default:
3159 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3160 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3161 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3162 break;
3163 case REG_ANY:
3164 if (OP(scan) == SANY)
3165 goto do_default;
3166 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3167 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3168 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3169 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3170 }
3171 if (flags & SCF_DO_STCLASS_AND || !value)
3172 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3173 break;
3174 case ANYOF:
3175 if (flags & SCF_DO_STCLASS_AND)
3176 cl_and(data->start_class,
3177 (struct regnode_charclass_class*)scan);
3178 else
830247a4 3179 cl_or(pRExC_state, data->start_class,
653099ff
GS
3180 (struct regnode_charclass_class*)scan);
3181 break;
3182 case ALNUM:
3183 if (flags & SCF_DO_STCLASS_AND) {
3184 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3185 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3186 for (value = 0; value < 256; value++)
3187 if (!isALNUM(value))
3188 ANYOF_BITMAP_CLEAR(data->start_class, value);
3189 }
3190 }
3191 else {
3192 if (data->start_class->flags & ANYOF_LOCALE)
3193 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3194 else {
3195 for (value = 0; value < 256; value++)
3196 if (isALNUM(value))
b81d288d 3197 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3198 }
3199 }
3200 break;
3201 case ALNUML:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (data->start_class->flags & ANYOF_LOCALE)
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 }
3206 else {
3207 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3208 data->start_class->flags |= ANYOF_LOCALE;
3209 }
3210 break;
3211 case NALNUM:
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3215 for (value = 0; value < 256; value++)
3216 if (isALNUM(value))
3217 ANYOF_BITMAP_CLEAR(data->start_class, value);
3218 }
3219 }
3220 else {
3221 if (data->start_class->flags & ANYOF_LOCALE)
3222 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3223 else {
3224 for (value = 0; value < 256; value++)
3225 if (!isALNUM(value))
b81d288d 3226 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3227 }
3228 }
3229 break;
3230 case NALNUML:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (data->start_class->flags & ANYOF_LOCALE)
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 }
3235 else {
3236 data->start_class->flags |= ANYOF_LOCALE;
3237 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3238 }
3239 break;
3240 case SPACE:
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3244 for (value = 0; value < 256; value++)
3245 if (!isSPACE(value))
3246 ANYOF_BITMAP_CLEAR(data->start_class, value);
3247 }
3248 }
3249 else {
3250 if (data->start_class->flags & ANYOF_LOCALE)
3251 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3252 else {
3253 for (value = 0; value < 256; value++)
3254 if (isSPACE(value))
b81d288d 3255 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3256 }
3257 }
3258 break;
3259 case SPACEL:
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 }
3264 else {
3265 data->start_class->flags |= ANYOF_LOCALE;
3266 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3267 }
3268 break;
3269 case NSPACE:
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3273 for (value = 0; value < 256; value++)
3274 if (isSPACE(value))
3275 ANYOF_BITMAP_CLEAR(data->start_class, value);
3276 }
3277 }
3278 else {
3279 if (data->start_class->flags & ANYOF_LOCALE)
3280 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3281 else {
3282 for (value = 0; value < 256; value++)
3283 if (!isSPACE(value))
b81d288d 3284 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3285 }
3286 }
3287 break;
3288 case NSPACEL:
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (data->start_class->flags & ANYOF_LOCALE) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3295 }
3296 }
3297 else {
3298 data->start_class->flags |= ANYOF_LOCALE;
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3300 }
3301 break;
3302 case DIGIT:
3303 if (flags & SCF_DO_STCLASS_AND) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3305 for (value = 0; value < 256; value++)
3306 if (!isDIGIT(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3308 }
3309 else {
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3312 else {
3313 for (value = 0; value < 256; value++)
3314 if (isDIGIT(value))
b81d288d 3315 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3316 }
3317 }
3318 break;
3319 case NDIGIT:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3322 for (value = 0; value < 256; value++)
3323 if (isDIGIT(value))
3324 ANYOF_BITMAP_CLEAR(data->start_class, value);
3325 }
3326 else {
3327 if (data->start_class->flags & ANYOF_LOCALE)
3328 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3329 else {
3330 for (value = 0; value < 256; value++)
3331 if (!isDIGIT(value))
b81d288d 3332 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3333 }
3334 }
3335 break;
3336 }
3337 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3338 cl_and(data->start_class, and_withp);
653099ff
GS
3339 flags &= ~SCF_DO_STCLASS;
3340 }
a0ed51b3 3341 }
3dab1dad 3342 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3343 data->flags |= (OP(scan) == MEOL
3344 ? SF_BEFORE_MEOL
3345 : SF_BEFORE_SEOL);
a0ed51b3 3346 }
3dab1dad 3347 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3348 /* Lookbehind, or need to calculate parens/evals/stclass: */
3349 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3350 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3351 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3352 || OP(scan) == UNLESSM )
3353 {
3354 /* Negative Lookahead/lookbehind
3355 In this case we can't do fixed string optimisation.
3356 */
3357
3358 I32 deltanext, minnext, fake = 0;
3359 regnode *nscan;
3360 struct regnode_charclass_class intrnl;
3361 int f = 0;
3362
3363 data_fake.flags = 0;
3364 if (data) {
3365 data_fake.whilem_c = data->whilem_c;
3366 data_fake.last_closep = data->last_closep;
a0ed51b3 3367 }
1de06328
YO
3368 else
3369 data_fake.last_closep = &fake;
3370 if ( flags & SCF_DO_STCLASS && !scan->flags
3371 && OP(scan) == IFMATCH ) { /* Lookahead */
3372 cl_init(pRExC_state, &intrnl);
3373 data_fake.start_class = &intrnl;
3374 f |= SCF_DO_STCLASS_AND;
c277df42 3375 }
1de06328
YO
3376 if (flags & SCF_WHILEM_VISITED_POS)
3377 f |= SCF_WHILEM_VISITED_POS;
3378 next = regnext(scan);
3379 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3380 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3381 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3382 if (scan->flags) {
3383 if (deltanext) {
3384 vFAIL("Variable length lookbehind not implemented");
3385 }
3386 else if (minnext > (I32)U8_MAX) {
3387 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3388 }
3389 scan->flags = (U8)minnext;
3390 }
3391 if (data) {
3392 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3393 pars++;
3394 if (data_fake.flags & SF_HAS_EVAL)
3395 data->flags |= SF_HAS_EVAL;
3396 data->whilem_c = data_fake.whilem_c;
3397 }
3398 if (f & SCF_DO_STCLASS_AND) {
3399 const int was = (data->start_class->flags & ANYOF_EOS);
3400
3401 cl_and(data->start_class, &intrnl);
3402 if (was)
3403 data->start_class->flags |= ANYOF_EOS;
3404 }
be8e71aa 3405 }
1de06328
YO
3406#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3407 else {
3408 /* Positive Lookahead/lookbehind
3409 In this case we can do fixed string optimisation,
3410 but we must be careful about it. Note in the case of
3411 lookbehind the positions will be offset by the minimum
3412 length of the pattern, something we won't know about
3413 until after the recurse.
3414 */
3415 I32 deltanext, fake = 0;
3416 regnode *nscan;
3417 struct regnode_charclass_class intrnl;
3418 int f = 0;
3419 /* We use SAVEFREEPV so that when the full compile
3420 is finished perl will clean up the allocated
3421 minlens when its all done. This was we don't
3422 have to worry about freeing them when we know
3423 they wont be used, which would be a pain.
3424 */
3425 I32 *minnextp;
3426 Newx( minnextp, 1, I32 );
3427 SAVEFREEPV(minnextp);
3428
3429 if (data) {
3430 StructCopy(data, &data_fake, scan_data_t);
3431 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3432 f |= SCF_DO_SUBSTR;
3433 if (scan->flags)
3434 scan_commit(pRExC_state, &data_fake,minlenp);
3435 data_fake.last_found=newSVsv(data->last_found);
3436 }
3437 }
3438 else
3439 data_fake.last_closep = &fake;
3440 data_fake.flags = 0;
3441 if (is_inf)
3442 data_fake.flags |= SF_IS_INF;
3443 if ( flags & SCF_DO_STCLASS && !scan->flags
3444 && OP(scan) == IFMATCH ) { /* Lookahead */
3445 cl_init(pRExC_state, &intrnl);
3446 data_fake.start_class = &intrnl;
3447 f |= SCF_DO_STCLASS_AND;
3448 }
3449 if (flags & SCF_WHILEM_VISITED_POS)
3450 f |= SCF_WHILEM_VISITED_POS;
3451 next = regnext(scan);
3452 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3453
3454 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3455 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3456 if (scan->flags) {
3457 if (deltanext) {
3458 vFAIL("Variable length lookbehind not implemented");
3459 }
3460 else if (*minnextp > (I32)U8_MAX) {
3461 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3462 }
3463 scan->flags = (U8)*minnextp;
3464 }
40d049e4 3465
1de06328 3466 *minnextp += min;
40d049e4 3467
1de06328
YO
3468 if (f & SCF_DO_STCLASS_AND) {
3469 const int was = (data->start_class->flags & ANYOF_EOS);
3470
3471 cl_and(data->start_class, &intrnl);
3472 if (was)
3473 data->start_class->flags |= ANYOF_EOS;
40d049e4 3474 }
1de06328
YO
3475 if (data) {
3476 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3477 pars++;
3478 if (data_fake.flags & SF_HAS_EVAL)
3479 data->flags |= SF_HAS_EVAL;
3480 data->whilem_c = data_fake.whilem_c;
3481 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3482 if (RExC_rx->minlen<*minnextp)
3483 RExC_rx->minlen=*minnextp;
3484 scan_commit(pRExC_state, &data_fake, minnextp);
3485 SvREFCNT_dec(data_fake.last_found);
3486
3487 if ( data_fake.minlen_fixed != minlenp )
3488 {
3489 data->offset_fixed= data_fake.offset_fixed;
3490 data->minlen_fixed= data_fake.minlen_fixed;
3491 data->lookbehind_fixed+= scan->flags;
3492 }
3493 if ( data_fake.minlen_float != minlenp )
3494 {
3495 data->minlen_float= data_fake.minlen_float;
3496 data->offset_float_min=data_fake.offset_float_min;
3497 data->offset_float_max=data_fake.offset_float_max;
3498 data->lookbehind_float+= scan->flags;
3499 }
3500 }
3501 }
3502
653099ff 3503
653099ff 3504 }
1de06328 3505#endif
a0ed51b3
LW
3506 }
3507 else if (OP(scan) == OPEN) {
40d049e4
YO
3508 if (stopparen != (I32)ARG(scan))
3509 pars++;
a0ed51b3 3510 }
cb434fcc 3511 else if (OP(scan) == CLOSE) {
40d049e4
YO
3512 if (stopparen == (I32)ARG(scan)) {
3513 break;
3514 }
eb160463 3515 if ((I32)ARG(scan) == is_par) {
cb434fcc 3516 next = regnext(scan);
c277df42 3517
cb434fcc
IZ
3518 if ( next && (OP(next) != WHILEM) && next < last)
3519 is_par = 0; /* Disable optimization */
3520 }
3521 if (data)
3522 *(data->last_closep) = ARG(scan);
a0ed51b3 3523 }
1a147d38 3524 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3525 /* set the pointer */
3526 I32 paren;
3527 regnode *start;
3528 regnode *end;
1a147d38 3529 if (OP(scan) == GOSUB) {
40d049e4
YO
3530 paren = ARG(scan);
3531 RExC_recurse[ARG2L(scan)] = scan;
3532 start = RExC_open_parens[paren-1];
3533 end = RExC_close_parens[paren-1];
3534 } else {
3535 paren = 0;
3536 start = RExC_rx->program + 1;
3537 end = RExC_opend;
3538 }
3539 assert(start);
3540 assert(end);
3541 if (!recursed) {
3542 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3543 SAVEFREEPV(recursed);
3544 }
3545 if (!PAREN_TEST(recursed,paren+1)) {
3546 I32 deltanext = 0;
3547 PAREN_SET(recursed,paren+1);
3548
3549 DEBUG_PEEP("goto",start,depth);
3550 min += study_chunk(
3551 pRExC_state,
3552 &start,
3553 minlenp,
3554 &deltanext,
3555 end+1,
3556 data,
3557 paren,
3558 recursed,
3559 and_withp,
3560 flags,depth+1);
3561 delta+=deltanext;
3562 if (deltanext == I32_MAX) {
3563 is_inf = is_inf_internal = 1;
3564 delta=deltanext;
3565 }
3566 DEBUG_PEEP("rtrn",end,depth);
3567 PAREN_UNSET(recursed,paren+1);
3568 } else {
3569 if (flags & SCF_DO_SUBSTR) {
3570 scan_commit(pRExC_state,data,minlenp);
3571 data->longest = &(data->longest_float);
3572 }
3573 is_inf = is_inf_internal = 1;
3574 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3575 cl_anything(pRExC_state, data->start_class);
3576 flags &= ~SCF_DO_STCLASS;
3577 }
3578 }
a0ed51b3 3579 else if (OP(scan) == EVAL) {
c277df42
IZ
3580 if (data)
3581 data->flags |= SF_HAS_EVAL;
3582 }
40d049e4
YO
3583 else if ( OP(scan)==OPFAIL ) {
3584 if (flags & SCF_DO_SUBSTR) {
3585 scan_commit(pRExC_state,data,minlenp);
3586 flags &= ~SCF_DO_SUBSTR;
3587 }
3588 }
3589 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3590 {
0f5d15d6 3591 if (flags & SCF_DO_SUBSTR) {
1de06328 3592 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3593 data->longest = &(data->longest_float);
3594 }
3595 is_inf = is_inf_internal = 1;
653099ff 3596 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3597 cl_anything(pRExC_state, data->start_class);
96776eda 3598 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3599 }
786e8c11 3600#ifdef TRIE_STUDY_OPT
40d049e4 3601#ifdef FULL_TRIE_STUDY
786e8c11
YO
3602 else if (PL_regkind[OP(scan)] == TRIE) {
3603 /* NOTE - There is similar code to this block above for handling
40d049e4 3604 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3605 check there too. */
7f69552c 3606 regnode *trie_node= scan;
786e8c11
YO
3607 regnode *tail= regnext(scan);
3608 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3609 I32 max1 = 0, min1 = I32_MAX;
3610 struct regnode_charclass_class accum;
3611
3612 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3613 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3614 if (flags & SCF_DO_STCLASS)
3615 cl_init_zero(pRExC_state, &accum);
3616
3617 if (!trie->jump) {
3618 min1= trie->minlen;
3619 max1= trie->maxlen;
3620 } else {
3621 const regnode *nextbranch= NULL;
3622 U32 word;
3623
3624 for ( word=1 ; word <= trie->wordcount ; word++)
3625 {
3626 I32 deltanext=0, minnext=0, f = 0, fake;
3627 struct regnode_charclass_class this_class;
3628
3629 data_fake.flags = 0;
3630 if (data) {
3631 data_fake.whilem_c = data->whilem_c;
3632 data_fake.last_closep = data->last_closep;
3633 }
3634 else
3635 data_fake.last_closep = &fake;
3636
3637 if (flags & SCF_DO_STCLASS) {
3638 cl_init(pRExC_state, &this_class);
3639 data_fake.start_class = &this_class;
3640 f = SCF_DO_STCLASS_AND;
3641 }
3642 if (flags & SCF_WHILEM_VISITED_POS)
3643 f |= SCF_WHILEM_VISITED_POS;
3644
3645 if (trie->jump[word]) {
3646 if (!nextbranch)
7f69552c
YO
3647 nextbranch = trie_node + trie->jump[0];
3648 scan= trie_node + trie->jump[word];
786e8c11
YO
3649 /* We go from the jump point to the branch that follows
3650 it. Note this means we need the vestigal unused branches
3651 even though they arent otherwise used.
3652 */
40d049e4
YO
3653 minnext = study_chunk(pRExC_state, &scan, minlenp,
3654 &deltanext, (regnode *)nextbranch, &data_fake,
3655 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3656 }
3657 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3658 nextbranch= regnext((regnode*)nextbranch);
3659
3660 if (min1 > (I32)(minnext + trie->minlen))
3661 min1 = minnext + trie->minlen;
3662 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3663 max1 = minnext + deltanext + trie->maxlen;
3664 if (deltanext == I32_MAX)
3665 is_inf = is_inf_internal = 1;
3666
3667 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3668 pars++;
3669
3670 if (data) {
3671 if (data_fake.flags & SF_HAS_EVAL)
3672 data->flags |= SF_HAS_EVAL;
3673 data->whilem_c = data_fake.whilem_c;
3674 }
3675 if (flags & SCF_DO_STCLASS)
3676 cl_or(pRExC_state, &accum, &this_class);
3677 }
3678 }
3679 if (flags & SCF_DO_SUBSTR) {
3680 data->pos_min += min1;
3681 data->pos_delta += max1 - min1;
3682 if (max1 != min1 || is_inf)
3683 data->longest = &(data->longest_float);
3684 }
3685 min += min1;
3686 delta += max1 - min1;
3687 if (flags & SCF_DO_STCLASS_OR) {
3688 cl_or(pRExC_state, data->start_class, &accum);
3689 if (min1) {
40d049e4 3690 cl_and(data->start_class, and_withp);
786e8c11
YO
3691 flags &= ~SCF_DO_STCLASS;
3692 }
3693 }
3694 else if (flags & SCF_DO_STCLASS_AND) {
3695 if (min1) {
3696 cl_and(data->start_class, &accum);
3697 flags &= ~SCF_DO_STCLASS;
3698 }
3699 else {
3700 /* Switch to OR mode: cache the old value of
3701 * data->start_class */
40d049e4
YO
3702 INIT_AND_WITHP;
3703 StructCopy(data->start_class, and_withp,
786e8c11
YO
3704 struct regnode_charclass_class);
3705 flags &= ~SCF_DO_STCLASS_AND;
3706 StructCopy(&accum, data->start_class,
3707 struct regnode_charclass_class);
3708 flags |= SCF_DO_STCLASS_OR;
3709 data->start_class->flags |= ANYOF_EOS;
3710 }
3711 }
3712 scan= tail;
3713 continue;
3714 }
3715#else
3716 else if (PL_regkind[OP(scan)] == TRIE) {
3717 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3718 U8*bang=NULL;
3719
3720 min += trie->minlen;
3721 delta += (trie->maxlen - trie->minlen);
3722 flags &= ~SCF_DO_STCLASS; /* xxx */
3723 if (flags & SCF_DO_SUBSTR) {
1de06328 3724 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3725 data->pos_min += trie->minlen;
3726 data->pos_delta += (trie->maxlen - trie->minlen);
3727 if (trie->maxlen != trie->minlen)
3728 data->longest = &(data->longest_float);
3729 }
3730 if (trie->jump) /* no more substrings -- for now /grr*/
3731 flags &= ~SCF_DO_SUBSTR;
3732 }
3733#endif /* old or new */
3734#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3735 /* Else: zero-length, ignore. */
3736 scan = regnext(scan);
3737 }
3738
3739 finish:
3740 *scanp = scan;
aca2d497 3741 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3742 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3743 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3744 if (is_par > (I32)U8_MAX)
c277df42
IZ
3745 is_par = 0;
3746 if (is_par && pars==1 && data) {
3747 data->flags |= SF_IN_PAR;
3748 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3749 }
3750 else if (pars && data) {
c277df42
IZ
3751 data->flags |= SF_HAS_PAR;
3752 data->flags &= ~SF_IN_PAR;
3753 }
653099ff 3754 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3755 cl_and(data->start_class, and_withp);
786e8c11
YO
3756 if (flags & SCF_TRIE_RESTUDY)
3757 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3758
3759 DEBUG_STUDYDATA(data,depth);
3760
c277df42
IZ
3761 return min;
3762}
3763
76e3520e 3764STATIC I32
5f66b61c 3765S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3766{
830247a4 3767 if (RExC_rx->data) {
2eb97020 3768 const U32 count = RExC_rx->data->count;
b81d288d 3769 Renewc(RExC_rx->data,
2eb97020 3770 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3771 char, struct reg_data);
2eb97020 3772 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3773 RExC_rx->data->count += n;
a0ed51b3
LW
3774 }
3775 else {
a02a5408 3776 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3777 char, struct reg_data);
a02a5408 3778 Newx(RExC_rx->data->what, n, U8);
830247a4 3779 RExC_rx->data->count = n;
c277df42 3780 }
830247a4
IZ
3781 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3782 return RExC_rx->data->count - n;
c277df42
IZ
3783}
3784
76234dfb 3785#ifndef PERL_IN_XSUB_RE
d88dccdf 3786void
864dbfa3 3787Perl_reginitcolors(pTHX)
d88dccdf 3788{
97aff369 3789 dVAR;
1df70142 3790 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3791 if (s) {
1df70142
AL
3792 char *t = savepv(s);
3793 int i = 0;
3794 PL_colors[0] = t;
d88dccdf 3795 while (++i < 6) {
1df70142
AL
3796 t = strchr(t, '\t');
3797 if (t) {
3798 *t = '\0';
3799 PL_colors[i] = ++t;
d88dccdf
IZ
3800 }
3801 else
1df70142 3802 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3803 }
3804 } else {
1df70142 3805 int i = 0;
b81d288d 3806 while (i < 6)
06b5626a 3807 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3808 }
3809 PL_colorset = 1;
3810}
76234dfb 3811#endif
8615cb43 3812
07be1b83 3813
786e8c11
YO
3814#ifdef TRIE_STUDY_OPT
3815#define CHECK_RESTUDY_GOTO \
3816 if ( \
3817 (data.flags & SCF_TRIE_RESTUDY) \
3818 && ! restudied++ \
3819 ) goto reStudy
3820#else
3821#define CHECK_RESTUDY_GOTO
3822#endif
f9f4320a 3823
a687059c 3824/*
e50aee73 3825 - pregcomp - compile a regular expression into internal code
a687059c
LW
3826 *
3827 * We can't allocate space until we know how big the compiled form will be,
3828 * but we can't compile it (and thus know how big it is) until we've got a
3829 * place to put the code. So we cheat: we compile it twice, once with code
3830 * generation turned off and size counting turned on, and once "for real".
3831 * This also means that we don't allocate space until we are sure that the
3832 * thing really will compile successfully, and we never have to move the
3833 * code and thus invalidate pointers into it. (Note that it has to be in
3834 * one piece because free() must be able to free it all.) [NB: not true in perl]
3835 *
3836 * Beware that the optimization-preparation code in here knows about some
3837 * of the structure of the compiled regexp. [I'll say.]
3838 */
b9b4dddf
YO
3839
3840
3841
f9f4320a 3842#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3843#define RE_ENGINE_PTR &PL_core_reg_engine
3844#else
f9f4320a
YO
3845extern const struct regexp_engine my_reg_engine;
3846#define RE_ENGINE_PTR &my_reg_engine
3847#endif
b9b4dddf
YO
3848/* these make a few things look better, to avoid indentation */
3849#define BEGIN_BLOCK {
f9f4320a
YO
3850#define END_BLOCK }
3851
a687059c 3852regexp *
864dbfa3 3853Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3854{
97aff369 3855 dVAR;
f9f4320a
YO
3856 GET_RE_DEBUG_FLAGS_DECL;
3857 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3858#ifndef PERL_IN_XSUB_RE
3859 BEGIN_BLOCK
f9f4320a
YO
3860 /* Dispatch a request to compile a regexp to correct
3861 regexp engine. */
3862 HV * const table = GvHV(PL_hintgv);
3863 if (table) {
3864 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3865 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3866 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3867 DEBUG_COMPILE_r({
8d8756e7 3868 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3869 SvIV(*ptr));
3870 });
f2f78491 3871 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3872 }
b9b4dddf
YO
3873 }
3874 END_BLOCK
3875#endif
3876 BEGIN_BLOCK
a0d0e21e 3877 register regexp *r;
c277df42 3878 regnode *scan;
c277df42 3879 regnode *first;
a0d0e21e 3880 I32 flags;
a0d0e21e
LW
3881 I32 minlen = 0;
3882 I32 sawplus = 0;
3883 I32 sawopen = 0;
2c2d71f5 3884 scan_data_t data;
830247a4 3885 RExC_state_t RExC_state;
be8e71aa 3886 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3887#ifdef TRIE_STUDY_OPT
3888 int restudied= 0;
3889 RExC_state_t copyRExC_state;
3890#endif
a0d0e21e 3891 if (exp == NULL)
c277df42 3892 FAIL("NULL regexp argument");
a0d0e21e 3893
a5961de5 3894 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3895
5cfc7842 3896 RExC_precomp = exp;
a3621e74 3897 DEBUG_COMPILE_r({
ab3bbdeb
YO
3898 SV *dsv= sv_newmortal();
3899 RE_PV_QUOTED_DECL(s, RExC_utf8,
3900 dsv, RExC_precomp, (xend - exp), 60);
3901 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3902 PL_colors[4],PL_colors[5],s);
a5961de5 3903 });
e2509266 3904 RExC_flags = pm->op_pmflags;
830247a4 3905 RExC_sawback = 0;
bbce6d69 3906
830247a4
IZ
3907 RExC_seen = 0;
3908 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3909 RExC_seen_evals = 0;
3910 RExC_extralen = 0;
c277df42 3911
bbce6d69 3912 /* First pass: determine size, legality. */
830247a4 3913 RExC_parse = exp;
fac92740 3914 RExC_start = exp;
830247a4
IZ
3915 RExC_end = xend;
3916 RExC_naughty = 0;
3917 RExC_npar = 1;
3918 RExC_size = 0L;
3919 RExC_emit = &PL_regdummy;
3920 RExC_whilem_seen = 0;
fc8cd66c 3921 RExC_charnames = NULL;
40d049e4
YO
3922 RExC_open_parens = NULL;
3923 RExC_close_parens = NULL;
3924 RExC_opend = NULL;
81714fb9 3925 RExC_paren_names = NULL;
40d049e4
YO
3926 RExC_recurse = NULL;
3927 RExC_recurse_count = 0;
81714fb9 3928
85ddcde9
JH
3929#if 0 /* REGC() is (currently) a NOP at the first pass.
3930 * Clever compilers notice this and complain. --jhi */
830247a4 3931 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3932#endif
3dab1dad
YO
3933 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3934 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3935 RExC_precomp = NULL;
a0d0e21e
LW
3936 return(NULL);
3937 }
07be1b83 3938 DEBUG_PARSE_r({
81714fb9
YO
3939 PerlIO_printf(Perl_debug_log,
3940 "Required size %"IVdf" nodes\n"
3941 "Starting second pass (creation)\n",
3942 (IV)RExC_size);
07be1b83
YO
3943 RExC_lastnum=0;
3944 RExC_lastparse=NULL;
3945 });
c277df42
IZ
3946 /* Small enough for pointer-storage convention?
3947 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3948 if (RExC_size >= 0x10000L && RExC_extralen)
3949 RExC_size += RExC_extralen;
c277df42 3950 else
830247a4
IZ
3951 RExC_extralen = 0;
3952 if (RExC_whilem_seen > 15)
3953 RExC_whilem_seen = 15;
a0d0e21e 3954
f9f4320a
YO
3955 /* Allocate space and zero-initialize. Note, the two step process
3956 of zeroing when in debug mode, thus anything assigned has to
3957 happen after that */
a02a5408 3958 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3959 char, regexp);
a0d0e21e 3960 if (r == NULL)
b45f050a 3961 FAIL("Regexp out of space");
0f79a09d
GS
3962#ifdef DEBUGGING
3963 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3964 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3965#endif
f9f4320a
YO
3966 /* initialization begins here */
3967 r->engine= RE_ENGINE_PTR;
c277df42 3968 r->refcnt = 1;
bbce6d69 3969 r->prelen = xend - exp;
5cfc7842 3970 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3971 r->subbeg = NULL;
f8c7b90f 3972#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3973 r->saved_copy = NULL;
ed252734 3974#endif
cf93c79d 3975 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3976 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3977 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3978
3979 r->substrs = 0; /* Useful during FAIL. */
3980 r->startp = 0; /* Useful during FAIL. */
6bda09f9 3981 r->endp = 0;
81714fb9
YO
3982 r->paren_names = 0;
3983
6bda09f9 3984 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
3985 Newxz(RExC_open_parens, RExC_npar,regnode *);
3986 SAVEFREEPV(RExC_open_parens);
3987 Newxz(RExC_close_parens,RExC_npar,regnode *);
3988 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
3989 }
3990
3991 /* Useful during FAIL. */
a02a5408 3992 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3993 if (r->offsets) {
2af232bd 3994 r->offsets[0] = RExC_size;
fac92740 3995 }
a3621e74 3996 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3997 "%s %"UVuf" bytes for offset annotations.\n",
3998 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3999 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4000
830247a4 4001 RExC_rx = r;
bbce6d69 4002
4003 /* Second pass: emit code. */
e2509266 4004 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4005 RExC_parse = exp;
4006 RExC_end = xend;
4007 RExC_naughty = 0;
4008 RExC_npar = 1;
fac92740 4009 RExC_emit_start = r->program;
830247a4 4010 RExC_emit = r->program;
2cd61cdb 4011 /* Store the count of eval-groups for security checks: */
786e8c11 4012 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4013 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4014 r->data = 0;
3dab1dad 4015 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4016 return(NULL);
6bda09f9 4017
07be1b83
YO
4018 /* XXXX To minimize changes to RE engine we always allocate
4019 3-units-long substrs field. */
4020 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4021 if (RExC_recurse_count) {
4022 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4023 SAVEFREEPV(RExC_recurse);
4024 }
a0d0e21e 4025
07be1b83 4026reStudy:
1de06328 4027 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4028 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4029
07be1b83
YO
4030#ifdef TRIE_STUDY_OPT
4031 if ( restudied ) {
4032 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4033 RExC_state=copyRExC_state;
1de06328 4034 if (data.last_found) {
07be1b83 4035 SvREFCNT_dec(data.longest_fixed);
07be1b83 4036 SvREFCNT_dec(data.longest_float);
07be1b83 4037 SvREFCNT_dec(data.last_found);
1de06328 4038 }
40d049e4 4039 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4040 } else {
40d049e4 4041 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83
YO
4042 copyRExC_state=RExC_state;
4043 }
40d049e4
YO
4044#else
4045 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4046#endif
fc8cd66c 4047
a0d0e21e 4048 /* Dig out information for optimizations. */
cf93c79d 4049 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4050 pm->op_pmflags = RExC_flags;
a0ed51b3 4051 if (UTF)
5ff6fc6d 4052 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4053 r->regstclass = NULL;
830247a4 4054 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4055 r->reganch |= ROPT_NAUGHTY;
c277df42 4056 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4057
1de06328
YO
4058 /* testing for BRANCH here tells us whether there is "must appear"
4059 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4060 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4061 I32 fake;
c5254dd6 4062 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4063 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4064 int stclass_flag;
07be1b83 4065 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4066
4067 first = scan;
c277df42 4068 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4069 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4070 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4071 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4072 /* for now we can't handle lookbehind IFMATCH*/
4073 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4074 (OP(first) == PLUS) ||
4075 (OP(first) == MINMOD) ||
653099ff 4076 /* An {n,m} with n>0 */
07be1b83
YO
4077 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4078 {
786e8c11 4079
a0d0e21e
LW
4080 if (OP(first) == PLUS)
4081 sawplus = 1;
4082 else
3dab1dad 4083 first += regarglen[OP(first)];
07be1b83
YO
4084 if (OP(first) == IFMATCH) {
4085 first = NEXTOPER(first);
4086 first += EXTRA_STEP_2ARGS;
7c167cea 4087 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4088 first = NEXTOPER(first);
a687059c
LW
4089 }
4090
a0d0e21e
LW
4091 /* Starting-point info. */
4092 again:
786e8c11 4093 DEBUG_PEEP("first:",first,0);
07be1b83 4094 /* Ignore EXACT as we deal with it later. */
3dab1dad 4095 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4096 if (OP(first) == EXACT)
6f207bd3 4097 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4098 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4099 r->regstclass = first;
4100 }
07be1b83 4101#ifdef TRIE_STCLASS
786e8c11 4102 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4103 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4104 {
786e8c11 4105 regnode *trie_op;
07be1b83 4106 /* this can happen only on restudy */
786e8c11
YO
4107 if ( OP(first) == TRIE ) {
4108 struct regnode_1 *trieop;
4109 Newxz(trieop,1,struct regnode_1);
4110 StructCopy(first,trieop,struct regnode_1);
4111 trie_op=(regnode *)trieop;
4112 } else {
4113 struct regnode_charclass *trieop;
4114 Newxz(trieop,1,struct regnode_charclass);
4115 StructCopy(first,trieop,struct regnode_charclass);
4116 trie_op=(regnode *)trieop;
4117 }
1de06328 4118 OP(trie_op)+=2;
786e8c11
YO
4119 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4120 r->regstclass = trie_op;
07be1b83
YO
4121 }
4122#endif
bfed75c6 4123 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4124 r->regstclass = first;
3dab1dad
YO
4125 else if (PL_regkind[OP(first)] == BOUND ||
4126 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4127 r->regstclass = first;
3dab1dad 4128 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4129 r->reganch |= (OP(first) == MBOL
4130 ? ROPT_ANCH_MBOL
4131 : (OP(first) == SBOL
4132 ? ROPT_ANCH_SBOL
4133 : ROPT_ANCH_BOL));
a0d0e21e 4134 first = NEXTOPER(first);
774d564b 4135 goto again;
4136 }
4137 else if (OP(first) == GPOS) {
4138 r->reganch |= ROPT_ANCH_GPOS;
4139 first = NEXTOPER(first);
4140 goto again;
a0d0e21e 4141 }
e09294f4 4142 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4143 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4144 !(r->reganch & ROPT_ANCH) )
4145 {
4146 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4147 const int type =
4148 (OP(NEXTOPER(first)) == REG_ANY)
4149 ? ROPT_ANCH_MBOL
4150 : ROPT_ANCH_SBOL;
cad2e5aa 4151 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4152 first = NEXTOPER(first);
774d564b 4153 goto again;
a0d0e21e 4154 }
b81d288d 4155 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4156 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4157 /* x+ must match at the 1st pos of run of x's */
4158 r->reganch |= ROPT_SKIP;
a0d0e21e 4159
c277df42 4160 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4161#ifdef TRIE_STUDY_OPT
81714fb9 4162 DEBUG_PARSE_r(
be8e71aa
YO
4163 if (!restudied)
4164 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4165 (IV)(first - scan + 1))
4166 );
4167#else
81714fb9 4168 DEBUG_PARSE_r(
be8e71aa
YO
4169 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4170 (IV)(first - scan + 1))
4171 );
4172#endif
4173
4174
a0d0e21e
LW
4175 /*
4176 * If there's something expensive in the r.e., find the
4177 * longest literal string that must appear and make it the
4178 * regmust. Resolve ties in favor of later strings, since
4179 * the regstart check works with the beginning of the r.e.
4180 * and avoiding duplication strengthens checking. Not a
4181 * strong reason, but sufficient in the absence of others.
4182 * [Now we resolve ties in favor of the earlier string if
c277df42 4183 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4184 * earlier string may buy us something the later one won't.]
4185 */
a0d0e21e 4186 minlen = 0;
a687059c 4187
396482e1
GA
4188 data.longest_fixed = newSVpvs("");
4189 data.longest_float = newSVpvs("");
4190 data.last_found = newSVpvs("");
c277df42
IZ
4191 data.longest = &(data.longest_fixed);
4192 first = scan;
653099ff 4193 if (!r->regstclass) {
830247a4 4194 cl_init(pRExC_state, &ch_class);
653099ff
GS
4195 data.start_class = &ch_class;
4196 stclass_flag = SCF_DO_STCLASS_AND;
4197 } else /* XXXX Check for BOUND? */
4198 stclass_flag = 0;
cb434fcc 4199 data.last_closep = &last_close;
653099ff 4200
1de06328 4201 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4202 &data, -1, NULL, NULL,
4203 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4204
07be1b83 4205
786e8c11
YO
4206 CHECK_RESTUDY_GOTO;
4207
4208
830247a4 4209 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4210 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4211 && !RExC_seen_zerolen
4212 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4213 r->reganch |= ROPT_CHECK_ALL;
1de06328 4214 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4215 SvREFCNT_dec(data.last_found);
4216
1de06328
YO
4217 /* Note that code very similar to this but for anchored string
4218 follows immediately below, changes may need to be made to both.
4219 Be careful.
4220 */
a0ed51b3 4221 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4222 if (longest_float_length
c277df42
IZ
4223 || (data.flags & SF_FL_BEFORE_EOL
4224 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4225 || (RExC_flags & PMf_MULTILINE))))
4226 {
1182767e 4227 I32 t,ml;
cf93c79d 4228
1de06328 4229 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4230 && data.offset_fixed == data.offset_float_min
4231 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4232 goto remove_float; /* As in (a)+. */
4233
1de06328
YO
4234 /* copy the information about the longest float from the reg_scan_data
4235 over to the program. */
33b8afdf
JH
4236 if (SvUTF8(data.longest_float)) {
4237 r->float_utf8 = data.longest_float;
c445ea15 4238 r->float_substr = NULL;
33b8afdf
JH
4239 } else {
4240 r->float_substr = data.longest_float;
c445ea15 4241 r->float_utf8 = NULL;
33b8afdf 4242 }
1de06328
YO
4243 /* float_end_shift is how many chars that must be matched that
4244 follow this item. We calculate it ahead of time as once the
4245 lookbehind offset is added in we lose the ability to correctly
4246 calculate it.*/
4247 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4248 : (I32)longest_float_length;
1de06328
YO
4249 r->float_end_shift = ml - data.offset_float_min
4250 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4251 + data.lookbehind_float;
4252 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4253 r->float_max_offset = data.offset_float_max;
1182767e 4254 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4255 r->float_max_offset -= data.lookbehind_float;
4256
cf93c79d
IZ
4257 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4258 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4259 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4260 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4261 }
4262 else {
aca2d497 4263 remove_float:
c445ea15 4264 r->float_substr = r->float_utf8 = NULL;
c277df42 4265 SvREFCNT_dec(data.longest_float);
c5254dd6 4266 longest_float_length = 0;
a0d0e21e 4267 }
c277df42 4268
1de06328
YO
4269 /* Note that code very similar to this but for floating string
4270 is immediately above, changes may need to be made to both.
4271 Be careful.
4272 */
a0ed51b3 4273 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4274 if (longest_fixed_length
c277df42
IZ
4275 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4276 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4277 || (RExC_flags & PMf_MULTILINE))))
4278 {
1182767e 4279 I32 t,ml;
cf93c79d 4280
1de06328
YO
4281 /* copy the information about the longest fixed
4282 from the reg_scan_data over to the program. */
33b8afdf
JH
4283 if (SvUTF8(data.longest_fixed)) {
4284 r->anchored_utf8 = data.longest_fixed;
c445ea15 4285 r->anchored_substr = NULL;
33b8afdf
JH
4286 } else {
4287 r->anchored_substr = data.longest_fixed;
c445ea15 4288 r->anchored_utf8 = NULL;
33b8afdf 4289 }
1de06328
YO
4290 /* fixed_end_shift is how many chars that must be matched that
4291 follow this item. We calculate it ahead of time as once the
4292 lookbehind offset is added in we lose the ability to correctly
4293 calculate it.*/
4294 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4295 : (I32)longest_fixed_length;
1de06328
YO
4296 r->anchored_end_shift = ml - data.offset_fixed
4297 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4298 + data.lookbehind_fixed;
4299 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4300
cf93c79d
IZ
4301 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4302 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4303 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4304 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4305 }
4306 else {
c445ea15 4307 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4308 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4309 longest_fixed_length = 0;
a0d0e21e 4310 }
b81d288d 4311 if (r->regstclass
ffc61ed2 4312 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4313 r->regstclass = NULL;
33b8afdf
JH
4314 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4315 && stclass_flag
653099ff 4316 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4317 && !cl_is_anything(data.start_class))
4318 {
1df70142 4319 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4320
a02a5408 4321 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4322 struct regnode_charclass_class);
4323 StructCopy(data.start_class,
830247a4 4324 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4325 struct regnode_charclass_class);
830247a4 4326 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4327 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4328 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4329 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4330 PerlIO_printf(Perl_debug_log,
a0288114 4331 "synthetic stclass \"%s\".\n",
3f7c398e 4332 SvPVX_const(sv));});
653099ff 4333 }
c277df42
IZ
4334
4335 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4336 if (longest_fixed_length > longest_float_length) {
1de06328 4337 r->check_end_shift = r->anchored_end_shift;
c277df42 4338 r->check_substr = r->anchored_substr;
33b8afdf 4339 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4340 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4341 if (r->reganch & ROPT_ANCH_SINGLE)
4342 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4343 }
4344 else {
1de06328 4345 r->check_end_shift = r->float_end_shift;
c277df42 4346 r->check_substr = r->float_substr;
33b8afdf 4347 r->check_utf8 = r->float_utf8;
1de06328
YO
4348 r->check_offset_min = r->float_min_offset;
4349 r->check_offset_max = r->float_max_offset;
a0d0e21e 4350 }
30382c73
IZ
4351 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4352 This should be changed ASAP! */
33b8afdf 4353 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4354 r->reganch |= RE_USE_INTUIT;
33b8afdf 4355 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4356 r->reganch |= RE_INTUIT_TAIL;
4357 }
1de06328
YO
4358 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4359 if ( (STRLEN)minlen < longest_float_length )
4360 minlen= longest_float_length;
4361 if ( (STRLEN)minlen < longest_fixed_length )
4362 minlen= longest_fixed_length;
4363 */
a0ed51b3
LW
4364 }
4365 else {
c277df42
IZ
4366 /* Several toplevels. Best we can is to set minlen. */
4367 I32 fake;
653099ff 4368 struct regnode_charclass_class ch_class;
cb434fcc 4369 I32 last_close = 0;
c277df42 4370
a3621e74 4371 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4372
c277df42 4373 scan = r->program + 1;
830247a4 4374 cl_init(pRExC_state, &ch_class);
653099ff 4375 data.start_class = &ch_class;
cb434fcc 4376 data.last_closep = &last_close;
07be1b83 4377
1de06328 4378 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4379 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
07be1b83 4380
786e8c11 4381 CHECK_RESTUDY_GOTO;
07be1b83 4382
33b8afdf 4383 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4384 = r->float_substr = r->float_utf8 = NULL;
653099ff 4385 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4386 && !cl_is_anything(data.start_class))
4387 {
1df70142 4388 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4389
a02a5408 4390 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4391 struct regnode_charclass_class);
4392 StructCopy(data.start_class,
830247a4 4393 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4394 struct regnode_charclass_class);
830247a4 4395 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4396 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4397 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4398 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4399 PerlIO_printf(Perl_debug_log,
a0288114 4400 "synthetic stclass \"%s\".\n",
3f7c398e 4401 SvPVX_const(sv));});
653099ff 4402 }
a0d0e21e
LW
4403 }
4404
1de06328
YO
4405 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4406 the "real" pattern. */
4407 if (r->minlen < minlen)
4408 r->minlen = minlen;
4409
b81d288d 4410 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4411 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4412 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4413 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4414 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4415 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4416 if (RExC_seen & REG_SEEN_CANY)
4417 r->reganch |= ROPT_CANY_SEEN;
81714fb9
YO
4418 if (RExC_paren_names)
4419 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4420 else
4421 r->paren_names = NULL;
4422
40d049e4
YO
4423 if (RExC_recurse_count) {
4424 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4425 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4426 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4427 }
4428 }
a02a5408
JC
4429 Newxz(r->startp, RExC_npar, I32);
4430 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4431
f2278c82 4432 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4433 DEBUG_DUMP_r({
4434 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4435 regdump(r);
4436 });
8e9a8a48
YO
4437 DEBUG_OFFSETS_r(if (r->offsets) {
4438 const U32 len = r->offsets[0];
4439 U32 i;
4440 GET_RE_DEBUG_FLAGS_DECL;
4441 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4442 for (i = 1; i <= len; i++) {
4443 if (r->offsets[i*2-1] || r->offsets[i*2])
4444 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4445 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4446 }
4447 PerlIO_printf(Perl_debug_log, "\n");
4448 });
a0d0e21e 4449 return(r);
f9f4320a 4450 END_BLOCK
a687059c
LW
4451}
4452
f9f4320a
YO
4453#undef CORE_ONLY_BLOCK
4454#undef END_BLOCK
4455#undef RE_ENGINE_PTR
3dab1dad 4456
9af228c6 4457#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4458SV*
4459Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4460{
4461 I32 parno = 0; /* no match */
4462 if (PL_curpm) {
4463 const REGEXP * const rx = PM_GETRE(PL_curpm);
4464 if (rx && rx->paren_names) {
4465 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4466 if (he_str) {
4467 IV i;
4468 SV* sv_dat=HeVAL(he_str);
4469 I32 *nums=(I32*)SvPVX(sv_dat);
4470 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4471 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4472 rx->endp[nums[i]] != -1)
4473 {
4474 parno = nums[i];
4475 break;
4476 }
4477 }
4478 }
4479 }
4480 }
4481 if ( !parno ) {
4482 return 0;
4483 } else {
4484 GV *gv_paren;
4485 SV *sv= sv_newmortal();
4486 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4487 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4488 return GvSVn(gv_paren);
4489 }
4490}
9af228c6 4491#endif
0a4db386 4492
894be9b7 4493/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4494 * If flags is REG_RSN_RETURN_NULL returns null.
4495 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4496 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4497 * to the parsed name as looked up in the RExC_paren_names hash.
4498 * If there is an error throws a vFAIL().. type exception.
894be9b7 4499 */
0a4db386
YO
4500
4501#define REG_RSN_RETURN_NULL 0
4502#define REG_RSN_RETURN_NAME 1
4503#define REG_RSN_RETURN_DATA 2
4504
894be9b7
YO
4505STATIC SV*
4506S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4507 char *name_start = RExC_parse;
0a4db386 4508 if ( UTF ) {
894be9b7 4509 STRLEN numlen;
0a4db386
YO
4510 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4511 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4512 {
4513 RExC_parse += numlen;
4514 }
4515 } else {
4516 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4517 RExC_parse++;
4518 }
0a4db386
YO
4519 if ( flags ) {
4520 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4521 (int)(RExC_parse - name_start)));
894be9b7 4522 if (UTF)
0a4db386
YO
4523 SvUTF8_on(sv_name);
4524 if ( flags == REG_RSN_RETURN_NAME)
4525 return sv_name;
4526 else if (flags==REG_RSN_RETURN_DATA) {
4527 HE *he_str = NULL;
4528 SV *sv_dat = NULL;
4529 if ( ! sv_name ) /* should not happen*/
4530 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4531 if (RExC_paren_names)
4532 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4533 if ( he_str )
4534 sv_dat = HeVAL(he_str);
4535 if ( ! sv_dat )
4536 vFAIL("Reference to nonexistent named group");
4537 return sv_dat;
4538 }
4539 else {
4540 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4541 }
4542 /* NOT REACHED */
894be9b7 4543 }
0a4db386 4544 return NULL;
894be9b7
YO
4545}
4546
3dab1dad
YO
4547#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4548 int rem=(int)(RExC_end - RExC_parse); \
4549 int cut; \
4550 int num; \
4551 int iscut=0; \
4552 if (rem>10) { \
4553 rem=10; \
4554 iscut=1; \
4555 } \
4556 cut=10-rem; \
4557 if (RExC_lastparse!=RExC_parse) \
4558 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4559 rem, RExC_parse, \
4560 cut + 4, \
4561 iscut ? "..." : "<" \
4562 ); \
4563 else \
4564 PerlIO_printf(Perl_debug_log,"%16s",""); \
4565 \
4566 if (SIZE_ONLY) \
4567 num=RExC_size; \
4568 else \
4569 num=REG_NODE_NUM(RExC_emit); \
4570 if (RExC_lastnum!=num) \
0a4db386 4571 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4572 else \
0a4db386 4573 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4574 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4575 (int)((depth*2)), "", \
3dab1dad
YO
4576 (funcname) \
4577 ); \
4578 RExC_lastnum=num; \
4579 RExC_lastparse=RExC_parse; \
4580})
4581
07be1b83
YO
4582
4583
3dab1dad
YO
4584#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4585 DEBUG_PARSE_MSG((funcname)); \
4586 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4587})
6bda09f9
YO
4588#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4589 DEBUG_PARSE_MSG((funcname)); \
4590 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4591})
a687059c
LW
4592/*
4593 - reg - regular expression, i.e. main body or parenthesized thing
4594 *
4595 * Caller must absorb opening parenthesis.
4596 *
4597 * Combining parenthesis handling with the base level of regular expression
4598 * is a trifle forced, but the need to tie the tails of the branches to what
4599 * follows makes it hard to avoid.
4600 */
07be1b83
YO
4601#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4602#ifdef DEBUGGING
4603#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4604#else
4605#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4606#endif
3dab1dad 4607
76e3520e 4608STATIC regnode *
3dab1dad 4609S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4610 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4611{
27da23d5 4612 dVAR;
c277df42
IZ
4613 register regnode *ret; /* Will be the head of the group. */
4614 register regnode *br;
4615 register regnode *lastbr;
cbbf8932 4616 register regnode *ender = NULL;
a0d0e21e 4617 register I32 parno = 0;
cbbf8932
AL
4618 I32 flags;
4619 const I32 oregflags = RExC_flags;
6136c704
AL
4620 bool have_branch = 0;
4621 bool is_open = 0;
9d1d55b5
JP
4622
4623 /* for (?g), (?gc), and (?o) warnings; warning
4624 about (?c) will warn about (?g) -- japhy */
4625
6136c704
AL
4626#define WASTED_O 0x01
4627#define WASTED_G 0x02
4628#define WASTED_C 0x04
4629#define WASTED_GC (0x02|0x04)
cbbf8932 4630 I32 wastedflags = 0x00;
9d1d55b5 4631
fac92740 4632 char * parse_start = RExC_parse; /* MJD */
a28509cc 4633 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4634
3dab1dad
YO
4635 GET_RE_DEBUG_FLAGS_DECL;
4636 DEBUG_PARSE("reg ");
4637
4638
821b33a5 4639 *flagp = 0; /* Tentatively. */
a0d0e21e 4640
9d1d55b5 4641
a0d0e21e
LW
4642 /* Make an OPEN node, if parenthesized. */
4643 if (paren) {
fac92740 4644 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4645 U32 posflags = 0, negflags = 0;
4646 U32 *flagsp = &posflags;
6136c704 4647 bool is_logical = 0;
a28509cc 4648 const char * const seqstart = RExC_parse;
ca9dfc88 4649
830247a4
IZ
4650 RExC_parse++;
4651 paren = *RExC_parse++;
c277df42 4652 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4653 switch (paren) {
894be9b7 4654
fac92740 4655 case '<': /* (?<...) */
b81d288d 4656 if (*RExC_parse == '!')
c277df42 4657 paren = ',';
0a4db386
YO
4658 else if (*RExC_parse != '=')
4659 { /* (?<...>) */
81714fb9 4660 char *name_start;
894be9b7 4661 SV *svname;
81714fb9
YO
4662 paren= '>';
4663 case '\'': /* (?'...') */
4664 name_start= RExC_parse;
0a4db386
YO
4665 svname = reg_scan_name(pRExC_state,
4666 SIZE_ONLY ? /* reverse test from the others */
4667 REG_RSN_RETURN_NAME :
4668 REG_RSN_RETURN_NULL);
81714fb9
YO
4669 if (RExC_parse == name_start)
4670 goto unknown;
4671 if (*RExC_parse != paren)
4672 vFAIL2("Sequence (?%c... not terminated",
4673 paren=='>' ? '<' : paren);
4674 if (SIZE_ONLY) {
e62cc96a
YO
4675 HE *he_str;
4676 SV *sv_dat = NULL;
894be9b7
YO
4677 if (!svname) /* shouldnt happen */
4678 Perl_croak(aTHX_
4679 "panic: reg_scan_name returned NULL");
81714fb9
YO
4680 if (!RExC_paren_names) {
4681 RExC_paren_names= newHV();
4682 sv_2mortal((SV*)RExC_paren_names);
4683 }
4684 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4685 if ( he_str )
81714fb9 4686 sv_dat = HeVAL(he_str);
e62cc96a 4687 if ( ! sv_dat ) {
81714fb9 4688 /* croak baby croak */
e62cc96a
YO
4689 Perl_croak(aTHX_
4690 "panic: paren_name hash element allocation failed");
4691 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4692 IV count=SvIV(sv_dat);
4693 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4694 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4695 pv[count]=RExC_npar;
4696 SvIVX(sv_dat)++;
4697 } else {
4698 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4699 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4700 SvIOK_on(sv_dat);
4701 SvIVX(sv_dat)= 1;
e62cc96a
YO
4702 }
4703
81714fb9
YO
4704 /*sv_dump(sv_dat);*/
4705 }
4706 nextchar(pRExC_state);
4707 paren = 1;
4708 goto capturing_parens;
4709 }
4710 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4711 RExC_parse++;
fac92740
MJD
4712 case '=': /* (?=...) */
4713 case '!': /* (?!...) */
7f69552c
YO
4714 if (*RExC_parse == ')')
4715 goto do_op_fail;
830247a4 4716 RExC_seen_zerolen++;
fac92740
MJD
4717 case ':': /* (?:...) */
4718 case '>': /* (?>...) */
a0d0e21e 4719 break;
24b23f37 4720 case 'C': /* (?CUT) and (?COMMIT) */
54612592
YO
4721 if (RExC_parse[0] == 'O' &&
4722 RExC_parse[1] == 'M' &&
4723 RExC_parse[2] == 'M' &&
4724 RExC_parse[3] == 'I' &&
4725 RExC_parse[4] == 'T' &&
4726 RExC_parse[5] == ')')
4727 {
4728 RExC_parse+=5;
4729 ret = reg_node(pRExC_state, COMMIT);
24b23f37
YO
4730 } else if (
4731 RExC_parse[0] == 'U' &&
4732 RExC_parse[1] == 'T' &&
4733 RExC_parse[2] == ')')
4734 {
4735 RExC_parse+=2;
4736 ret = reg_node(pRExC_state, CUT);
54612592
YO
4737 } else {
4738 vFAIL("Sequence (?C... not terminated");
4739 }
4740 nextchar(pRExC_state);
4741 return ret;
4742 break;
24b23f37
YO
4743 case 'E': /* (?ERROR) */
4744 if (RExC_parse[0] == 'R' &&
4745 RExC_parse[1] == 'R' &&
4746 RExC_parse[2] == 'O' &&
4747 RExC_parse[3] == 'R' &&
4748 RExC_parse[4] == ')')
4749 {
4750 RExC_parse+=4;
4751 ret = reg_node(pRExC_state, OPERROR);
4752 } else {
4753 vFAIL("Sequence (?E... not terminated");
4754 }
4755 nextchar(pRExC_state);
4756 return ret;
4757 break;
7f69552c
YO
4758 case 'F':
4759 if (RExC_parse[0] == 'A' &&
4760 RExC_parse[1] == 'I' &&
4761 RExC_parse[2] == 'L')
4762 RExC_parse+=3;
4763 if (*RExC_parse != ')')
4764 vFAIL("Sequence (?FAIL) or (?F) not terminated");
4765 do_op_fail:
4766 ret = reg_node(pRExC_state, OPFAIL);
4767 nextchar(pRExC_state);
4768 return ret;
4769 break;
fac92740
MJD
4770 case '$': /* (?$...) */
4771 case '@': /* (?@...) */
8615cb43 4772 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4773 break;
fac92740 4774 case '#': /* (?#...) */
830247a4
IZ
4775 while (*RExC_parse && *RExC_parse != ')')
4776 RExC_parse++;
4777 if (*RExC_parse != ')')
c277df42 4778 FAIL("Sequence (?#... not terminated");
830247a4 4779 nextchar(pRExC_state);
a0d0e21e
LW
4780 *flagp = TRYAGAIN;
4781 return NULL;
894be9b7
YO
4782 case '0' : /* (?0) */
4783 case 'R' : /* (?R) */
4784 if (*RExC_parse != ')')
6bda09f9 4785 FAIL("Sequence (?R) not terminated");
1a147d38 4786 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4787 nextchar(pRExC_state);
4788 return ret;
4789 /*notreached*/
894be9b7
YO
4790 { /* named and numeric backreferences */
4791 I32 num;
4792 char * parse_start;
4793 case '&': /* (?&NAME) */
4794 parse_start = RExC_parse - 1;
4795 {
0a4db386
YO
4796 SV *sv_dat = reg_scan_name(pRExC_state,
4797 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4798 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4799 }
4800 goto gen_recurse_regop;
4801 /* NOT REACHED */
6bda09f9
YO
4802 case '1': case '2': case '3': case '4': /* (?1) */
4803 case '5': case '6': case '7': case '8': case '9':
4804 RExC_parse--;
894be9b7
YO
4805 num = atoi(RExC_parse);
4806 parse_start = RExC_parse - 1; /* MJD */
6bda09f9
YO
4807 while (isDIGIT(*RExC_parse))
4808 RExC_parse++;
4809 if (*RExC_parse!=')')
4810 vFAIL("Expecting close bracket");
894be9b7
YO
4811
4812 gen_recurse_regop:
1a147d38 4813 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4814 if (!SIZE_ONLY) {
4815 if (num > (I32)RExC_rx->nparens) {
4816 RExC_parse++;
4817 vFAIL("Reference to nonexistent group");
4818 }
40d049e4 4819 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4820 RExC_emit++;
226de585 4821 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4822 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4823 } else {
6bda09f9 4824 RExC_size++;
6bda09f9 4825 }
0a4db386 4826 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4827 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4828 Set_Node_Offset(ret, parse_start); /* MJD */
4829
6bda09f9
YO
4830 nextchar(pRExC_state);
4831 return ret;
894be9b7
YO
4832 } /* named and numeric backreferences */
4833 /* NOT REACHED */
4834
fac92740 4835 case 'p': /* (?p...) */
9014280d 4836 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4837 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4838 /* FALL THROUGH*/
fac92740 4839 case '?': /* (??...) */
6136c704 4840 is_logical = 1;
438a3801
YST
4841 if (*RExC_parse != '{')
4842 goto unknown;
830247a4 4843 paren = *RExC_parse++;
0f5d15d6 4844 /* FALL THROUGH */
fac92740 4845 case '{': /* (?{...}) */
c277df42 4846 {
c277df42
IZ
4847 I32 count = 1, n = 0;
4848 char c;
830247a4 4849 char *s = RExC_parse;
c277df42 4850
830247a4
IZ
4851 RExC_seen_zerolen++;
4852 RExC_seen |= REG_SEEN_EVAL;
4853 while (count && (c = *RExC_parse)) {
6136c704
AL
4854 if (c == '\\') {
4855 if (RExC_parse[1])
4856 RExC_parse++;
4857 }
b81d288d 4858 else if (c == '{')
c277df42 4859 count++;
b81d288d 4860 else if (c == '}')
c277df42 4861 count--;
830247a4 4862 RExC_parse++;
c277df42 4863 }
6136c704 4864 if (*RExC_parse != ')') {
b81d288d 4865 RExC_parse = s;
b45f050a
JF
4866 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4867 }
c277df42 4868 if (!SIZE_ONLY) {
f3548bdc 4869 PAD *pad;
6136c704
AL
4870 OP_4tree *sop, *rop;
4871 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4872
569233ed
SB
4873 ENTER;
4874 Perl_save_re_context(aTHX);
f3548bdc 4875 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4876 sop->op_private |= OPpREFCOUNTED;
4877 /* re_dup will OpREFCNT_inc */
4878 OpREFCNT_set(sop, 1);
569233ed 4879 LEAVE;
c277df42 4880
830247a4
IZ
4881 n = add_data(pRExC_state, 3, "nop");
4882 RExC_rx->data->data[n] = (void*)rop;
4883 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4884 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4885 SvREFCNT_dec(sv);
a0ed51b3 4886 }
e24b16f9 4887 else { /* First pass */
830247a4 4888 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4889 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4890 /* No compiled RE interpolated, has runtime
4891 components ===> unsafe. */
4892 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4893 if (PL_tainting && PL_tainted)
cc6b7395 4894 FAIL("Eval-group in insecure regular expression");
54df2634 4895#if PERL_VERSION > 8
923e4eb5 4896 if (IN_PERL_COMPILETIME)
b5c19bd7 4897 PL_cv_has_eval = 1;
54df2634 4898#endif
c277df42 4899 }
b5c19bd7 4900
830247a4 4901 nextchar(pRExC_state);
6136c704 4902 if (is_logical) {
830247a4 4903 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4904 if (!SIZE_ONLY)
4905 ret->flags = 2;
3dab1dad 4906 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4907 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4908 return ret;
4909 }
ccb2c380
MP
4910 ret = reganode(pRExC_state, EVAL, n);
4911 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4912 Set_Node_Offset(ret, parse_start);
4913 return ret;
c277df42 4914 }
fac92740 4915 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4916 {
0a4db386 4917 int is_define= 0;
fac92740 4918 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4919 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4920 || RExC_parse[1] == '<'
830247a4 4921 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4922 I32 flag;
4923
830247a4 4924 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4925 if (!SIZE_ONLY)
4926 ret->flags = 1;
3dab1dad 4927 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4928 goto insert_if;
b81d288d 4929 }
a0ed51b3 4930 }
0a4db386
YO
4931 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
4932 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
4933 {
4934 char ch = RExC_parse[0] == '<' ? '>' : '\'';
4935 char *name_start= RExC_parse++;
4936 I32 num = 0;
4937 SV *sv_dat=reg_scan_name(pRExC_state,
4938 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4939 if (RExC_parse == name_start || *RExC_parse != ch)
4940 vFAIL2("Sequence (?(%c... not terminated",
4941 (ch == '>' ? '<' : ch));
4942 RExC_parse++;
4943 if (!SIZE_ONLY) {
4944 num = add_data( pRExC_state, 1, "S" );
4945 RExC_rx->data->data[num]=(void*)sv_dat;
4946 SvREFCNT_inc(sv_dat);
4947 }
4948 ret = reganode(pRExC_state,NGROUPP,num);
4949 goto insert_if_check_paren;
4950 }
4951 else if (RExC_parse[0] == 'D' &&
4952 RExC_parse[1] == 'E' &&
4953 RExC_parse[2] == 'F' &&
4954 RExC_parse[3] == 'I' &&
4955 RExC_parse[4] == 'N' &&
4956 RExC_parse[5] == 'E')
4957 {
4958 ret = reganode(pRExC_state,DEFINEP,0);
4959 RExC_parse +=6 ;
4960 is_define = 1;
4961 goto insert_if_check_paren;
4962 }
4963 else if (RExC_parse[0] == 'R') {
4964 RExC_parse++;
4965 parno = 0;
4966 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4967 parno = atoi(RExC_parse++);
4968 while (isDIGIT(*RExC_parse))
4969 RExC_parse++;
4970 } else if (RExC_parse[0] == '&') {
4971 SV *sv_dat;
4972 RExC_parse++;
4973 sv_dat = reg_scan_name(pRExC_state,
4974 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4975 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4976 }
1a147d38 4977 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
4978 goto insert_if_check_paren;
4979 }
830247a4 4980 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4981 /* (?(1)...) */
6136c704 4982 char c;
830247a4 4983 parno = atoi(RExC_parse++);
c277df42 4984
830247a4
IZ
4985 while (isDIGIT(*RExC_parse))
4986 RExC_parse++;
fac92740 4987 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4988
0a4db386 4989 insert_if_check_paren:
830247a4 4990 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4991 vFAIL("Switch condition not recognized");
c277df42 4992 insert_if:
3dab1dad
YO
4993 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4994 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4995 if (br == NULL)
830247a4 4996 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4997 else
3dab1dad 4998 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4999 c = *nextchar(pRExC_state);
d1b80229
IZ
5000 if (flags&HASWIDTH)
5001 *flagp |= HASWIDTH;
c277df42 5002 if (c == '|') {
0a4db386
YO
5003 if (is_define)
5004 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5005 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5006 regbranch(pRExC_state, &flags, 1,depth+1);
5007 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5008 if (flags&HASWIDTH)
5009 *flagp |= HASWIDTH;
830247a4 5010 c = *nextchar(pRExC_state);
a0ed51b3
LW
5011 }
5012 else
c277df42
IZ
5013 lastbr = NULL;
5014 if (c != ')')
8615cb43 5015 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5016 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5017 REGTAIL(pRExC_state, br, ender);
c277df42 5018 if (lastbr) {
3dab1dad
YO
5019 REGTAIL(pRExC_state, lastbr, ender);
5020 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5021 }
5022 else
3dab1dad 5023 REGTAIL(pRExC_state, ret, ender);
c277df42 5024 return ret;
a0ed51b3
LW
5025 }
5026 else {
830247a4 5027 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5028 }
5029 }
1b1626e4 5030 case 0:
830247a4 5031 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5032 vFAIL("Sequence (? incomplete");
1b1626e4 5033 break;
a0d0e21e 5034 default:
830247a4 5035 --RExC_parse;
fac92740 5036 parse_flags: /* (?i) */
830247a4 5037 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5038 /* (?g), (?gc) and (?o) are useless here
5039 and must be globally applied -- japhy */
5040
5041 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5042 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5043 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5044 if (! (wastedflags & wflagbit) ) {
5045 wastedflags |= wflagbit;
5046 vWARN5(
5047 RExC_parse + 1,
5048 "Useless (%s%c) - %suse /%c modifier",
5049 flagsp == &negflags ? "?-" : "?",
5050 *RExC_parse,
5051 flagsp == &negflags ? "don't " : "",
5052 *RExC_parse
5053 );
5054 }
5055 }
5056 }
5057 else if (*RExC_parse == 'c') {
5058 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5059 if (! (wastedflags & WASTED_C) ) {
5060 wastedflags |= WASTED_GC;
9d1d55b5
JP
5061 vWARN3(
5062 RExC_parse + 1,
5063 "Useless (%sc) - %suse /gc modifier",
5064 flagsp == &negflags ? "?-" : "?",
5065 flagsp == &negflags ? "don't " : ""
5066 );
5067 }
5068 }
5069 }
5070 else { pmflag(flagsp, *RExC_parse); }
5071
830247a4 5072 ++RExC_parse;
ca9dfc88 5073 }
830247a4 5074 if (*RExC_parse == '-') {
ca9dfc88 5075 flagsp = &negflags;
9d1d55b5 5076 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5077 ++RExC_parse;
ca9dfc88 5078 goto parse_flags;
48c036b1 5079 }
e2509266
JH
5080 RExC_flags |= posflags;
5081 RExC_flags &= ~negflags;
830247a4
IZ
5082 if (*RExC_parse == ':') {
5083 RExC_parse++;
ca9dfc88
IZ
5084 paren = ':';
5085 break;
5086 }
c277df42 5087 unknown:
830247a4
IZ
5088 if (*RExC_parse != ')') {
5089 RExC_parse++;
5090 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5091 }
830247a4 5092 nextchar(pRExC_state);
a0d0e21e
LW
5093 *flagp = TRYAGAIN;
5094 return NULL;
5095 }
5096 }
fac92740 5097 else { /* (...) */
81714fb9 5098 capturing_parens:
830247a4
IZ
5099 parno = RExC_npar;
5100 RExC_npar++;
5101 ret = reganode(pRExC_state, OPEN, parno);
6bda09f9 5102 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
226de585 5103 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5104 "Setting open paren #%"IVdf" to %d\n",
5105 (IV)parno, REG_NODE_NUM(ret)));
5106 RExC_open_parens[parno-1]= ret;
6bda09f9 5107 }
fac92740
MJD
5108 Set_Node_Length(ret, 1); /* MJD */
5109 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5110 is_open = 1;
a0d0e21e 5111 }
a0ed51b3 5112 }
fac92740 5113 else /* ! paren */
a0d0e21e
LW
5114 ret = NULL;
5115
5116 /* Pick up the branches, linking them together. */
fac92740 5117 parse_start = RExC_parse; /* MJD */
3dab1dad 5118 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5119 /* branch_len = (paren != 0); */
2af232bd 5120
a0d0e21e
LW
5121 if (br == NULL)
5122 return(NULL);
830247a4
IZ
5123 if (*RExC_parse == '|') {
5124 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5125 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5126 }
fac92740 5127 else { /* MJD */
6bda09f9 5128 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5129 Set_Node_Length(br, paren != 0);
5130 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5131 }
c277df42
IZ
5132 have_branch = 1;
5133 if (SIZE_ONLY)
830247a4 5134 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5135 }
5136 else if (paren == ':') {
c277df42
IZ
5137 *flagp |= flags&SIMPLE;
5138 }
6136c704 5139 if (is_open) { /* Starts with OPEN. */
3dab1dad 5140 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5141 }
5142 else if (paren != '?') /* Not Conditional */
a0d0e21e 5143 ret = br;
32a0ca98 5144 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5145 lastbr = br;
830247a4
IZ
5146 while (*RExC_parse == '|') {
5147 if (!SIZE_ONLY && RExC_extralen) {
5148 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5149 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5150 }
5151 if (SIZE_ONLY)
830247a4
IZ
5152 RExC_extralen += 2; /* Account for LONGJMP. */
5153 nextchar(pRExC_state);
3dab1dad 5154 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5155
a687059c 5156 if (br == NULL)
a0d0e21e 5157 return(NULL);
3dab1dad 5158 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5159 lastbr = br;
821b33a5
IZ
5160 if (flags&HASWIDTH)
5161 *flagp |= HASWIDTH;
a687059c 5162 *flagp |= flags&SPSTART;
a0d0e21e
LW
5163 }
5164
c277df42
IZ
5165 if (have_branch || paren != ':') {
5166 /* Make a closing node, and hook it on the end. */
5167 switch (paren) {
5168 case ':':
830247a4 5169 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5170 break;
5171 case 1:
830247a4 5172 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5173 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5174 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5175 "Setting close paren #%"IVdf" to %d\n",
5176 (IV)parno, REG_NODE_NUM(ender)));
5177 RExC_close_parens[parno-1]= ender;
5178 }
fac92740
MJD
5179 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5180 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5181 break;
5182 case '<':
c277df42
IZ
5183 case ',':
5184 case '=':
5185 case '!':
c277df42 5186 *flagp &= ~HASWIDTH;
821b33a5
IZ
5187 /* FALL THROUGH */
5188 case '>':
830247a4 5189 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5190 break;
5191 case 0:
830247a4 5192 ender = reg_node(pRExC_state, END);
40d049e4
YO
5193 if (!SIZE_ONLY) {
5194 assert(!RExC_opend); /* there can only be one! */
5195 RExC_opend = ender;
5196 }
c277df42
IZ
5197 break;
5198 }
eaf3ca90 5199 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5200
9674d46a 5201 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5202 if (depth==1)
5203 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5204
c277df42 5205 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5206 for (br = ret; br; br = regnext(br)) {
5207 const U8 op = PL_regkind[OP(br)];
5208 if (op == BRANCH) {
07be1b83 5209 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5210 }
5211 else if (op == BRANCHJ) {
07be1b83 5212 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5213 }
c277df42
IZ
5214 }
5215 }
a0d0e21e 5216 }
c277df42
IZ
5217
5218 {
e1ec3a88
AL
5219 const char *p;
5220 static const char parens[] = "=!<,>";
c277df42
IZ
5221
5222 if (paren && (p = strchr(parens, paren))) {
eb160463 5223 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5224 int flag = (p - parens) > 1;
5225
5226 if (paren == '>')
5227 node = SUSPEND, flag = 0;
6bda09f9 5228 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5229 Set_Node_Cur_Length(ret);
5230 Set_Node_Offset(ret, parse_start + 1);
c277df42 5231 ret->flags = flag;
07be1b83 5232 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5233 }
a0d0e21e
LW
5234 }
5235
5236 /* Check for proper termination. */
ce3e6498 5237 if (paren) {
e2509266 5238 RExC_flags = oregflags;
830247a4
IZ
5239 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5240 RExC_parse = oregcomp_parse;
380a0633 5241 vFAIL("Unmatched (");
ce3e6498 5242 }
a0ed51b3 5243 }
830247a4
IZ
5244 else if (!paren && RExC_parse < RExC_end) {
5245 if (*RExC_parse == ')') {
5246 RExC_parse++;
380a0633 5247 vFAIL("Unmatched )");
a0ed51b3
LW
5248 }
5249 else
b45f050a 5250 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5251 /* NOTREACHED */
5252 }
a687059c 5253
a0d0e21e 5254 return(ret);
a687059c
LW
5255}
5256
5257/*
5258 - regbranch - one alternative of an | operator
5259 *
5260 * Implements the concatenation operator.
5261 */
76e3520e 5262STATIC regnode *
3dab1dad 5263S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5264{
97aff369 5265 dVAR;
c277df42
IZ
5266 register regnode *ret;
5267 register regnode *chain = NULL;
5268 register regnode *latest;
5269 I32 flags = 0, c = 0;
3dab1dad
YO
5270 GET_RE_DEBUG_FLAGS_DECL;
5271 DEBUG_PARSE("brnc");
b81d288d 5272 if (first)
c277df42
IZ
5273 ret = NULL;
5274 else {
b81d288d 5275 if (!SIZE_ONLY && RExC_extralen)
830247a4 5276 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5277 else {
830247a4 5278 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5279 Set_Node_Length(ret, 1);
5280 }
c277df42
IZ
5281 }
5282
b81d288d 5283 if (!first && SIZE_ONLY)
830247a4 5284 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5285
c277df42 5286 *flagp = WORST; /* Tentatively. */
a0d0e21e 5287
830247a4
IZ
5288 RExC_parse--;
5289 nextchar(pRExC_state);
5290 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5291 flags &= ~TRYAGAIN;
3dab1dad 5292 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5293 if (latest == NULL) {
5294 if (flags & TRYAGAIN)
5295 continue;
5296 return(NULL);
a0ed51b3
LW
5297 }
5298 else if (ret == NULL)
c277df42 5299 ret = latest;
a0d0e21e 5300 *flagp |= flags&HASWIDTH;
c277df42 5301 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5302 *flagp |= flags&SPSTART;
5303 else {
830247a4 5304 RExC_naughty++;
3dab1dad 5305 REGTAIL(pRExC_state, chain, latest);
a687059c 5306 }
a0d0e21e 5307 chain = latest;
c277df42
IZ
5308 c++;
5309 }
5310 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5311 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5312 if (ret == NULL)
5313 ret = chain;
5314 }
5315 if (c == 1) {
5316 *flagp |= flags&SIMPLE;
a0d0e21e 5317 }
a687059c 5318
d4c19fe8 5319 return ret;
a687059c
LW
5320}
5321
5322/*
5323 - regpiece - something followed by possible [*+?]
5324 *
5325 * Note that the branching code sequences used for ? and the general cases
5326 * of * and + are somewhat optimized: they use the same NOTHING node as
5327 * both the endmarker for their branch list and the body of the last branch.
5328 * It might seem that this node could be dispensed with entirely, but the
5329 * endmarker role is not redundant.
5330 */
76e3520e 5331STATIC regnode *
3dab1dad 5332S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5333{
97aff369 5334 dVAR;
c277df42 5335 register regnode *ret;
a0d0e21e
LW
5336 register char op;
5337 register char *next;
5338 I32 flags;
1df70142 5339 const char * const origparse = RExC_parse;
a0d0e21e 5340 I32 min;
c277df42 5341 I32 max = REG_INFTY;
fac92740 5342 char *parse_start;
10edeb5d 5343 const char *maxpos = NULL;
3dab1dad
YO
5344 GET_RE_DEBUG_FLAGS_DECL;
5345 DEBUG_PARSE("piec");
a0d0e21e 5346
3dab1dad 5347 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5348 if (ret == NULL) {
5349 if (flags & TRYAGAIN)
5350 *flagp |= TRYAGAIN;
5351 return(NULL);
5352 }
5353
830247a4 5354 op = *RExC_parse;
a0d0e21e 5355
830247a4 5356 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5357 maxpos = NULL;
fac92740 5358 parse_start = RExC_parse; /* MJD */
830247a4 5359 next = RExC_parse + 1;
a0d0e21e
LW
5360 while (isDIGIT(*next) || *next == ',') {
5361 if (*next == ',') {
5362 if (maxpos)
5363 break;
5364 else
5365 maxpos = next;
a687059c 5366 }
a0d0e21e
LW
5367 next++;
5368 }
5369 if (*next == '}') { /* got one */
5370 if (!maxpos)
5371 maxpos = next;
830247a4
IZ
5372 RExC_parse++;
5373 min = atoi(RExC_parse);
a0d0e21e
LW
5374 if (*maxpos == ',')
5375 maxpos++;
5376 else
830247a4 5377 maxpos = RExC_parse;
a0d0e21e
LW
5378 max = atoi(maxpos);
5379 if (!max && *maxpos != '0')
c277df42
IZ
5380 max = REG_INFTY; /* meaning "infinity" */
5381 else if (max >= REG_INFTY)
8615cb43 5382 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5383 RExC_parse = next;
5384 nextchar(pRExC_state);
a0d0e21e
LW
5385
5386 do_curly:
5387 if ((flags&SIMPLE)) {
830247a4 5388 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5389 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5390 Set_Node_Offset(ret, parse_start+1); /* MJD */
5391 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5392 }
5393 else {
3dab1dad 5394 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5395
5396 w->flags = 0;
3dab1dad 5397 REGTAIL(pRExC_state, ret, w);
830247a4 5398 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5399 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5400 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5401 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5402 }
6bda09f9 5403 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5404 /* MJD hk */
5405 Set_Node_Offset(ret, parse_start+1);
2af232bd 5406 Set_Node_Length(ret,
fac92740 5407 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5408
830247a4 5409 if (!SIZE_ONLY && RExC_extralen)
c277df42 5410 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5411 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5412 if (SIZE_ONLY)
830247a4
IZ
5413 RExC_whilem_seen++, RExC_extralen += 3;
5414 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5415 }
c277df42 5416 ret->flags = 0;
a0d0e21e
LW
5417
5418 if (min > 0)
821b33a5
IZ
5419 *flagp = WORST;
5420 if (max > 0)
5421 *flagp |= HASWIDTH;
a0d0e21e 5422 if (max && max < min)
8615cb43 5423 vFAIL("Can't do {n,m} with n > m");
c277df42 5424 if (!SIZE_ONLY) {
eb160463
GS
5425 ARG1_SET(ret, (U16)min);
5426 ARG2_SET(ret, (U16)max);
a687059c 5427 }
a687059c 5428
a0d0e21e 5429 goto nest_check;
a687059c 5430 }
a0d0e21e 5431 }
a687059c 5432
a0d0e21e
LW
5433 if (!ISMULT1(op)) {
5434 *flagp = flags;
a687059c 5435 return(ret);
a0d0e21e 5436 }
bb20fd44 5437
c277df42 5438#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5439
5440 /* if this is reinstated, don't forget to put this back into perldiag:
5441
5442 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5443
5444 (F) The part of the regexp subject to either the * or + quantifier
5445 could match an empty string. The {#} shows in the regular
5446 expression about where the problem was discovered.
5447
5448 */
5449
bb20fd44 5450 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5451 vFAIL("Regexp *+ operand could be empty");
b81d288d 5452#endif
bb20fd44 5453
fac92740 5454 parse_start = RExC_parse;
830247a4 5455 nextchar(pRExC_state);
a0d0e21e 5456
821b33a5 5457 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5458
5459 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5460 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5461 ret->flags = 0;
830247a4 5462 RExC_naughty += 4;
a0d0e21e
LW
5463 }
5464 else if (op == '*') {
5465 min = 0;
5466 goto do_curly;
a0ed51b3
LW
5467 }
5468 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5469 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5470 ret->flags = 0;
830247a4 5471 RExC_naughty += 3;
a0d0e21e
LW
5472 }
5473 else if (op == '+') {
5474 min = 1;
5475 goto do_curly;
a0ed51b3
LW
5476 }
5477 else if (op == '?') {
a0d0e21e
LW
5478 min = 0; max = 1;
5479 goto do_curly;
5480 }
5481 nest_check:
041457d9 5482 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5483 vWARN3(RExC_parse,
b45f050a 5484 "%.*s matches null string many times",
afd78fd5 5485 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5486 origparse);
a0d0e21e
LW
5487 }
5488
b9b4dddf 5489 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5490 nextchar(pRExC_state);
6bda09f9 5491 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5492 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5493 }
b9b4dddf
YO
5494#ifndef REG_ALLOW_MINMOD_SUSPEND
5495 else
5496#endif
5497 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5498 regnode *ender;
5499 nextchar(pRExC_state);
5500 ender = reg_node(pRExC_state, SUCCEED);
5501 REGTAIL(pRExC_state, ret, ender);
5502 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5503 ret->flags = 0;
5504 ender = reg_node(pRExC_state, TAIL);
5505 REGTAIL(pRExC_state, ret, ender);
5506 /*ret= ender;*/
5507 }
5508
5509 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5510 RExC_parse++;
b45f050a
JF
5511 vFAIL("Nested quantifiers");
5512 }
a0d0e21e
LW
5513
5514 return(ret);
a687059c
LW
5515}
5516
fc8cd66c
YO
5517
5518/* reg_namedseq(pRExC_state,UVp)
5519
5520 This is expected to be called by a parser routine that has
5521 recognized'\N' and needs to handle the rest. RExC_parse is
5522 expected to point at the first char following the N at the time
5523 of the call.
5524
5525 If valuep is non-null then it is assumed that we are parsing inside
5526 of a charclass definition and the first codepoint in the resolved
5527 string is returned via *valuep and the routine will return NULL.
5528 In this mode if a multichar string is returned from the charnames
5529 handler a warning will be issued, and only the first char in the
5530 sequence will be examined. If the string returned is zero length
5531 then the value of *valuep is undefined and NON-NULL will
5532 be returned to indicate failure. (This will NOT be a valid pointer
5533 to a regnode.)
5534
5535 If value is null then it is assumed that we are parsing normal text
5536 and inserts a new EXACT node into the program containing the resolved
5537 string and returns a pointer to the new node. If the string is
5538 zerolength a NOTHING node is emitted.
5539
5540 On success RExC_parse is set to the char following the endbrace.
5541 Parsing failures will generate a fatal errorvia vFAIL(...)
5542
5543 NOTE: We cache all results from the charnames handler locally in
5544 the RExC_charnames hash (created on first use) to prevent a charnames
5545 handler from playing silly-buggers and returning a short string and
5546 then a long string for a given pattern. Since the regexp program
5547 size is calculated during an initial parse this would result
5548 in a buffer overrun so we cache to prevent the charname result from
5549 changing during the course of the parse.
5550
5551 */
5552STATIC regnode *
5553S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5554{
5555 char * name; /* start of the content of the name */
5556 char * endbrace; /* endbrace following the name */
5557 SV *sv_str = NULL;
5558 SV *sv_name = NULL;
5559 STRLEN len; /* this has various purposes throughout the code */
5560 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5561 regnode *ret = NULL;
5562
5563 if (*RExC_parse != '{') {
5564 vFAIL("Missing braces on \\N{}");
5565 }
5566 name = RExC_parse+1;
5567 endbrace = strchr(RExC_parse, '}');
5568 if ( ! endbrace ) {
5569 RExC_parse++;
5570 vFAIL("Missing right brace on \\N{}");
5571 }
5572 RExC_parse = endbrace + 1;
5573
5574
5575 /* RExC_parse points at the beginning brace,
5576 endbrace points at the last */
5577 if ( name[0]=='U' && name[1]=='+' ) {
5578 /* its a "unicode hex" notation {U+89AB} */
5579 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5580 | PERL_SCAN_DISALLOW_PREFIX
5581 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5582 UV cp;
196f1508 5583 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5584 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5585 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5586 cp = 0xFFFD;
5587 }
5588 if (cp > 0xff)
5589 RExC_utf8 = 1;
5590 if ( valuep ) {
5591 *valuep = cp;
5592 return NULL;
5593 }
5594 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5595 } else {
5596 /* fetch the charnames handler for this scope */
5597 HV * const table = GvHV(PL_hintgv);
5598 SV **cvp= table ?
5599 hv_fetchs(table, "charnames", FALSE) :
5600 NULL;
5601 SV *cv= cvp ? *cvp : NULL;
5602 HE *he_str;
5603 int count;
5604 /* create an SV with the name as argument */
5605 sv_name = newSVpvn(name, endbrace - name);
5606
5607 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5608 vFAIL2("Constant(\\N{%s}) unknown: "
5609 "(possibly a missing \"use charnames ...\")",
5610 SvPVX(sv_name));
5611 }
5612 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5613 vFAIL2("Constant(\\N{%s}): "
5614 "$^H{charnames} is not defined",SvPVX(sv_name));
5615 }
5616
5617
5618
5619 if (!RExC_charnames) {
5620 /* make sure our cache is allocated */
5621 RExC_charnames = newHV();
6bda09f9 5622 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5623 }
5624 /* see if we have looked this one up before */
5625 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5626 if ( he_str ) {
5627 sv_str = HeVAL(he_str);
5628 cached = 1;
5629 } else {
5630 dSP ;
5631
5632 ENTER ;
5633 SAVETMPS ;
5634 PUSHMARK(SP) ;
5635
5636 XPUSHs(sv_name);
5637
5638 PUTBACK ;
5639
5640 count= call_sv(cv, G_SCALAR);
5641
5642 if (count == 1) { /* XXXX is this right? dmq */
5643 sv_str = POPs;
5644 SvREFCNT_inc_simple_void(sv_str);
5645 }
5646
5647 SPAGAIN ;
5648 PUTBACK ;
5649 FREETMPS ;
5650 LEAVE ;
5651
5652 if ( !sv_str || !SvOK(sv_str) ) {
5653 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5654 "did not return a defined value",SvPVX(sv_name));
5655 }
5656 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5657 cached = 1;
5658 }
5659 }
5660 if (valuep) {
5661 char *p = SvPV(sv_str, len);
5662 if (len) {
5663 STRLEN numlen = 1;
5664 if ( SvUTF8(sv_str) ) {
196f1508 5665 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5666 if (*valuep > 0x7F)
5667 RExC_utf8 = 1;
5668 /* XXXX
5669 We have to turn on utf8 for high bit chars otherwise
5670 we get failures with
5671
5672 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5673 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5674
5675 This is different from what \x{} would do with the same
5676 codepoint, where the condition is > 0xFF.
5677 - dmq
5678 */
5679
5680
5681 } else {
5682 *valuep = (UV)*p;
5683 /* warn if we havent used the whole string? */
5684 }
5685 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5686 vWARN2(RExC_parse,
5687 "Ignoring excess chars from \\N{%s} in character class",
5688 SvPVX(sv_name)
5689 );
5690 }
5691 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5692 vWARN2(RExC_parse,
5693 "Ignoring zero length \\N{%s} in character class",
5694 SvPVX(sv_name)
5695 );
5696 }
5697 if (sv_name)
5698 SvREFCNT_dec(sv_name);
5699 if (!cached)
5700 SvREFCNT_dec(sv_str);
5701 return len ? NULL : (regnode *)&len;
5702 } else if(SvCUR(sv_str)) {
5703
5704 char *s;
5705 char *p, *pend;
5706 STRLEN charlen = 1;
5707 char * parse_start = name-3; /* needed for the offsets */
5708 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5709
5710 ret = reg_node(pRExC_state,
5711 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5712 s= STRING(ret);
5713
5714 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5715 sv_utf8_upgrade(sv_str);
5716 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5717 RExC_utf8= 1;
5718 }
5719
5720 p = SvPV(sv_str, len);
5721 pend = p + len;
5722 /* len is the length written, charlen is the size the char read */
5723 for ( len = 0; p < pend; p += charlen ) {
5724 if (UTF) {
196f1508 5725 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5726 if (FOLD) {
5727 STRLEN foldlen,numlen;
5728 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5729 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5730 /* Emit all the Unicode characters. */
5731
5732 for (foldbuf = tmpbuf;
5733 foldlen;
5734 foldlen -= numlen)
5735 {
5736 uvc = utf8_to_uvchr(foldbuf, &numlen);
5737 if (numlen > 0) {
5738 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5739 s += unilen;
5740 len += unilen;
5741 /* In EBCDIC the numlen
5742 * and unilen can differ. */
5743 foldbuf += numlen;
5744 if (numlen >= foldlen)
5745 break;
5746 }
5747 else
5748 break; /* "Can't happen." */
5749 }
5750 } else {
5751 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5752 if (unilen > 0) {
5753 s += unilen;
5754 len += unilen;
5755 }
5756 }
5757 } else {
5758 len++;
5759 REGC(*p, s++);
5760 }
5761 }
5762 if (SIZE_ONLY) {
5763 RExC_size += STR_SZ(len);
5764 } else {
5765 STR_LEN(ret) = len;
5766 RExC_emit += STR_SZ(len);
5767 }
5768 Set_Node_Cur_Length(ret); /* MJD */
5769 RExC_parse--;
5770 nextchar(pRExC_state);
5771 } else {
5772 ret = reg_node(pRExC_state,NOTHING);
5773 }
5774 if (!cached) {
5775 SvREFCNT_dec(sv_str);
5776 }
5777 if (sv_name) {
5778 SvREFCNT_dec(sv_name);
5779 }
5780 return ret;
5781
5782}
5783
5784
9e08bc66
TS
5785/*
5786 * reg_recode
5787 *
5788 * It returns the code point in utf8 for the value in *encp.
5789 * value: a code value in the source encoding
5790 * encp: a pointer to an Encode object
5791 *
5792 * If the result from Encode is not a single character,
5793 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
5794 */
5795STATIC UV
5796S_reg_recode(pTHX_ const char value, SV **encp)
5797{
5798 STRLEN numlen = 1;
5799 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
5800 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
5801 : SvPVX(sv);
5802 const STRLEN newlen = SvCUR(sv);
5803 UV uv = UNICODE_REPLACEMENT;
5804
5805 if (newlen)
5806 uv = SvUTF8(sv)
5807 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
5808 : *(U8*)s;
5809
5810 if (!newlen || numlen != newlen) {
5811 uv = UNICODE_REPLACEMENT;
5812 if (encp)
5813 *encp = NULL;
5814 }
5815 return uv;
5816}
5817
fc8cd66c 5818
a687059c
LW
5819/*
5820 - regatom - the lowest level
5821 *
5822 * Optimization: gobbles an entire sequence of ordinary characters so that
5823 * it can turn them into a single node, which is smaller to store and
5824 * faster to run. Backslashed characters are exceptions, each becoming a
5825 * separate node; the code is simpler that way and it's not worth fixing.
5826 *
7f6f358c
YO
5827 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5828 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5829 */
76e3520e 5830STATIC regnode *
3dab1dad 5831S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5832{
97aff369 5833 dVAR;
cbbf8932 5834 register regnode *ret = NULL;
a0d0e21e 5835 I32 flags;
45948336 5836 char *parse_start = RExC_parse;
3dab1dad
YO
5837 GET_RE_DEBUG_FLAGS_DECL;
5838 DEBUG_PARSE("atom");
a0d0e21e
LW
5839 *flagp = WORST; /* Tentatively. */
5840
5841tryagain:
830247a4 5842 switch (*RExC_parse) {
a0d0e21e 5843 case '^':
830247a4
IZ
5844 RExC_seen_zerolen++;
5845 nextchar(pRExC_state);
e2509266 5846 if (RExC_flags & PMf_MULTILINE)
830247a4 5847 ret = reg_node(pRExC_state, MBOL);
e2509266 5848 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5849 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5850 else
830247a4 5851 ret = reg_node(pRExC_state, BOL);
fac92740 5852 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5853 break;
5854 case '$':
830247a4 5855 nextchar(pRExC_state);
b81d288d 5856 if (*RExC_parse)
830247a4 5857 RExC_seen_zerolen++;
e2509266 5858 if (RExC_flags & PMf_MULTILINE)
830247a4 5859 ret = reg_node(pRExC_state, MEOL);
e2509266 5860 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5861 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5862 else
830247a4 5863 ret = reg_node(pRExC_state, EOL);
fac92740 5864 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5865 break;
5866 case '.':
830247a4 5867 nextchar(pRExC_state);
e2509266 5868 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5869 ret = reg_node(pRExC_state, SANY);
5870 else
5871 ret = reg_node(pRExC_state, REG_ANY);
5872 *flagp |= HASWIDTH|SIMPLE;
830247a4 5873 RExC_naughty++;
fac92740 5874 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5875 break;
5876 case '[':
b45f050a 5877 {
3dab1dad
YO
5878 char * const oregcomp_parse = ++RExC_parse;
5879 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5880 if (*RExC_parse != ']') {
5881 RExC_parse = oregcomp_parse;
b45f050a
JF
5882 vFAIL("Unmatched [");
5883 }
830247a4 5884 nextchar(pRExC_state);
a0d0e21e 5885 *flagp |= HASWIDTH|SIMPLE;
fac92740 5886 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5887 break;
b45f050a 5888 }
a0d0e21e 5889 case '(':
830247a4 5890 nextchar(pRExC_state);
3dab1dad 5891 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5892 if (ret == NULL) {
bf93d4cc 5893 if (flags & TRYAGAIN) {
830247a4 5894 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5895 /* Make parent create an empty node if needed. */
5896 *flagp |= TRYAGAIN;
5897 return(NULL);
5898 }
a0d0e21e 5899 goto tryagain;
bf93d4cc 5900 }
a0d0e21e
LW
5901 return(NULL);
5902 }
c277df42 5903 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5904 break;
5905 case '|':
5906 case ')':
5907 if (flags & TRYAGAIN) {
5908 *flagp |= TRYAGAIN;
5909 return NULL;
5910 }
b45f050a 5911 vFAIL("Internal urp");
a0d0e21e
LW
5912 /* Supposed to be caught earlier. */
5913 break;
85afd4ae 5914 case '{':
830247a4
IZ
5915 if (!regcurly(RExC_parse)) {
5916 RExC_parse++;
85afd4ae
CS
5917 goto defchar;
5918 }
5919 /* FALL THROUGH */
a0d0e21e
LW
5920 case '?':
5921 case '+':
5922 case '*':
830247a4 5923 RExC_parse++;
b45f050a 5924 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5925 break;
5926 case '\\':
830247a4 5927 switch (*++RExC_parse) {
a0d0e21e 5928 case 'A':
830247a4
IZ
5929 RExC_seen_zerolen++;
5930 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5931 *flagp |= SIMPLE;
830247a4 5932 nextchar(pRExC_state);
fac92740 5933 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5934 break;
5935 case 'G':
830247a4
IZ
5936 ret = reg_node(pRExC_state, GPOS);
5937 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5938 *flagp |= SIMPLE;
830247a4 5939 nextchar(pRExC_state);
fac92740 5940 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5941 break;
5942 case 'Z':
830247a4 5943 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5944 *flagp |= SIMPLE;
a1917ab9 5945 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5946 nextchar(pRExC_state);
a0d0e21e 5947 break;
b85d18e9 5948 case 'z':
830247a4 5949 ret = reg_node(pRExC_state, EOS);
b85d18e9 5950 *flagp |= SIMPLE;
830247a4
IZ
5951 RExC_seen_zerolen++; /* Do not optimize RE away */
5952 nextchar(pRExC_state);
fac92740 5953 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5954 break;
4a2d328f 5955 case 'C':
f33976b4
DB
5956 ret = reg_node(pRExC_state, CANY);
5957 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5958 *flagp |= HASWIDTH|SIMPLE;
830247a4 5959 nextchar(pRExC_state);
fac92740 5960 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5961 break;
5962 case 'X':
830247a4 5963 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5964 *flagp |= HASWIDTH;
830247a4 5965 nextchar(pRExC_state);
fac92740 5966 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5967 break;
a0d0e21e 5968 case 'w':
eb160463 5969 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5970 *flagp |= HASWIDTH|SIMPLE;
830247a4 5971 nextchar(pRExC_state);
fac92740 5972 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5973 break;
5974 case 'W':
eb160463 5975 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5976 *flagp |= HASWIDTH|SIMPLE;
830247a4 5977 nextchar(pRExC_state);
fac92740 5978 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5979 break;
5980 case 'b':
830247a4
IZ
5981 RExC_seen_zerolen++;
5982 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5983 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5984 *flagp |= SIMPLE;
830247a4 5985 nextchar(pRExC_state);
fac92740 5986 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5987 break;
5988 case 'B':
830247a4
IZ
5989 RExC_seen_zerolen++;
5990 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5991 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5992 *flagp |= SIMPLE;
830247a4 5993 nextchar(pRExC_state);
fac92740 5994 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5995 break;
5996 case 's':
eb160463 5997 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5998 *flagp |= HASWIDTH|SIMPLE;
830247a4 5999 nextchar(pRExC_state);
fac92740 6000 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6001 break;
6002 case 'S':
eb160463 6003 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6004 *flagp |= HASWIDTH|SIMPLE;
830247a4 6005 nextchar(pRExC_state);
fac92740 6006 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6007 break;
6008 case 'd':
ffc61ed2 6009 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6010 *flagp |= HASWIDTH|SIMPLE;
830247a4 6011 nextchar(pRExC_state);
fac92740 6012 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
6013 break;
6014 case 'D':
ffc61ed2 6015 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6016 *flagp |= HASWIDTH|SIMPLE;
830247a4 6017 nextchar(pRExC_state);
fac92740 6018 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 6019 break;
a14b48bc
LW
6020 case 'p':
6021 case 'P':
3568d838 6022 {
3dab1dad 6023 char* const oldregxend = RExC_end;
ccb2c380 6024 char* parse_start = RExC_parse - 2;
a14b48bc 6025
830247a4 6026 if (RExC_parse[1] == '{') {
3568d838 6027 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6028 RExC_end = strchr(RExC_parse, '}');
6029 if (!RExC_end) {
3dab1dad 6030 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6031 RExC_parse += 2;
6032 RExC_end = oldregxend;
0da60cf5 6033 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6034 }
830247a4 6035 RExC_end++;
a14b48bc 6036 }
af6f566e 6037 else {
830247a4 6038 RExC_end = RExC_parse + 2;
af6f566e
HS
6039 if (RExC_end > oldregxend)
6040 RExC_end = oldregxend;
6041 }
830247a4 6042 RExC_parse--;
a14b48bc 6043
3dab1dad 6044 ret = regclass(pRExC_state,depth+1);
a14b48bc 6045
830247a4
IZ
6046 RExC_end = oldregxend;
6047 RExC_parse--;
ccb2c380
MP
6048
6049 Set_Node_Offset(ret, parse_start + 2);
6050 Set_Node_Cur_Length(ret);
830247a4 6051 nextchar(pRExC_state);
a14b48bc
LW
6052 *flagp |= HASWIDTH|SIMPLE;
6053 }
6054 break;
fc8cd66c
YO
6055 case 'N':
6056 /* Handle \N{NAME} here and not below because it can be
6057 multicharacter. join_exact() will join them up later on.
6058 Also this makes sure that things like /\N{BLAH}+/ and
6059 \N{BLAH} being multi char Just Happen. dmq*/
6060 ++RExC_parse;
6061 ret= reg_namedseq(pRExC_state, NULL);
6062 break;
0a4db386 6063 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6064 {
6065 char ch= RExC_parse[1];
6066 if (ch != '<' && ch != '\'') {
6067 if (SIZE_ONLY)
6068 vWARN( RExC_parse + 1,
6069 "Possible broken named back reference treated as literal k");
6070 parse_start--;
6071 goto defchar;
6072 } else {
6073 char* name_start = (RExC_parse += 2);
6074 I32 num = 0;
0a4db386
YO
6075 SV *sv_dat = reg_scan_name(pRExC_state,
6076 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6077 ch= (ch == '<') ? '>' : '\'';
6078
81714fb9
YO
6079 if (RExC_parse == name_start || *RExC_parse != ch)
6080 vFAIL2("Sequence \\k%c... not terminated",
6081 (ch == '>' ? '<' : ch));
6082
6083 RExC_sawback = 1;
6084 ret = reganode(pRExC_state,
6085 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6086 num);
6087 *flagp |= HASWIDTH;
6088
6089
6090 if (!SIZE_ONLY) {
81714fb9
YO
6091 num = add_data( pRExC_state, 1, "S" );
6092 ARG_SET(ret,num);
6093 RExC_rx->data->data[num]=(void*)sv_dat;
6094 SvREFCNT_inc(sv_dat);
6095 }
6096 /* override incorrect value set in reganode MJD */
6097 Set_Node_Offset(ret, parse_start+1);
6098 Set_Node_Cur_Length(ret); /* MJD */
6099 nextchar(pRExC_state);
6100
6101 }
6102 break;
6103 }
a0d0e21e
LW
6104 case 'n':
6105 case 'r':
6106 case 't':
6107 case 'f':
6108 case 'e':
6109 case 'a':
6110 case 'x':
6111 case 'c':
6112 case '0':
6113 goto defchar;
6114 case '1': case '2': case '3': case '4':
6115 case '5': case '6': case '7': case '8': case '9':
6116 {
1df70142 6117 const I32 num = atoi(RExC_parse);
a0d0e21e 6118
830247a4 6119 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6120 goto defchar;
6121 else {
3dab1dad 6122 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6123 while (isDIGIT(*RExC_parse))
6124 RExC_parse++;
b45f050a 6125
eb160463 6126 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 6127 vFAIL("Reference to nonexistent group");
830247a4 6128 RExC_sawback = 1;
eb160463
GS
6129 ret = reganode(pRExC_state,
6130 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6131 num);
a0d0e21e 6132 *flagp |= HASWIDTH;
2af232bd 6133
fac92740 6134 /* override incorrect value set in reganode MJD */
2af232bd 6135 Set_Node_Offset(ret, parse_start+1);
fac92740 6136 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6137 RExC_parse--;
6138 nextchar(pRExC_state);
a0d0e21e
LW
6139 }
6140 }
6141 break;
6142 case '\0':
830247a4 6143 if (RExC_parse >= RExC_end)
b45f050a 6144 FAIL("Trailing \\");
a0d0e21e
LW
6145 /* FALL THROUGH */
6146 default:
a0288114 6147 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6148 back into the quick-grab loop below */
45948336 6149 parse_start--;
a0d0e21e
LW
6150 goto defchar;
6151 }
6152 break;
4633a7c4
LW
6153
6154 case '#':
e2509266 6155 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6156 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6157 RExC_parse++;
830247a4 6158 if (RExC_parse < RExC_end)
4633a7c4
LW
6159 goto tryagain;
6160 }
6161 /* FALL THROUGH */
6162
a0d0e21e 6163 default: {
ba210ebe 6164 register STRLEN len;
58ae7d3f 6165 register UV ender;
a0d0e21e 6166 register char *p;
3dab1dad 6167 char *s;
80aecb99 6168 STRLEN foldlen;
89ebb4a3 6169 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6170
6171 parse_start = RExC_parse - 1;
a0d0e21e 6172
830247a4 6173 RExC_parse++;
a0d0e21e
LW
6174
6175 defchar:
58ae7d3f 6176 ender = 0;
eb160463
GS
6177 ret = reg_node(pRExC_state,
6178 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6179 s = STRING(ret);
830247a4
IZ
6180 for (len = 0, p = RExC_parse - 1;
6181 len < 127 && p < RExC_end;
a0d0e21e
LW
6182 len++)
6183 {
3dab1dad 6184 char * const oldp = p;
5b5a24f7 6185
e2509266 6186 if (RExC_flags & PMf_EXTENDED)
830247a4 6187 p = regwhite(p, RExC_end);
a0d0e21e
LW
6188 switch (*p) {
6189 case '^':
6190 case '$':
6191 case '.':
6192 case '[':
6193 case '(':
6194 case ')':
6195 case '|':
6196 goto loopdone;
6197 case '\\':
6198 switch (*++p) {
6199 case 'A':
1ed8eac0
JF
6200 case 'C':
6201 case 'X':
a0d0e21e
LW
6202 case 'G':
6203 case 'Z':
b85d18e9 6204 case 'z':
a0d0e21e
LW
6205 case 'w':
6206 case 'W':
6207 case 'b':
6208 case 'B':
6209 case 's':
6210 case 'S':
6211 case 'd':
6212 case 'D':
a14b48bc
LW
6213 case 'p':
6214 case 'P':
fc8cd66c 6215 case 'N':
a0d0e21e
LW
6216 --p;
6217 goto loopdone;
6218 case 'n':
6219 ender = '\n';
6220 p++;
a687059c 6221 break;
a0d0e21e
LW
6222 case 'r':
6223 ender = '\r';
6224 p++;
a687059c 6225 break;
a0d0e21e
LW
6226 case 't':
6227 ender = '\t';
6228 p++;
a687059c 6229 break;
a0d0e21e
LW
6230 case 'f':
6231 ender = '\f';
6232 p++;
a687059c 6233 break;
a0d0e21e 6234 case 'e':
c7f1f016 6235 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6236 p++;
a687059c 6237 break;
a0d0e21e 6238 case 'a':
c7f1f016 6239 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6240 p++;
a687059c 6241 break;
a0d0e21e 6242 case 'x':
a0ed51b3 6243 if (*++p == '{') {
1df70142 6244 char* const e = strchr(p, '}');
b81d288d 6245
b45f050a 6246 if (!e) {
830247a4 6247 RExC_parse = p + 1;
b45f050a
JF
6248 vFAIL("Missing right brace on \\x{}");
6249 }
de5f0749 6250 else {
a4c04bdc
NC
6251 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6252 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6253 STRLEN numlen = e - p - 1;
53305cf1 6254 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6255 if (ender > 0xff)
6256 RExC_utf8 = 1;
a0ed51b3
LW
6257 p = e + 1;
6258 }
a0ed51b3
LW
6259 }
6260 else {
a4c04bdc 6261 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6262 STRLEN numlen = 2;
53305cf1 6263 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6264 p += numlen;
6265 }
9e08bc66
TS
6266 if (PL_encoding && ender < 0x100)
6267 goto recode_encoding;
a687059c 6268 break;
a0d0e21e
LW
6269 case 'c':
6270 p++;
bbce6d69 6271 ender = UCHARAT(p++);
6272 ender = toCTRL(ender);
a687059c 6273 break;
a0d0e21e
LW
6274 case '0': case '1': case '2': case '3':case '4':
6275 case '5': case '6': case '7': case '8':case '9':
6276 if (*p == '0' ||
830247a4 6277 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6278 I32 flags = 0;
1df70142 6279 STRLEN numlen = 3;
53305cf1 6280 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6281 p += numlen;
6282 }
6283 else {
6284 --p;
6285 goto loopdone;
a687059c 6286 }
9e08bc66
TS
6287 if (PL_encoding && ender < 0x100)
6288 goto recode_encoding;
6289 break;
6290 recode_encoding:
6291 {
6292 SV* enc = PL_encoding;
6293 ender = reg_recode((const char)(U8)ender, &enc);
6294 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6295 vWARN(p, "Invalid escape in the specified encoding");
6296 RExC_utf8 = 1;
6297 }
a687059c 6298 break;
a0d0e21e 6299 case '\0':
830247a4 6300 if (p >= RExC_end)
b45f050a 6301 FAIL("Trailing \\");
a687059c 6302 /* FALL THROUGH */
a0d0e21e 6303 default:
041457d9 6304 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6305 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6306 goto normal_default;
a0d0e21e
LW
6307 }
6308 break;
a687059c 6309 default:
a0ed51b3 6310 normal_default:
fd400ab9 6311 if (UTF8_IS_START(*p) && UTF) {
1df70142 6312 STRLEN numlen;
5e12f4fb 6313 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6314 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6315 p += numlen;
6316 }
6317 else
6318 ender = *p++;
a0d0e21e 6319 break;
a687059c 6320 }
e2509266 6321 if (RExC_flags & PMf_EXTENDED)
830247a4 6322 p = regwhite(p, RExC_end);
60a8b682
JH
6323 if (UTF && FOLD) {
6324 /* Prime the casefolded buffer. */
ac7e0132 6325 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6326 }
a0d0e21e
LW
6327 if (ISMULT2(p)) { /* Back off on ?+*. */
6328 if (len)
6329 p = oldp;
16ea2a2e 6330 else if (UTF) {
80aecb99 6331 if (FOLD) {
60a8b682 6332 /* Emit all the Unicode characters. */
1df70142 6333 STRLEN numlen;
80aecb99
JH
6334 for (foldbuf = tmpbuf;
6335 foldlen;
6336 foldlen -= numlen) {
6337 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6338 if (numlen > 0) {
71207a34 6339 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6340 s += unilen;
6341 len += unilen;
6342 /* In EBCDIC the numlen
6343 * and unilen can differ. */
9dc45d57 6344 foldbuf += numlen;
47654450
JH
6345 if (numlen >= foldlen)
6346 break;
9dc45d57
JH
6347 }
6348 else
6349 break; /* "Can't happen." */
80aecb99
JH
6350 }
6351 }
6352 else {
71207a34 6353 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6354 if (unilen > 0) {
0ebc6274
JH
6355 s += unilen;
6356 len += unilen;
9dc45d57 6357 }
80aecb99 6358 }
a0ed51b3 6359 }
a0d0e21e
LW
6360 else {
6361 len++;
eb160463 6362 REGC((char)ender, s++);
a0d0e21e
LW
6363 }
6364 break;
a687059c 6365 }
16ea2a2e 6366 if (UTF) {
80aecb99 6367 if (FOLD) {
60a8b682 6368 /* Emit all the Unicode characters. */
1df70142 6369 STRLEN numlen;
80aecb99
JH
6370 for (foldbuf = tmpbuf;
6371 foldlen;
6372 foldlen -= numlen) {
6373 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6374 if (numlen > 0) {
71207a34 6375 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6376 len += unilen;
6377 s += unilen;
6378 /* In EBCDIC the numlen
6379 * and unilen can differ. */
9dc45d57 6380 foldbuf += numlen;
47654450
JH
6381 if (numlen >= foldlen)
6382 break;
9dc45d57
JH
6383 }
6384 else
6385 break;
80aecb99
JH
6386 }
6387 }
6388 else {
71207a34 6389 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6390 if (unilen > 0) {
0ebc6274
JH
6391 s += unilen;
6392 len += unilen;
9dc45d57 6393 }
80aecb99
JH
6394 }
6395 len--;
a0ed51b3
LW
6396 }
6397 else
eb160463 6398 REGC((char)ender, s++);
a0d0e21e
LW
6399 }
6400 loopdone:
830247a4 6401 RExC_parse = p - 1;
fac92740 6402 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6403 nextchar(pRExC_state);
793db0cb
JH
6404 {
6405 /* len is STRLEN which is unsigned, need to copy to signed */
6406 IV iv = len;
6407 if (iv < 0)
6408 vFAIL("Internal disaster");
6409 }
a0d0e21e
LW
6410 if (len > 0)
6411 *flagp |= HASWIDTH;
090f7165 6412 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6413 *flagp |= SIMPLE;
3dab1dad 6414
cd439c50 6415 if (SIZE_ONLY)
830247a4 6416 RExC_size += STR_SZ(len);
3dab1dad
YO
6417 else {
6418 STR_LEN(ret) = len;
830247a4 6419 RExC_emit += STR_SZ(len);
07be1b83 6420 }
3dab1dad 6421 }
a0d0e21e
LW
6422 break;
6423 }
a687059c 6424
a0d0e21e 6425 return(ret);
a687059c
LW
6426}
6427
873ef191 6428STATIC char *
5f66b61c 6429S_regwhite(char *p, const char *e)
5b5a24f7
CS
6430{
6431 while (p < e) {
6432 if (isSPACE(*p))
6433 ++p;
6434 else if (*p == '#') {
6435 do {
6436 p++;
6437 } while (p < e && *p != '\n');
6438 }
6439 else
6440 break;
6441 }
6442 return p;
6443}
6444
b8c5462f
JH
6445/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6446 Character classes ([:foo:]) can also be negated ([:^foo:]).
6447 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6448 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6449 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6450
6451#define POSIXCC_DONE(c) ((c) == ':')
6452#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6453#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6454
b8c5462f 6455STATIC I32
830247a4 6456S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6457{
97aff369 6458 dVAR;
936ed897 6459 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6460
830247a4 6461 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6462 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6463 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6464 const char c = UCHARAT(RExC_parse);
097eb12c 6465 char* const s = RExC_parse++;
b81d288d 6466
9a86a77b 6467 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6468 RExC_parse++;
6469 if (RExC_parse == RExC_end)
620e46c5 6470 /* Grandfather lone [:, [=, [. */
830247a4 6471 RExC_parse = s;
620e46c5 6472 else {
3dab1dad 6473 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6474 assert(*t == c);
6475
9a86a77b 6476 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6477 const char *posixcc = s + 1;
830247a4 6478 RExC_parse++; /* skip over the ending ] */
3dab1dad 6479
b8c5462f 6480 if (*s == ':') {
1df70142
AL
6481 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6482 const I32 skip = t - posixcc;
80916619
NC
6483
6484 /* Initially switch on the length of the name. */
6485 switch (skip) {
6486 case 4:
3dab1dad
YO
6487 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6488 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6489 break;
80916619
NC
6490 case 5:
6491 /* Names all of length 5. */
6492 /* alnum alpha ascii blank cntrl digit graph lower
6493 print punct space upper */
6494 /* Offset 4 gives the best switch position. */
6495 switch (posixcc[4]) {
6496 case 'a':
3dab1dad
YO
6497 if (memEQ(posixcc, "alph", 4)) /* alpha */
6498 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6499 break;
6500 case 'e':
3dab1dad
YO
6501 if (memEQ(posixcc, "spac", 4)) /* space */
6502 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6503 break;
6504 case 'h':
3dab1dad
YO
6505 if (memEQ(posixcc, "grap", 4)) /* graph */
6506 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6507 break;
6508 case 'i':
3dab1dad
YO
6509 if (memEQ(posixcc, "asci", 4)) /* ascii */
6510 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6511 break;
6512 case 'k':
3dab1dad
YO
6513 if (memEQ(posixcc, "blan", 4)) /* blank */
6514 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6515 break;
6516 case 'l':
3dab1dad
YO
6517 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6518 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6519 break;
6520 case 'm':
3dab1dad
YO
6521 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6522 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6523 break;
6524 case 'r':
3dab1dad
YO
6525 if (memEQ(posixcc, "lowe", 4)) /* lower */
6526 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6527 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6528 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6529 break;
6530 case 't':
3dab1dad
YO
6531 if (memEQ(posixcc, "digi", 4)) /* digit */
6532 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6533 else if (memEQ(posixcc, "prin", 4)) /* print */
6534 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6535 else if (memEQ(posixcc, "punc", 4)) /* punct */
6536 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6537 break;
b8c5462f
JH
6538 }
6539 break;
80916619 6540 case 6:
3dab1dad
YO
6541 if (memEQ(posixcc, "xdigit", 6))
6542 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6543 break;
6544 }
80916619
NC
6545
6546 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6547 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6548 t - s - 1, s + 1);
80916619
NC
6549 assert (posixcc[skip] == ':');
6550 assert (posixcc[skip+1] == ']');
b45f050a 6551 } else if (!SIZE_ONLY) {
b8c5462f 6552 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6553
830247a4 6554 /* adjust RExC_parse so the warning shows after
b45f050a 6555 the class closes */
9a86a77b 6556 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6557 RExC_parse++;
b45f050a
JF
6558 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6559 }
b8c5462f
JH
6560 } else {
6561 /* Maternal grandfather:
6562 * "[:" ending in ":" but not in ":]" */
830247a4 6563 RExC_parse = s;
767d463e 6564 }
620e46c5
JH
6565 }
6566 }
6567
b8c5462f
JH
6568 return namedclass;
6569}
6570
6571STATIC void
830247a4 6572S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6573{
97aff369 6574 dVAR;
3dab1dad 6575 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6576 const char *s = RExC_parse;
6577 const char c = *s++;
b8c5462f 6578
3dab1dad 6579 while (isALNUM(*s))
b8c5462f
JH
6580 s++;
6581 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6582 if (ckWARN(WARN_REGEXP))
6583 vWARN3(s+2,
6584 "POSIX syntax [%c %c] belongs inside character classes",
6585 c, c);
b45f050a
JF
6586
6587 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6588 if (POSIXCC_NOTYET(c)) {
830247a4 6589 /* adjust RExC_parse so the error shows after
b45f050a 6590 the class closes */
9a86a77b 6591 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6592 NOOP;
b45f050a
JF
6593 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6594 }
b8c5462f
JH
6595 }
6596 }
620e46c5
JH
6597}
6598
7f6f358c
YO
6599
6600/*
6601 parse a class specification and produce either an ANYOF node that
6602 matches the pattern. If the pattern matches a single char only and
6603 that char is < 256 then we produce an EXACT node instead.
6604*/
76e3520e 6605STATIC regnode *
3dab1dad 6606S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6607{
97aff369 6608 dVAR;
9ef43ace 6609 register UV value = 0;
9a86a77b 6610 register UV nextvalue;
3568d838 6611 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6612 register IV range = 0;
c277df42 6613 register regnode *ret;
ba210ebe 6614 STRLEN numlen;
ffc61ed2 6615 IV namedclass;
cbbf8932 6616 char *rangebegin = NULL;
936ed897 6617 bool need_class = 0;
c445ea15 6618 SV *listsv = NULL;
ffc61ed2 6619 UV n;
9e55ce06 6620 bool optimize_invert = TRUE;
cbbf8932 6621 AV* unicode_alternate = NULL;
1b2d223b
JH
6622#ifdef EBCDIC
6623 UV literal_endpoint = 0;
6624#endif
7f6f358c 6625 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6626
3dab1dad 6627 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6628 case we need to change the emitted regop to an EXACT. */
07be1b83 6629 const char * orig_parse = RExC_parse;
72f13be8 6630 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6631#ifndef DEBUGGING
6632 PERL_UNUSED_ARG(depth);
6633#endif
72f13be8 6634
3dab1dad 6635 DEBUG_PARSE("clas");
7f6f358c
YO
6636
6637 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6638 ret = reganode(pRExC_state, ANYOF, 0);
6639
6640 if (!SIZE_ONLY)
6641 ANYOF_FLAGS(ret) = 0;
6642
9a86a77b 6643 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6644 RExC_naughty++;
6645 RExC_parse++;
6646 if (!SIZE_ONLY)
6647 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6648 }
a0d0e21e 6649
73060fc4 6650 if (SIZE_ONLY) {
830247a4 6651 RExC_size += ANYOF_SKIP;
73060fc4
JH
6652 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6653 }
936ed897 6654 else {
830247a4 6655 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6656 if (FOLD)
6657 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6658 if (LOC)
6659 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6660 ANYOF_BITMAP_ZERO(ret);
396482e1 6661 listsv = newSVpvs("# comment\n");
a0d0e21e 6662 }
b8c5462f 6663
9a86a77b
JH
6664 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6665
b938889d 6666 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6667 checkposixcc(pRExC_state);
b8c5462f 6668
f064b6ad
HS
6669 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6670 if (UCHARAT(RExC_parse) == ']')
6671 goto charclassloop;
ffc61ed2 6672
fc8cd66c 6673parseit:
9a86a77b 6674 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6675
6676 charclassloop:
6677
6678 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6679
73b437c8 6680 if (!range)
830247a4 6681 rangebegin = RExC_parse;
ffc61ed2 6682 if (UTF) {
5e12f4fb 6683 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6684 RExC_end - RExC_parse,
9f7f3913 6685 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6686 RExC_parse += numlen;
6687 }
6688 else
6689 value = UCHARAT(RExC_parse++);
7f6f358c 6690
9a86a77b
JH
6691 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6692 if (value == '[' && POSIXCC(nextvalue))
830247a4 6693 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6694 else if (value == '\\') {
ffc61ed2 6695 if (UTF) {
5e12f4fb 6696 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6697 RExC_end - RExC_parse,
9f7f3913 6698 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6699 RExC_parse += numlen;
6700 }
6701 else
6702 value = UCHARAT(RExC_parse++);
470c3474 6703 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6704 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6705 * be a problem later if we want switch on Unicode.
6706 * A similar issue a little bit later when switching on
6707 * namedclass. --jhi */
ffc61ed2 6708 switch ((I32)value) {
b8c5462f
JH
6709 case 'w': namedclass = ANYOF_ALNUM; break;
6710 case 'W': namedclass = ANYOF_NALNUM; break;
6711 case 's': namedclass = ANYOF_SPACE; break;
6712 case 'S': namedclass = ANYOF_NSPACE; break;
6713 case 'd': namedclass = ANYOF_DIGIT; break;
6714 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6715 case 'N': /* Handle \N{NAME} in class */
6716 {
6717 /* We only pay attention to the first char of
6718 multichar strings being returned. I kinda wonder
6719 if this makes sense as it does change the behaviour
6720 from earlier versions, OTOH that behaviour was broken
6721 as well. */
6722 UV v; /* value is register so we cant & it /grrr */
6723 if (reg_namedseq(pRExC_state, &v)) {
6724 goto parseit;
6725 }
6726 value= v;
6727 }
6728 break;
ffc61ed2
JH
6729 case 'p':
6730 case 'P':
3dab1dad
YO
6731 {
6732 char *e;
af6f566e 6733 if (RExC_parse >= RExC_end)
2a4859cd 6734 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6735 if (*RExC_parse == '{') {
1df70142 6736 const U8 c = (U8)value;
ffc61ed2
JH
6737 e = strchr(RExC_parse++, '}');
6738 if (!e)
0da60cf5 6739 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6740 while (isSPACE(UCHARAT(RExC_parse)))
6741 RExC_parse++;
6742 if (e == RExC_parse)
0da60cf5 6743 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6744 n = e - RExC_parse;
ab13f0c7
JH
6745 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6746 n--;
ffc61ed2
JH
6747 }
6748 else {
6749 e = RExC_parse;
6750 n = 1;
6751 }
6752 if (!SIZE_ONLY) {
ab13f0c7
JH
6753 if (UCHARAT(RExC_parse) == '^') {
6754 RExC_parse++;
6755 n--;
6756 value = value == 'p' ? 'P' : 'p'; /* toggle */
6757 while (isSPACE(UCHARAT(RExC_parse))) {
6758 RExC_parse++;
6759 n--;
6760 }
6761 }
097eb12c
AL
6762 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6763 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6764 }
6765 RExC_parse = e + 1;
6766 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6767 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6768 }
f81125e2 6769 break;
b8c5462f
JH
6770 case 'n': value = '\n'; break;
6771 case 'r': value = '\r'; break;
6772 case 't': value = '\t'; break;
6773 case 'f': value = '\f'; break;
6774 case 'b': value = '\b'; break;
c7f1f016
NIS
6775 case 'e': value = ASCII_TO_NATIVE('\033');break;
6776 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6777 case 'x':
ffc61ed2 6778 if (*RExC_parse == '{') {
a4c04bdc
NC
6779 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6780 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6781 char * const e = strchr(RExC_parse++, '}');
b81d288d 6782 if (!e)
ffc61ed2 6783 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6784
6785 numlen = e - RExC_parse;
6786 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6787 RExC_parse = e + 1;
6788 }
6789 else {
a4c04bdc 6790 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6791 numlen = 2;
6792 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6793 RExC_parse += numlen;
6794 }
9e08bc66
TS
6795 if (PL_encoding && value < 0x100)
6796 goto recode_encoding;
b8c5462f
JH
6797 break;
6798 case 'c':
830247a4 6799 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6800 value = toCTRL(value);
6801 break;
6802 case '0': case '1': case '2': case '3': case '4':
6803 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
6804 {
6805 I32 flags = 0;
6806 numlen = 3;
6807 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
6808 RExC_parse += numlen;
6809 if (PL_encoding && value < 0x100)
6810 goto recode_encoding;
6811 break;
6812 }
6813 recode_encoding:
6814 {
6815 SV* enc = PL_encoding;
6816 value = reg_recode((const char)(U8)value, &enc);
6817 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6818 vWARN(RExC_parse,
6819 "Invalid escape in the specified encoding");
6820 break;
6821 }
1028017a 6822 default:
041457d9 6823 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6824 vWARN2(RExC_parse,
6825 "Unrecognized escape \\%c in character class passed through",
6826 (int)value);
1028017a 6827 break;
b8c5462f 6828 }
ffc61ed2 6829 } /* end of \blah */
1b2d223b
JH
6830#ifdef EBCDIC
6831 else
6832 literal_endpoint++;
6833#endif
ffc61ed2
JH
6834
6835 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6836
6837 if (!SIZE_ONLY && !need_class)
936ed897 6838 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6839
936ed897 6840 need_class = 1;
ffc61ed2
JH
6841
6842 /* a bad range like a-\d, a-[:digit:] ? */
6843 if (range) {
73b437c8 6844 if (!SIZE_ONLY) {
afd78fd5 6845 if (ckWARN(WARN_REGEXP)) {
097eb12c 6846 const int w =
afd78fd5
JH
6847 RExC_parse >= rangebegin ?
6848 RExC_parse - rangebegin : 0;
830247a4 6849 vWARN4(RExC_parse,
b45f050a 6850 "False [] range \"%*.*s\"",
097eb12c 6851 w, w, rangebegin);
afd78fd5 6852 }
3568d838
JH
6853 if (prevvalue < 256) {
6854 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6855 ANYOF_BITMAP_SET(ret, '-');
6856 }
6857 else {
6858 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6859 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6860 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6861 }
b8c5462f 6862 }
ffc61ed2
JH
6863
6864 range = 0; /* this was not a true range */
73b437c8 6865 }
ffc61ed2 6866
73b437c8 6867 if (!SIZE_ONLY) {
c49a72a9
NC
6868 const char *what = NULL;
6869 char yesno = 0;
6870
3568d838
JH
6871 if (namedclass > OOB_NAMEDCLASS)
6872 optimize_invert = FALSE;
e2962f66
JH
6873 /* Possible truncation here but in some 64-bit environments
6874 * the compiler gets heartburn about switch on 64-bit values.
6875 * A similar issue a little earlier when switching on value.
98f323fa 6876 * --jhi */
e2962f66 6877 switch ((I32)namedclass) {
73b437c8
JH
6878 case ANYOF_ALNUM:
6879 if (LOC)
936ed897 6880 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6881 else {
6882 for (value = 0; value < 256; value++)
6883 if (isALNUM(value))
936ed897 6884 ANYOF_BITMAP_SET(ret, value);
73b437c8 6885 }
c49a72a9
NC
6886 yesno = '+';
6887 what = "Word";
73b437c8
JH
6888 break;
6889 case ANYOF_NALNUM:
6890 if (LOC)
936ed897 6891 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6892 else {
6893 for (value = 0; value < 256; value++)
6894 if (!isALNUM(value))
936ed897 6895 ANYOF_BITMAP_SET(ret, value);
73b437c8 6896 }
c49a72a9
NC
6897 yesno = '!';
6898 what = "Word";
73b437c8 6899 break;
ffc61ed2 6900 case ANYOF_ALNUMC:
73b437c8 6901 if (LOC)
ffc61ed2 6902 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6903 else {
6904 for (value = 0; value < 256; value++)
ffc61ed2 6905 if (isALNUMC(value))
936ed897 6906 ANYOF_BITMAP_SET(ret, value);
73b437c8 6907 }
c49a72a9
NC
6908 yesno = '+';
6909 what = "Alnum";
73b437c8
JH
6910 break;
6911 case ANYOF_NALNUMC:
6912 if (LOC)
936ed897 6913 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6914 else {
6915 for (value = 0; value < 256; value++)
6916 if (!isALNUMC(value))
936ed897 6917 ANYOF_BITMAP_SET(ret, value);
73b437c8 6918 }
c49a72a9
NC
6919 yesno = '!';
6920 what = "Alnum";
73b437c8
JH
6921 break;
6922 case ANYOF_ALPHA:
6923 if (LOC)
936ed897 6924 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6925 else {
6926 for (value = 0; value < 256; value++)
6927 if (isALPHA(value))
936ed897 6928 ANYOF_BITMAP_SET(ret, value);
73b437c8 6929 }
c49a72a9
NC
6930 yesno = '+';
6931 what = "Alpha";
73b437c8
JH
6932 break;
6933 case ANYOF_NALPHA:
6934 if (LOC)
936ed897 6935 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6936 else {
6937 for (value = 0; value < 256; value++)
6938 if (!isALPHA(value))
936ed897 6939 ANYOF_BITMAP_SET(ret, value);
73b437c8 6940 }
c49a72a9
NC
6941 yesno = '!';
6942 what = "Alpha";
73b437c8
JH
6943 break;
6944 case ANYOF_ASCII:
6945 if (LOC)
936ed897 6946 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6947 else {
c7f1f016 6948#ifndef EBCDIC
1ba5c669
JH
6949 for (value = 0; value < 128; value++)
6950 ANYOF_BITMAP_SET(ret, value);
6951#else /* EBCDIC */
ffbc6a93 6952 for (value = 0; value < 256; value++) {
3a3c4447
JH
6953 if (isASCII(value))
6954 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6955 }
1ba5c669 6956#endif /* EBCDIC */
73b437c8 6957 }
c49a72a9
NC
6958 yesno = '+';
6959 what = "ASCII";
73b437c8
JH
6960 break;
6961 case ANYOF_NASCII:
6962 if (LOC)
936ed897 6963 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6964 else {
c7f1f016 6965#ifndef EBCDIC
1ba5c669
JH
6966 for (value = 128; value < 256; value++)
6967 ANYOF_BITMAP_SET(ret, value);
6968#else /* EBCDIC */
ffbc6a93 6969 for (value = 0; value < 256; value++) {
3a3c4447
JH
6970 if (!isASCII(value))
6971 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6972 }
1ba5c669 6973#endif /* EBCDIC */
73b437c8 6974 }
c49a72a9
NC
6975 yesno = '!';
6976 what = "ASCII";
73b437c8 6977 break;
aaa51d5e
JF
6978 case ANYOF_BLANK:
6979 if (LOC)
6980 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6981 else {
6982 for (value = 0; value < 256; value++)
6983 if (isBLANK(value))
6984 ANYOF_BITMAP_SET(ret, value);
6985 }
c49a72a9
NC
6986 yesno = '+';
6987 what = "Blank";
aaa51d5e
JF
6988 break;
6989 case ANYOF_NBLANK:
6990 if (LOC)
6991 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6992 else {
6993 for (value = 0; value < 256; value++)
6994 if (!isBLANK(value))
6995 ANYOF_BITMAP_SET(ret, value);
6996 }
c49a72a9
NC
6997 yesno = '!';
6998 what = "Blank";
aaa51d5e 6999 break;
73b437c8
JH
7000 case ANYOF_CNTRL:
7001 if (LOC)
936ed897 7002 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
7003 else {
7004 for (value = 0; value < 256; value++)
7005 if (isCNTRL(value))
936ed897 7006 ANYOF_BITMAP_SET(ret, value);
73b437c8 7007 }
c49a72a9
NC
7008 yesno = '+';
7009 what = "Cntrl";
73b437c8
JH
7010 break;
7011 case ANYOF_NCNTRL:
7012 if (LOC)
936ed897 7013 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
7014 else {
7015 for (value = 0; value < 256; value++)
7016 if (!isCNTRL(value))
936ed897 7017 ANYOF_BITMAP_SET(ret, value);
73b437c8 7018 }
c49a72a9
NC
7019 yesno = '!';
7020 what = "Cntrl";
ffc61ed2
JH
7021 break;
7022 case ANYOF_DIGIT:
7023 if (LOC)
7024 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7025 else {
7026 /* consecutive digits assumed */
7027 for (value = '0'; value <= '9'; value++)
7028 ANYOF_BITMAP_SET(ret, value);
7029 }
c49a72a9
NC
7030 yesno = '+';
7031 what = "Digit";
ffc61ed2
JH
7032 break;
7033 case ANYOF_NDIGIT:
7034 if (LOC)
7035 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7036 else {
7037 /* consecutive digits assumed */
7038 for (value = 0; value < '0'; value++)
7039 ANYOF_BITMAP_SET(ret, value);
7040 for (value = '9' + 1; value < 256; value++)
7041 ANYOF_BITMAP_SET(ret, value);
7042 }
c49a72a9
NC
7043 yesno = '!';
7044 what = "Digit";
73b437c8
JH
7045 break;
7046 case ANYOF_GRAPH:
7047 if (LOC)
936ed897 7048 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
7049 else {
7050 for (value = 0; value < 256; value++)
7051 if (isGRAPH(value))
936ed897 7052 ANYOF_BITMAP_SET(ret, value);
73b437c8 7053 }
c49a72a9
NC
7054 yesno = '+';
7055 what = "Graph";
73b437c8
JH
7056 break;
7057 case ANYOF_NGRAPH:
7058 if (LOC)
936ed897 7059 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7060 else {
7061 for (value = 0; value < 256; value++)
7062 if (!isGRAPH(value))
936ed897 7063 ANYOF_BITMAP_SET(ret, value);
73b437c8 7064 }
c49a72a9
NC
7065 yesno = '!';
7066 what = "Graph";
73b437c8
JH
7067 break;
7068 case ANYOF_LOWER:
7069 if (LOC)
936ed897 7070 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7071 else {
7072 for (value = 0; value < 256; value++)
7073 if (isLOWER(value))
936ed897 7074 ANYOF_BITMAP_SET(ret, value);
73b437c8 7075 }
c49a72a9
NC
7076 yesno = '+';
7077 what = "Lower";
73b437c8
JH
7078 break;
7079 case ANYOF_NLOWER:
7080 if (LOC)
936ed897 7081 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7082 else {
7083 for (value = 0; value < 256; value++)
7084 if (!isLOWER(value))
936ed897 7085 ANYOF_BITMAP_SET(ret, value);
73b437c8 7086 }
c49a72a9
NC
7087 yesno = '!';
7088 what = "Lower";
73b437c8
JH
7089 break;
7090 case ANYOF_PRINT:
7091 if (LOC)
936ed897 7092 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7093 else {
7094 for (value = 0; value < 256; value++)
7095 if (isPRINT(value))
936ed897 7096 ANYOF_BITMAP_SET(ret, value);
73b437c8 7097 }
c49a72a9
NC
7098 yesno = '+';
7099 what = "Print";
73b437c8
JH
7100 break;
7101 case ANYOF_NPRINT:
7102 if (LOC)
936ed897 7103 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7104 else {
7105 for (value = 0; value < 256; value++)
7106 if (!isPRINT(value))
936ed897 7107 ANYOF_BITMAP_SET(ret, value);
73b437c8 7108 }
c49a72a9
NC
7109 yesno = '!';
7110 what = "Print";
73b437c8 7111 break;
aaa51d5e
JF
7112 case ANYOF_PSXSPC:
7113 if (LOC)
7114 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7115 else {
7116 for (value = 0; value < 256; value++)
7117 if (isPSXSPC(value))
7118 ANYOF_BITMAP_SET(ret, value);
7119 }
c49a72a9
NC
7120 yesno = '+';
7121 what = "Space";
aaa51d5e
JF
7122 break;
7123 case ANYOF_NPSXSPC:
7124 if (LOC)
7125 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7126 else {
7127 for (value = 0; value < 256; value++)
7128 if (!isPSXSPC(value))
7129 ANYOF_BITMAP_SET(ret, value);
7130 }
c49a72a9
NC
7131 yesno = '!';
7132 what = "Space";
aaa51d5e 7133 break;
73b437c8
JH
7134 case ANYOF_PUNCT:
7135 if (LOC)
936ed897 7136 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7137 else {
7138 for (value = 0; value < 256; value++)
7139 if (isPUNCT(value))
936ed897 7140 ANYOF_BITMAP_SET(ret, value);
73b437c8 7141 }
c49a72a9
NC
7142 yesno = '+';
7143 what = "Punct";
73b437c8
JH
7144 break;
7145 case ANYOF_NPUNCT:
7146 if (LOC)
936ed897 7147 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7148 else {
7149 for (value = 0; value < 256; value++)
7150 if (!isPUNCT(value))
936ed897 7151 ANYOF_BITMAP_SET(ret, value);
73b437c8 7152 }
c49a72a9
NC
7153 yesno = '!';
7154 what = "Punct";
ffc61ed2
JH
7155 break;
7156 case ANYOF_SPACE:
7157 if (LOC)
7158 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7159 else {
7160 for (value = 0; value < 256; value++)
7161 if (isSPACE(value))
7162 ANYOF_BITMAP_SET(ret, value);
7163 }
c49a72a9
NC
7164 yesno = '+';
7165 what = "SpacePerl";
ffc61ed2
JH
7166 break;
7167 case ANYOF_NSPACE:
7168 if (LOC)
7169 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7170 else {
7171 for (value = 0; value < 256; value++)
7172 if (!isSPACE(value))
7173 ANYOF_BITMAP_SET(ret, value);
7174 }
c49a72a9
NC
7175 yesno = '!';
7176 what = "SpacePerl";
73b437c8
JH
7177 break;
7178 case ANYOF_UPPER:
7179 if (LOC)
936ed897 7180 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7181 else {
7182 for (value = 0; value < 256; value++)
7183 if (isUPPER(value))
936ed897 7184 ANYOF_BITMAP_SET(ret, value);
73b437c8 7185 }
c49a72a9
NC
7186 yesno = '+';
7187 what = "Upper";
73b437c8
JH
7188 break;
7189 case ANYOF_NUPPER:
7190 if (LOC)
936ed897 7191 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7192 else {
7193 for (value = 0; value < 256; value++)
7194 if (!isUPPER(value))
936ed897 7195 ANYOF_BITMAP_SET(ret, value);
73b437c8 7196 }
c49a72a9
NC
7197 yesno = '!';
7198 what = "Upper";
73b437c8
JH
7199 break;
7200 case ANYOF_XDIGIT:
7201 if (LOC)
936ed897 7202 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7203 else {
7204 for (value = 0; value < 256; value++)
7205 if (isXDIGIT(value))
936ed897 7206 ANYOF_BITMAP_SET(ret, value);
73b437c8 7207 }
c49a72a9
NC
7208 yesno = '+';
7209 what = "XDigit";
73b437c8
JH
7210 break;
7211 case ANYOF_NXDIGIT:
7212 if (LOC)
936ed897 7213 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7214 else {
7215 for (value = 0; value < 256; value++)
7216 if (!isXDIGIT(value))
936ed897 7217 ANYOF_BITMAP_SET(ret, value);
73b437c8 7218 }
c49a72a9
NC
7219 yesno = '!';
7220 what = "XDigit";
73b437c8 7221 break;
f81125e2
JP
7222 case ANYOF_MAX:
7223 /* this is to handle \p and \P */
7224 break;
73b437c8 7225 default:
b45f050a 7226 vFAIL("Invalid [::] class");
73b437c8 7227 break;
b8c5462f 7228 }
c49a72a9
NC
7229 if (what) {
7230 /* Strings such as "+utf8::isWord\n" */
7231 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7232 }
b8c5462f 7233 if (LOC)
936ed897 7234 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7235 continue;
a0d0e21e 7236 }
ffc61ed2
JH
7237 } /* end of namedclass \blah */
7238
a0d0e21e 7239 if (range) {
eb160463 7240 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7241 const int w = RExC_parse - rangebegin;
7242 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7243 range = 0; /* not a valid range */
73b437c8 7244 }
a0d0e21e
LW
7245 }
7246 else {
3568d838 7247 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7248 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7249 RExC_parse[1] != ']') {
7250 RExC_parse++;
ffc61ed2
JH
7251
7252 /* a bad range like \w-, [:word:]- ? */
7253 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7254 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7255 const int w =
afd78fd5
JH
7256 RExC_parse >= rangebegin ?
7257 RExC_parse - rangebegin : 0;
830247a4 7258 vWARN4(RExC_parse,
b45f050a 7259 "False [] range \"%*.*s\"",
097eb12c 7260 w, w, rangebegin);
afd78fd5 7261 }
73b437c8 7262 if (!SIZE_ONLY)
936ed897 7263 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7264 } else
ffc61ed2
JH
7265 range = 1; /* yeah, it's a range! */
7266 continue; /* but do it the next time */
a0d0e21e 7267 }
a687059c 7268 }
ffc61ed2 7269
93733859 7270 /* now is the next time */
07be1b83 7271 /*stored += (value - prevvalue + 1);*/
ae5c130c 7272 if (!SIZE_ONLY) {
3568d838 7273 if (prevvalue < 256) {
1df70142 7274 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7275 IV i;
3568d838 7276#ifdef EBCDIC
1b2d223b
JH
7277 /* In EBCDIC [\x89-\x91] should include
7278 * the \x8e but [i-j] should not. */
7279 if (literal_endpoint == 2 &&
7280 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7281 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7282 {
3568d838
JH
7283 if (isLOWER(prevvalue)) {
7284 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7285 if (isLOWER(i))
7286 ANYOF_BITMAP_SET(ret, i);
7287 } else {
3568d838 7288 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7289 if (isUPPER(i))
7290 ANYOF_BITMAP_SET(ret, i);
7291 }
8ada0baa 7292 }
ffc61ed2 7293 else
8ada0baa 7294#endif
07be1b83
YO
7295 for (i = prevvalue; i <= ceilvalue; i++) {
7296 if (!ANYOF_BITMAP_TEST(ret,i)) {
7297 stored++;
7298 ANYOF_BITMAP_SET(ret, i);
7299 }
7300 }
3568d838 7301 }
a5961de5 7302 if (value > 255 || UTF) {
1df70142
AL
7303 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7304 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7305 stored+=2; /* can't optimize this class */
ffc61ed2 7306 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7307 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7308 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7309 prevnatvalue, natvalue);
7310 }
7311 else if (prevnatvalue == natvalue) {
7312 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7313 if (FOLD) {
89ebb4a3 7314 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7315 STRLEN foldlen;
1df70142 7316 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7317
e294cc5d
JH
7318#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7319 if (RExC_precomp[0] == ':' &&
7320 RExC_precomp[1] == '[' &&
7321 (f == 0xDF || f == 0x92)) {
7322 f = NATIVE_TO_UNI(f);
7323 }
7324#endif
c840d2a2
JH
7325 /* If folding and foldable and a single
7326 * character, insert also the folded version
7327 * to the charclass. */
9e55ce06 7328 if (f != value) {
e294cc5d
JH
7329#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7330 if ((RExC_precomp[0] == ':' &&
7331 RExC_precomp[1] == '[' &&
7332 (f == 0xA2 &&
7333 (value == 0xFB05 || value == 0xFB06))) ?
7334 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7335 foldlen == (STRLEN)UNISKIP(f) )
7336#else
eb160463 7337 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7338#endif
9e55ce06
JH
7339 Perl_sv_catpvf(aTHX_ listsv,
7340 "%04"UVxf"\n", f);
7341 else {
7342 /* Any multicharacter foldings
7343 * require the following transform:
7344 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7345 * where E folds into "pq" and F folds
7346 * into "rst", all other characters
7347 * fold to single characters. We save
7348 * away these multicharacter foldings,
7349 * to be later saved as part of the
7350 * additional "s" data. */
7351 SV *sv;
7352
7353 if (!unicode_alternate)
7354 unicode_alternate = newAV();
7355 sv = newSVpvn((char*)foldbuf, foldlen);
7356 SvUTF8_on(sv);
7357 av_push(unicode_alternate, sv);
7358 }
7359 }
254ba52a 7360
60a8b682
JH
7361 /* If folding and the value is one of the Greek
7362 * sigmas insert a few more sigmas to make the
7363 * folding rules of the sigmas to work right.
7364 * Note that not all the possible combinations
7365 * are handled here: some of them are handled
9e55ce06
JH
7366 * by the standard folding rules, and some of
7367 * them (literal or EXACTF cases) are handled
7368 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7369 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7370 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7371 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7372 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7373 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7374 }
7375 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7376 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7377 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7378 }
7379 }
ffc61ed2 7380 }
1b2d223b
JH
7381#ifdef EBCDIC
7382 literal_endpoint = 0;
7383#endif
8ada0baa 7384 }
ffc61ed2
JH
7385
7386 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7387 }
ffc61ed2 7388
936ed897 7389 if (need_class) {
4f66b38d 7390 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7391 if (SIZE_ONLY)
830247a4 7392 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7393 else
830247a4 7394 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7395 }
ffc61ed2 7396
7f6f358c
YO
7397
7398 if (SIZE_ONLY)
7399 return ret;
7400 /****** !SIZE_ONLY AFTER HERE *********/
7401
7402 if( stored == 1 && value < 256
7403 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7404 ) {
7405 /* optimize single char class to an EXACT node
7406 but *only* when its not a UTF/high char */
07be1b83
YO
7407 const char * cur_parse= RExC_parse;
7408 RExC_emit = (regnode *)orig_emit;
7409 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7410 ret = reg_node(pRExC_state,
7411 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7412 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7413 *STRING(ret)= (char)value;
7414 STR_LEN(ret)= 1;
7415 RExC_emit += STR_SZ(1);
7416 return ret;
7417 }
ae5c130c 7418 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7419 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7420 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7421 ) {
a0ed51b3 7422 for (value = 0; value < 256; ++value) {
936ed897 7423 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7424 UV fold = PL_fold[value];
ffc61ed2
JH
7425
7426 if (fold != value)
7427 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7428 }
7429 }
936ed897 7430 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7431 }
ffc61ed2 7432
ae5c130c 7433 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7434 if (optimize_invert &&
ffc61ed2
JH
7435 /* If the only flag is inversion. */
7436 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7437 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7438 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7439 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7440 }
7f6f358c 7441 {
097eb12c 7442 AV * const av = newAV();
ffc61ed2 7443 SV *rv;
9e55ce06 7444 /* The 0th element stores the character class description
6a0407ee 7445 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7446 * to initialize the appropriate swash (which gets stored in
7447 * the 1st element), and also useful for dumping the regnode.
7448 * The 2nd element stores the multicharacter foldings,
6a0407ee 7449 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7450 av_store(av, 0, listsv);
7451 av_store(av, 1, NULL);
9e55ce06 7452 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7453 rv = newRV_noinc((SV*)av);
19860706 7454 n = add_data(pRExC_state, 1, "s");
830247a4 7455 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7456 ARG_SET(ret, n);
a0ed51b3 7457 }
a0ed51b3
LW
7458 return ret;
7459}
7460
76e3520e 7461STATIC char*
830247a4 7462S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7463{
097eb12c 7464 char* const retval = RExC_parse++;
a0d0e21e 7465
4633a7c4 7466 for (;;) {
830247a4
IZ
7467 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7468 RExC_parse[2] == '#') {
e994fd66
AE
7469 while (*RExC_parse != ')') {
7470 if (RExC_parse == RExC_end)
7471 FAIL("Sequence (?#... not terminated");
830247a4 7472 RExC_parse++;
e994fd66 7473 }
830247a4 7474 RExC_parse++;
4633a7c4
LW
7475 continue;
7476 }
e2509266 7477 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7478 if (isSPACE(*RExC_parse)) {
7479 RExC_parse++;
748a9306
LW
7480 continue;
7481 }
830247a4 7482 else if (*RExC_parse == '#') {
e994fd66
AE
7483 while (RExC_parse < RExC_end)
7484 if (*RExC_parse++ == '\n') break;
748a9306
LW
7485 continue;
7486 }
748a9306 7487 }
4633a7c4 7488 return retval;
a0d0e21e 7489 }
a687059c
LW
7490}
7491
7492/*
c277df42 7493- reg_node - emit a node
a0d0e21e 7494*/
76e3520e 7495STATIC regnode * /* Location. */
830247a4 7496S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7497{
97aff369 7498 dVAR;
c277df42 7499 register regnode *ptr;
504618e9 7500 regnode * const ret = RExC_emit;
07be1b83 7501 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7502
c277df42 7503 if (SIZE_ONLY) {
830247a4
IZ
7504 SIZE_ALIGN(RExC_size);
7505 RExC_size += 1;
a0d0e21e
LW
7506 return(ret);
7507 }
c277df42 7508 NODE_ALIGN_FILL(ret);
a0d0e21e 7509 ptr = ret;
c277df42 7510 FILL_ADVANCE_NODE(ptr, op);
fac92740 7511 if (RExC_offsets) { /* MJD */
07be1b83 7512 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7513 "reg_node", __LINE__,
7514 reg_name[op],
07be1b83
YO
7515 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7516 ? "Overwriting end of array!\n" : "OK",
7517 (UV)(RExC_emit - RExC_emit_start),
7518 (UV)(RExC_parse - RExC_start),
7519 (UV)RExC_offsets[0]));
ccb2c380 7520 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7521 }
07be1b83 7522
830247a4 7523 RExC_emit = ptr;
a687059c 7524
a0d0e21e 7525 return(ret);
a687059c
LW
7526}
7527
7528/*
a0d0e21e
LW
7529- reganode - emit a node with an argument
7530*/
76e3520e 7531STATIC regnode * /* Location. */
830247a4 7532S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7533{
97aff369 7534 dVAR;
c277df42 7535 register regnode *ptr;
504618e9 7536 regnode * const ret = RExC_emit;
07be1b83 7537 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7538
c277df42 7539 if (SIZE_ONLY) {
830247a4
IZ
7540 SIZE_ALIGN(RExC_size);
7541 RExC_size += 2;
6bda09f9
YO
7542 /*
7543 We can't do this:
7544
7545 assert(2==regarglen[op]+1);
7546
7547 Anything larger than this has to allocate the extra amount.
7548 If we changed this to be:
7549
7550 RExC_size += (1 + regarglen[op]);
7551
7552 then it wouldn't matter. Its not clear what side effect
7553 might come from that so its not done so far.
7554 -- dmq
7555 */
a0d0e21e
LW
7556 return(ret);
7557 }
fe14fcc3 7558
c277df42 7559 NODE_ALIGN_FILL(ret);
a0d0e21e 7560 ptr = ret;
c277df42 7561 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7562 if (RExC_offsets) { /* MJD */
07be1b83 7563 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7564 "reganode",
ccb2c380
MP
7565 __LINE__,
7566 reg_name[op],
07be1b83 7567 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7568 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7569 (UV)(RExC_emit - RExC_emit_start),
7570 (UV)(RExC_parse - RExC_start),
7571 (UV)RExC_offsets[0]));
ccb2c380 7572 Set_Cur_Node_Offset;
fac92740
MJD
7573 }
7574
830247a4 7575 RExC_emit = ptr;
fe14fcc3 7576
a0d0e21e 7577 return(ret);
fe14fcc3
LW
7578}
7579
7580/*
cd439c50 7581- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7582*/
71207a34
AL
7583STATIC STRLEN
7584S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7585{
97aff369 7586 dVAR;
71207a34 7587 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7588}
7589
7590/*
a0d0e21e
LW
7591- reginsert - insert an operator in front of already-emitted operand
7592*
7593* Means relocating the operand.
7594*/
76e3520e 7595STATIC void
6bda09f9 7596S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7597{
97aff369 7598 dVAR;
c277df42
IZ
7599 register regnode *src;
7600 register regnode *dst;
7601 register regnode *place;
504618e9 7602 const int offset = regarglen[(U8)op];
6bda09f9 7603 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7604 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7605/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7606 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7607 if (SIZE_ONLY) {
6bda09f9 7608 RExC_size += size;
a0d0e21e
LW
7609 return;
7610 }
a687059c 7611
830247a4 7612 src = RExC_emit;
6bda09f9 7613 RExC_emit += size;
830247a4 7614 dst = RExC_emit;
40d049e4 7615 if (RExC_open_parens) {
6bda09f9 7616 int paren;
296c5fee 7617 DEBUG_PARSE_FMT("inst"," - %"IVdf,RExC_npar);
6bda09f9 7618 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7619 if ( RExC_open_parens[paren] >= opnd ) {
7620 DEBUG_PARSE_FMT("open"," - %d",size);
7621 RExC_open_parens[paren] += size;
7622 } else {
7623 DEBUG_PARSE_FMT("open"," - %s","ok");
7624 }
7625 if ( RExC_close_parens[paren] >= opnd ) {
7626 DEBUG_PARSE_FMT("close"," - %d",size);
7627 RExC_close_parens[paren] += size;
7628 } else {
7629 DEBUG_PARSE_FMT("close"," - %s","ok");
7630 }
7631 }
6bda09f9 7632 }
40d049e4 7633
fac92740 7634 while (src > opnd) {
c277df42 7635 StructCopy(--src, --dst, regnode);
fac92740 7636 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7637 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7638 "reg_insert",
ccb2c380
MP
7639 __LINE__,
7640 reg_name[op],
07be1b83
YO
7641 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7642 ? "Overwriting end of array!\n" : "OK",
7643 (UV)(src - RExC_emit_start),
7644 (UV)(dst - RExC_emit_start),
7645 (UV)RExC_offsets[0]));
ccb2c380
MP
7646 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7647 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7648 }
7649 }
7650
a0d0e21e
LW
7651
7652 place = opnd; /* Op node, where operand used to be. */
fac92740 7653 if (RExC_offsets) { /* MJD */
07be1b83 7654 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7655 "reginsert",
ccb2c380
MP
7656 __LINE__,
7657 reg_name[op],
07be1b83 7658 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7659 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7660 (UV)(place - RExC_emit_start),
7661 (UV)(RExC_parse - RExC_start),
786e8c11 7662 (UV)RExC_offsets[0]));
ccb2c380 7663 Set_Node_Offset(place, RExC_parse);
45948336 7664 Set_Node_Length(place, 1);
fac92740 7665 }
c277df42
IZ
7666 src = NEXTOPER(place);
7667 FILL_ADVANCE_NODE(place, op);
7668 Zero(src, offset, regnode);
a687059c
LW
7669}
7670
7671/*
c277df42 7672- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7673- SEE ALSO: regtail_study
a0d0e21e 7674*/
097eb12c 7675/* TODO: All three parms should be const */
76e3520e 7676STATIC void
3dab1dad 7677S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7678{
97aff369 7679 dVAR;
c277df42 7680 register regnode *scan;
72f13be8 7681 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7682#ifndef DEBUGGING
7683 PERL_UNUSED_ARG(depth);
7684#endif
a0d0e21e 7685
c277df42 7686 if (SIZE_ONLY)
a0d0e21e
LW
7687 return;
7688
7689 /* Find last node. */
7690 scan = p;
7691 for (;;) {
504618e9 7692 regnode * const temp = regnext(scan);
3dab1dad
YO
7693 DEBUG_PARSE_r({
7694 SV * const mysv=sv_newmortal();
7695 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7696 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7697 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7698 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7699 (temp == NULL ? "->" : ""),
7700 (temp == NULL ? reg_name[OP(val)] : "")
7701 );
3dab1dad
YO
7702 });
7703 if (temp == NULL)
7704 break;
7705 scan = temp;
7706 }
7707
7708 if (reg_off_by_arg[OP(scan)]) {
7709 ARG_SET(scan, val - scan);
7710 }
7711 else {
7712 NEXT_OFF(scan) = val - scan;
7713 }
7714}
7715
07be1b83 7716#ifdef DEBUGGING
3dab1dad
YO
7717/*
7718- regtail_study - set the next-pointer at the end of a node chain of p to val.
7719- Look for optimizable sequences at the same time.
7720- currently only looks for EXACT chains.
07be1b83
YO
7721
7722This is expermental code. The idea is to use this routine to perform
7723in place optimizations on branches and groups as they are constructed,
7724with the long term intention of removing optimization from study_chunk so
7725that it is purely analytical.
7726
7727Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7728to control which is which.
7729
3dab1dad
YO
7730*/
7731/* TODO: All four parms should be const */
07be1b83 7732
3dab1dad
YO
7733STATIC U8
7734S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7735{
7736 dVAR;
7737 register regnode *scan;
07be1b83
YO
7738 U8 exact = PSEUDO;
7739#ifdef EXPERIMENTAL_INPLACESCAN
7740 I32 min = 0;
7741#endif
7742
3dab1dad
YO
7743 GET_RE_DEBUG_FLAGS_DECL;
7744
07be1b83 7745
3dab1dad
YO
7746 if (SIZE_ONLY)
7747 return exact;
7748
7749 /* Find last node. */
7750
7751 scan = p;
7752 for (;;) {
7753 regnode * const temp = regnext(scan);
07be1b83
YO
7754#ifdef EXPERIMENTAL_INPLACESCAN
7755 if (PL_regkind[OP(scan)] == EXACT)
7756 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7757 return EXACT;
7758#endif
3dab1dad
YO
7759 if ( exact ) {
7760 switch (OP(scan)) {
7761 case EXACT:
7762 case EXACTF:
7763 case EXACTFL:
7764 if( exact == PSEUDO )
7765 exact= OP(scan);
07be1b83
YO
7766 else if ( exact != OP(scan) )
7767 exact= 0;
3dab1dad
YO
7768 case NOTHING:
7769 break;
7770 default:
7771 exact= 0;
7772 }
7773 }
7774 DEBUG_PARSE_r({
7775 SV * const mysv=sv_newmortal();
7776 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7777 regprop(RExC_rx, mysv, scan);
eaf3ca90 7778 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7779 SvPV_nolen_const(mysv),
eaf3ca90
YO
7780 REG_NODE_NUM(scan),
7781 reg_name[exact]);
3dab1dad 7782 });
a0d0e21e
LW
7783 if (temp == NULL)
7784 break;
7785 scan = temp;
7786 }
07be1b83
YO
7787 DEBUG_PARSE_r({
7788 SV * const mysv_val=sv_newmortal();
7789 DEBUG_PARSE_MSG("");
7790 regprop(RExC_rx, mysv_val, val);
7791 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7792 SvPV_nolen_const(mysv_val),
7793 REG_NODE_NUM(val),
7794 val - scan
7795 );
7796 });
c277df42
IZ
7797 if (reg_off_by_arg[OP(scan)]) {
7798 ARG_SET(scan, val - scan);
a0ed51b3
LW
7799 }
7800 else {
c277df42
IZ
7801 NEXT_OFF(scan) = val - scan;
7802 }
3dab1dad
YO
7803
7804 return exact;
a687059c 7805}
07be1b83 7806#endif
a687059c
LW
7807
7808/*
a687059c
LW
7809 - regcurly - a little FSA that accepts {\d+,?\d*}
7810 */
79072805 7811STATIC I32
5f66b61c 7812S_regcurly(register const char *s)
a687059c
LW
7813{
7814 if (*s++ != '{')
7815 return FALSE;
f0fcb552 7816 if (!isDIGIT(*s))
a687059c 7817 return FALSE;
f0fcb552 7818 while (isDIGIT(*s))
a687059c
LW
7819 s++;
7820 if (*s == ',')
7821 s++;
f0fcb552 7822 while (isDIGIT(*s))
a687059c
LW
7823 s++;
7824 if (*s != '}')
7825 return FALSE;
7826 return TRUE;
7827}
7828
a687059c
LW
7829
7830/*
fd181c75 7831 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7832 */
7833void
097eb12c 7834Perl_regdump(pTHX_ const regexp *r)
a687059c 7835{
35ff7856 7836#ifdef DEBUGGING
97aff369 7837 dVAR;
c445ea15 7838 SV * const sv = sv_newmortal();
ab3bbdeb 7839 SV *dsv= sv_newmortal();
a687059c 7840
786e8c11 7841 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7842
7843 /* Header fields of interest. */
ab3bbdeb
YO
7844 if (r->anchored_substr) {
7845 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7846 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7847 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7848 "anchored %s%s at %"IVdf" ",
7849 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7850 (IV)r->anchored_offset);
ab3bbdeb
YO
7851 } else if (r->anchored_utf8) {
7852 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7853 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7854 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7855 "anchored utf8 %s%s at %"IVdf" ",
7856 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7857 (IV)r->anchored_offset);
ab3bbdeb
YO
7858 }
7859 if (r->float_substr) {
7860 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7861 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7862 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7863 "floating %s%s at %"IVdf"..%"UVuf" ",
7864 s, RE_SV_TAIL(r->float_substr),
7b0972df 7865 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7866 } else if (r->float_utf8) {
7867 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7868 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7869 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7870 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7871 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7872 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7873 }
33b8afdf 7874 if (r->check_substr || r->check_utf8)
b81d288d 7875 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7876 (const char *)
7877 (r->check_substr == r->float_substr
7878 && r->check_utf8 == r->float_utf8
7879 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7880 if (r->reganch & ROPT_NOSCAN)
7881 PerlIO_printf(Perl_debug_log, " noscan");
7882 if (r->reganch & ROPT_CHECK_ALL)
7883 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7884 if (r->check_substr || r->check_utf8)
c277df42
IZ
7885 PerlIO_printf(Perl_debug_log, ") ");
7886
46fc3d4c 7887 if (r->regstclass) {
32fc9b6a 7888 regprop(r, sv, r->regstclass);
1de06328 7889 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7890 }
774d564b 7891 if (r->reganch & ROPT_ANCH) {
7892 PerlIO_printf(Perl_debug_log, "anchored");
7893 if (r->reganch & ROPT_ANCH_BOL)
7894 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7895 if (r->reganch & ROPT_ANCH_MBOL)
7896 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7897 if (r->reganch & ROPT_ANCH_SBOL)
7898 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7899 if (r->reganch & ROPT_ANCH_GPOS)
7900 PerlIO_printf(Perl_debug_log, "(GPOS)");
7901 PerlIO_putc(Perl_debug_log, ' ');
7902 }
c277df42
IZ
7903 if (r->reganch & ROPT_GPOS_SEEN)
7904 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7905 if (r->reganch & ROPT_SKIP)
760ac839 7906 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7907 if (r->reganch & ROPT_IMPLICIT)
760ac839 7908 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7909 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7910 if (r->reganch & ROPT_EVAL_SEEN)
7911 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7912 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7913#else
96a5add6 7914 PERL_UNUSED_CONTEXT;
65e66c80 7915 PERL_UNUSED_ARG(r);
17c3b450 7916#endif /* DEBUGGING */
a687059c
LW
7917}
7918
7919/*
a0d0e21e
LW
7920- regprop - printable representation of opcode
7921*/
46fc3d4c 7922void
32fc9b6a 7923Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7924{
35ff7856 7925#ifdef DEBUGGING
97aff369 7926 dVAR;
9b155405 7927 register int k;
1de06328 7928 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7929
54dc92de 7930 sv_setpvn(sv, "", 0);
03363afd 7931 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
7932 /* It would be nice to FAIL() here, but this may be called from
7933 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 7934 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 7935 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7936
3dab1dad 7937 k = PL_regkind[OP(o)];
9b155405 7938
2a782b5b 7939 if (k == EXACT) {
396482e1 7940 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7941 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7942 * is a crude hack but it may be the best for now since
7943 * we have no flag "this EXACTish node was UTF-8"
7944 * --jhi */
7945 const char * const s =
ddc5bc0f 7946 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7947 PL_colors[0], PL_colors[1],
7948 PERL_PV_ESCAPE_UNI_DETECT |
7949 PERL_PV_PRETTY_ELIPSES |
7950 PERL_PV_PRETTY_LTGT
7951 );
7952 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7953 } else if (k == TRIE) {
3dab1dad 7954 /* print the details of the trie in dumpuntil instead, as
4f639d21 7955 * prog->data isn't available here */
1de06328
YO
7956 const char op = OP(o);
7957 const I32 n = ARG(o);
7958 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7959 (reg_ac_data *)prog->data->data[n] :
7960 NULL;
7961 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7962 (reg_trie_data*)prog->data->data[n] :
7963 ac->trie;
7964
7965 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7966 DEBUG_TRIE_COMPILE_r(
7967 Perl_sv_catpvf(aTHX_ sv,
7968 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7969 (UV)trie->startstate,
1e2e3d02 7970 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
7971 (UV)trie->wordcount,
7972 (UV)trie->minlen,
7973 (UV)trie->maxlen,
7974 (UV)TRIE_CHARCOUNT(trie),
7975 (UV)trie->uniquecharcount
7976 )
7977 );
7978 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7979 int i;
7980 int rangestart = -1;
f46cb337 7981 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7982 Perl_sv_catpvf(aTHX_ sv, "[");
7983 for (i = 0; i <= 256; i++) {
7984 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7985 if (rangestart == -1)
7986 rangestart = i;
7987 } else if (rangestart != -1) {
7988 if (i <= rangestart + 3)
7989 for (; rangestart < i; rangestart++)
7990 put_byte(sv, rangestart);
7991 else {
7992 put_byte(sv, rangestart);
7993 sv_catpvs(sv, "-");
7994 put_byte(sv, i - 1);
7995 }
7996 rangestart = -1;
7997 }
7998 }
7999 Perl_sv_catpvf(aTHX_ sv, "]");
8000 }
8001
a3621e74 8002 } else if (k == CURLY) {
cb434fcc 8003 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8004 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8005 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8006 }
2c2d71f5
JH
8007 else if (k == WHILEM && o->flags) /* Ordinal/of */
8008 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6bda09f9 8009 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
894356b3 8010 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 8011 else if (k == GOSUB)
6bda09f9 8012 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9b155405 8013 else if (k == LOGICAL)
04ebc1ab 8014 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8015 else if (k == ANYOF) {
8016 int i, rangestart = -1;
2d03de9c 8017 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8018
8019 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8020 static const char * const anyofs[] = {
653099ff
GS
8021 "\\w",
8022 "\\W",
8023 "\\s",
8024 "\\S",
8025 "\\d",
8026 "\\D",
8027 "[:alnum:]",
8028 "[:^alnum:]",
8029 "[:alpha:]",
8030 "[:^alpha:]",
8031 "[:ascii:]",
8032 "[:^ascii:]",
8033 "[:ctrl:]",
8034 "[:^ctrl:]",
8035 "[:graph:]",
8036 "[:^graph:]",
8037 "[:lower:]",
8038 "[:^lower:]",
8039 "[:print:]",
8040 "[:^print:]",
8041 "[:punct:]",
8042 "[:^punct:]",
8043 "[:upper:]",
aaa51d5e 8044 "[:^upper:]",
653099ff 8045 "[:xdigit:]",
aaa51d5e
JF
8046 "[:^xdigit:]",
8047 "[:space:]",
8048 "[:^space:]",
8049 "[:blank:]",
8050 "[:^blank:]"
653099ff
GS
8051 };
8052
19860706 8053 if (flags & ANYOF_LOCALE)
396482e1 8054 sv_catpvs(sv, "{loc}");
19860706 8055 if (flags & ANYOF_FOLD)
396482e1 8056 sv_catpvs(sv, "{i}");
653099ff 8057 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8058 if (flags & ANYOF_INVERT)
396482e1 8059 sv_catpvs(sv, "^");
ffc61ed2
JH
8060 for (i = 0; i <= 256; i++) {
8061 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8062 if (rangestart == -1)
8063 rangestart = i;
8064 } else if (rangestart != -1) {
8065 if (i <= rangestart + 3)
8066 for (; rangestart < i; rangestart++)
653099ff 8067 put_byte(sv, rangestart);
ffc61ed2
JH
8068 else {
8069 put_byte(sv, rangestart);
396482e1 8070 sv_catpvs(sv, "-");
ffc61ed2 8071 put_byte(sv, i - 1);
653099ff 8072 }
ffc61ed2 8073 rangestart = -1;
653099ff 8074 }
847a199f 8075 }
ffc61ed2
JH
8076
8077 if (o->flags & ANYOF_CLASS)
bb7a0f54 8078 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8079 if (ANYOF_CLASS_TEST(o,i))
8080 sv_catpv(sv, anyofs[i]);
8081
8082 if (flags & ANYOF_UNICODE)
396482e1 8083 sv_catpvs(sv, "{unicode}");
1aa99e6b 8084 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8085 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8086
8087 {
8088 SV *lv;
32fc9b6a 8089 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8090
ffc61ed2
JH
8091 if (lv) {
8092 if (sw) {
89ebb4a3 8093 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8094
ffc61ed2 8095 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8096 uvchr_to_utf8(s, i);
ffc61ed2 8097
3568d838 8098 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8099 if (rangestart == -1)
8100 rangestart = i;
8101 } else if (rangestart != -1) {
ffc61ed2
JH
8102 if (i <= rangestart + 3)
8103 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8104 const U8 * const e = uvchr_to_utf8(s,rangestart);
8105 U8 *p;
8106 for(p = s; p < e; p++)
ffc61ed2
JH
8107 put_byte(sv, *p);
8108 }
8109 else {
2d03de9c
AL
8110 const U8 *e = uvchr_to_utf8(s,rangestart);
8111 U8 *p;
8112 for (p = s; p < e; p++)
ffc61ed2 8113 put_byte(sv, *p);
396482e1 8114 sv_catpvs(sv, "-");
2d03de9c
AL
8115 e = uvchr_to_utf8(s, i-1);
8116 for (p = s; p < e; p++)
1df70142 8117 put_byte(sv, *p);
ffc61ed2
JH
8118 }
8119 rangestart = -1;
8120 }
19860706 8121 }
ffc61ed2 8122
396482e1 8123 sv_catpvs(sv, "..."); /* et cetera */
19860706 8124 }
fde631ed 8125
ffc61ed2 8126 {
2e0de35c 8127 char *s = savesvpv(lv);
c445ea15 8128 char * const origs = s;
b81d288d 8129
3dab1dad
YO
8130 while (*s && *s != '\n')
8131 s++;
b81d288d 8132
ffc61ed2 8133 if (*s == '\n') {
2d03de9c 8134 const char * const t = ++s;
ffc61ed2
JH
8135
8136 while (*s) {
8137 if (*s == '\n')
8138 *s = ' ';
8139 s++;
8140 }
8141 if (s[-1] == ' ')
8142 s[-1] = 0;
8143
8144 sv_catpv(sv, t);
fde631ed 8145 }
b81d288d 8146
ffc61ed2 8147 Safefree(origs);
fde631ed
JH
8148 }
8149 }
653099ff 8150 }
ffc61ed2 8151
653099ff
GS
8152 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8153 }
9b155405 8154 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8155 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8156#else
96a5add6 8157 PERL_UNUSED_CONTEXT;
65e66c80
SP
8158 PERL_UNUSED_ARG(sv);
8159 PERL_UNUSED_ARG(o);
f9049ba1 8160 PERL_UNUSED_ARG(prog);
17c3b450 8161#endif /* DEBUGGING */
35ff7856 8162}
a687059c 8163
cad2e5aa
JH
8164SV *
8165Perl_re_intuit_string(pTHX_ regexp *prog)
8166{ /* Assume that RE_INTUIT is set */
97aff369 8167 dVAR;
a3621e74 8168 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8169 PERL_UNUSED_CONTEXT;
8170
a3621e74 8171 DEBUG_COMPILE_r(
cfd0369c 8172 {
2d03de9c 8173 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8174 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8175
8176 if (!PL_colorset) reginitcolors();
8177 PerlIO_printf(Perl_debug_log,
a0288114 8178 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8179 PL_colors[4],
8180 prog->check_substr ? "" : "utf8 ",
8181 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8182 s,
8183 PL_colors[1],
8184 (strlen(s) > 60 ? "..." : ""));
8185 } );
8186
33b8afdf 8187 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8188}
8189
84da74a7
YO
8190/*
8191 pregfree - free a regexp
8192
8193 See regdupe below if you change anything here.
8194*/
8195
2b69d0c2 8196void
864dbfa3 8197Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8198{
27da23d5 8199 dVAR;
0df25f3d 8200
fc32ee4a 8201 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8202
7821416a
IZ
8203 if (!r || (--r->refcnt > 0))
8204 return;
ab3bbdeb 8205 DEBUG_COMPILE_r({
0df25f3d
YO
8206 if (!PL_colorset)
8207 reginitcolors();
ab3bbdeb
YO
8208 if (RX_DEBUG(r)){
8209 SV *dsv= sv_newmortal();
8210 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8211 dsv, r->precomp, r->prelen, 60);
8212 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8213 PL_colors[4],PL_colors[5],s);
8214 }
9e55ce06 8215 });
cad2e5aa 8216
43c5f42d
NC
8217 /* gcov results gave these as non-null 100% of the time, so there's no
8218 optimisation in checking them before calling Safefree */
8219 Safefree(r->precomp);
8220 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8221 RX_MATCH_COPY_FREE(r);
f8c7b90f 8222#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8223 if (r->saved_copy)
8224 SvREFCNT_dec(r->saved_copy);
8225#endif
a193d654
GS
8226 if (r->substrs) {
8227 if (r->anchored_substr)
8228 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8229 if (r->anchored_utf8)
8230 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8231 if (r->float_substr)
8232 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8233 if (r->float_utf8)
8234 SvREFCNT_dec(r->float_utf8);
2779dcf1 8235 Safefree(r->substrs);
a193d654 8236 }
81714fb9
YO
8237 if (r->paren_names)
8238 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8239 if (r->data) {
8240 int n = r->data->count;
f3548bdc
DM
8241 PAD* new_comppad = NULL;
8242 PAD* old_comppad;
4026c95a 8243 PADOFFSET refcnt;
dfad63ad 8244
c277df42 8245 while (--n >= 0) {
261faec3 8246 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8247 switch (r->data->what[n]) {
8248 case 's':
81714fb9 8249 case 'S':
c277df42
IZ
8250 SvREFCNT_dec((SV*)r->data->data[n]);
8251 break;
653099ff
GS
8252 case 'f':
8253 Safefree(r->data->data[n]);
8254 break;
dfad63ad
HS
8255 case 'p':
8256 new_comppad = (AV*)r->data->data[n];
8257 break;
c277df42 8258 case 'o':
dfad63ad 8259 if (new_comppad == NULL)
cea2e8a9 8260 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8261 PAD_SAVE_LOCAL(old_comppad,
8262 /* Watch out for global destruction's random ordering. */
c445ea15 8263 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8264 );
b34c0dd4 8265 OP_REFCNT_LOCK;
4026c95a
SH
8266 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8267 OP_REFCNT_UNLOCK;
8268 if (!refcnt)
9b978d73 8269 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8270
f3548bdc 8271 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8272 SvREFCNT_dec((SV*)new_comppad);
8273 new_comppad = NULL;
c277df42
IZ
8274 break;
8275 case 'n':
9e55ce06 8276 break;
07be1b83 8277 case 'T':
be8e71aa
YO
8278 { /* Aho Corasick add-on structure for a trie node.
8279 Used in stclass optimization only */
07be1b83
YO
8280 U32 refcount;
8281 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8282 OP_REFCNT_LOCK;
8283 refcount = --aho->refcount;
8284 OP_REFCNT_UNLOCK;
8285 if ( !refcount ) {
8286 Safefree(aho->states);
8287 Safefree(aho->fail);
8288 aho->trie=NULL; /* not necessary to free this as it is
8289 handled by the 't' case */
8290 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8291 Safefree(r->regstclass);
07be1b83
YO
8292 }
8293 }
8294 break;
a3621e74 8295 case 't':
07be1b83 8296 {
be8e71aa 8297 /* trie structure. */
07be1b83
YO
8298 U32 refcount;
8299 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8300 OP_REFCNT_LOCK;
8301 refcount = --trie->refcount;
8302 OP_REFCNT_UNLOCK;
8303 if ( !refcount ) {
8304 Safefree(trie->charmap);
8305 if (trie->widecharmap)
8306 SvREFCNT_dec((SV*)trie->widecharmap);
8307 Safefree(trie->states);
8308 Safefree(trie->trans);
8309 if (trie->bitmap)
8310 Safefree(trie->bitmap);
8311 if (trie->wordlen)
8312 Safefree(trie->wordlen);
786e8c11
YO
8313 if (trie->jump)
8314 Safefree(trie->jump);
8315 if (trie->nextword)
8316 Safefree(trie->nextword);
a3621e74 8317#ifdef DEBUGGING
be8e71aa
YO
8318 if (RX_DEBUG(r)) {
8319 if (trie->words)
8320 SvREFCNT_dec((SV*)trie->words);
8321 if (trie->revcharmap)
8322 SvREFCNT_dec((SV*)trie->revcharmap);
8323 }
a3621e74 8324#endif
07be1b83 8325 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8326 }
07be1b83
YO
8327 }
8328 break;
c277df42 8329 default:
830247a4 8330 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8331 }
8332 }
8333 Safefree(r->data->what);
8334 Safefree(r->data);
a0d0e21e
LW
8335 }
8336 Safefree(r->startp);
8337 Safefree(r->endp);
8338 Safefree(r);
a687059c 8339}
c277df42 8340
84da74a7
YO
8341#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8342#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8343#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8344#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8345
8346/*
8347 regdupe - duplicate a regexp.
8348
8349 This routine is called by sv.c's re_dup and is expected to clone a
8350 given regexp structure. It is a no-op when not under USE_ITHREADS.
8351 (Originally this *was* re_dup() for change history see sv.c)
8352
8353 See pregfree() above if you change anything here.
8354*/
a3c0e9ca 8355#if defined(USE_ITHREADS)
84da74a7
YO
8356regexp *
8357Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8358{
84da74a7
YO
8359 dVAR;
8360 REGEXP *ret;
8361 int i, len, npar;
8362 struct reg_substr_datum *s;
8363
8364 if (!r)
8365 return (REGEXP *)NULL;
8366
8367 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8368 return ret;
8369
8370 len = r->offsets[0];
8371 npar = r->nparens+1;
8372
8373 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8374 Copy(r->program, ret->program, len+1, regnode);
8375
8376 Newx(ret->startp, npar, I32);
8377 Copy(r->startp, ret->startp, npar, I32);
8378 Newx(ret->endp, npar, I32);
8379 Copy(r->startp, ret->startp, npar, I32);
8380
8381 Newx(ret->substrs, 1, struct reg_substr_data);
8382 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8383 s->min_offset = r->substrs->data[i].min_offset;
8384 s->max_offset = r->substrs->data[i].max_offset;
8385 s->end_shift = r->substrs->data[i].end_shift;
8386 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8387 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8388 }
8389
8390 ret->regstclass = NULL;
8391 if (r->data) {
8392 struct reg_data *d;
8393 const int count = r->data->count;
8394 int i;
8395
8396 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8397 char, struct reg_data);
8398 Newx(d->what, count, U8);
8399
8400 d->count = count;
8401 for (i = 0; i < count; i++) {
8402 d->what[i] = r->data->what[i];
8403 switch (d->what[i]) {
8404 /* legal options are one of: sfpont
8405 see also regcomp.h and pregfree() */
8406 case 's':
81714fb9 8407 case 'S':
84da74a7
YO
8408 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8409 break;
8410 case 'p':
8411 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8412 break;
8413 case 'f':
8414 /* This is cheating. */
8415 Newx(d->data[i], 1, struct regnode_charclass_class);
8416 StructCopy(r->data->data[i], d->data[i],
8417 struct regnode_charclass_class);
8418 ret->regstclass = (regnode*)d->data[i];
8419 break;
8420 case 'o':
8421 /* Compiled op trees are readonly, and can thus be
8422 shared without duplication. */
8423 OP_REFCNT_LOCK;
8424 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8425 OP_REFCNT_UNLOCK;
8426 break;
8427 case 'n':
8428 d->data[i] = r->data->data[i];
8429 break;
8430 case 't':
8431 d->data[i] = r->data->data[i];
8432 OP_REFCNT_LOCK;
8433 ((reg_trie_data*)d->data[i])->refcount++;
8434 OP_REFCNT_UNLOCK;
8435 break;
8436 case 'T':
8437 d->data[i] = r->data->data[i];
8438 OP_REFCNT_LOCK;
8439 ((reg_ac_data*)d->data[i])->refcount++;
8440 OP_REFCNT_UNLOCK;
8441 /* Trie stclasses are readonly and can thus be shared
8442 * without duplication. We free the stclass in pregfree
8443 * when the corresponding reg_ac_data struct is freed.
8444 */
8445 ret->regstclass= r->regstclass;
8446 break;
8447 default:
8448 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8449 }
8450 }
8451
8452 ret->data = d;
8453 }
8454 else
8455 ret->data = NULL;
8456
8457 Newx(ret->offsets, 2*len+1, U32);
8458 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8459
8460 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8461 ret->refcnt = r->refcnt;
8462 ret->minlen = r->minlen;
8463 ret->prelen = r->prelen;
8464 ret->nparens = r->nparens;
8465 ret->lastparen = r->lastparen;
8466 ret->lastcloseparen = r->lastcloseparen;
8467 ret->reganch = r->reganch;
8468
8469 ret->sublen = r->sublen;
8470
f9f4320a 8471 ret->engine = r->engine;
81714fb9
YO
8472
8473 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8474
84da74a7
YO
8475 if (RX_MATCH_COPIED(ret))
8476 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8477 else
8478 ret->subbeg = NULL;
8479#ifdef PERL_OLD_COPY_ON_WRITE
8480 ret->saved_copy = NULL;
8481#endif
8482
8483 ptr_table_store(PL_ptr_table, r, ret);
8484 return ret;
84da74a7 8485}
a3c0e9ca 8486#endif
84da74a7 8487
76234dfb 8488#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8489/*
8490 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8491 */
8492regnode *
864dbfa3 8493Perl_regnext(pTHX_ register regnode *p)
c277df42 8494{
97aff369 8495 dVAR;
c277df42
IZ
8496 register I32 offset;
8497
3280af22 8498 if (p == &PL_regdummy)
c277df42
IZ
8499 return(NULL);
8500
8501 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8502 if (offset == 0)
8503 return(NULL);
8504
c277df42 8505 return(p+offset);
c277df42 8506}
76234dfb 8507#endif
c277df42 8508
01f988be 8509STATIC void
cea2e8a9 8510S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8511{
8512 va_list args;
8513 STRLEN l1 = strlen(pat1);
8514 STRLEN l2 = strlen(pat2);
8515 char buf[512];
06bf62c7 8516 SV *msv;
73d840c0 8517 const char *message;
c277df42
IZ
8518
8519 if (l1 > 510)
8520 l1 = 510;
8521 if (l1 + l2 > 510)
8522 l2 = 510 - l1;
8523 Copy(pat1, buf, l1 , char);
8524 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8525 buf[l1 + l2] = '\n';
8526 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8527#ifdef I_STDARG
8528 /* ANSI variant takes additional second argument */
c277df42 8529 va_start(args, pat2);
8736538c
AS
8530#else
8531 va_start(args);
8532#endif
5a844595 8533 msv = vmess(buf, &args);
c277df42 8534 va_end(args);
cfd0369c 8535 message = SvPV_const(msv,l1);
c277df42
IZ
8536 if (l1 > 512)
8537 l1 = 512;
8538 Copy(message, buf, l1 , char);
197cf9b9 8539 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8540 Perl_croak(aTHX_ "%s", buf);
c277df42 8541}
a0ed51b3
LW
8542
8543/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8544
76234dfb 8545#ifndef PERL_IN_XSUB_RE
a0ed51b3 8546void
864dbfa3 8547Perl_save_re_context(pTHX)
b81d288d 8548{
97aff369 8549 dVAR;
1ade1aa1
NC
8550
8551 struct re_save_state *state;
8552
8553 SAVEVPTR(PL_curcop);
8554 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8555
8556 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8557 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8558 SSPUSHINT(SAVEt_RE_STATE);
8559
46ab3289 8560 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8561
a0ed51b3 8562 PL_reg_start_tmp = 0;
a0ed51b3 8563 PL_reg_start_tmpl = 0;
c445ea15 8564 PL_reg_oldsaved = NULL;
a5db57d6 8565 PL_reg_oldsavedlen = 0;
a5db57d6 8566 PL_reg_maxiter = 0;
a5db57d6 8567 PL_reg_leftiter = 0;
c445ea15 8568 PL_reg_poscache = NULL;
a5db57d6 8569 PL_reg_poscache_size = 0;
1ade1aa1
NC
8570#ifdef PERL_OLD_COPY_ON_WRITE
8571 PL_nrs = NULL;
8572#endif
ada6e8a9 8573
c445ea15
AL
8574 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8575 if (PL_curpm) {
8576 const REGEXP * const rx = PM_GETRE(PL_curpm);
8577 if (rx) {
1df70142 8578 U32 i;
ada6e8a9 8579 for (i = 1; i <= rx->nparens; i++) {
1df70142 8580 char digits[TYPE_CHARS(long)];
d9fad198 8581 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8582 GV *const *const gvp
8583 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8584
b37c2d43
AL
8585 if (gvp) {
8586 GV * const gv = *gvp;
8587 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8588 save_scalar(gv);
49f27e4b 8589 }
ada6e8a9
AMS
8590 }
8591 }
8592 }
a0ed51b3 8593}
76234dfb 8594#endif
51371543 8595
51371543 8596static void
acfe0abc 8597clear_re(pTHX_ void *r)
51371543 8598{
97aff369 8599 dVAR;
51371543
GS
8600 ReREFCNT_dec((regexp *)r);
8601}
ffbc6a93 8602
a28509cc
AL
8603#ifdef DEBUGGING
8604
8605STATIC void
8606S_put_byte(pTHX_ SV *sv, int c)
8607{
8608 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8609 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8610 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8611 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8612 else
8613 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8614}
8615
786e8c11 8616
3dab1dad
YO
8617#define CLEAR_OPTSTART \
8618 if (optstart) STMT_START { \
07be1b83 8619 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8620 optstart=NULL; \
8621 } STMT_END
8622
786e8c11 8623#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8624
b5a2f8d8
NC
8625STATIC const regnode *
8626S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8627 const regnode *last, const regnode *plast,
8628 SV* sv, I32 indent, U32 depth)
a28509cc 8629{
97aff369 8630 dVAR;
786e8c11 8631 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8632 register const regnode *next;
3dab1dad
YO
8633 const regnode *optstart= NULL;
8634 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8635
786e8c11
YO
8636#ifdef DEBUG_DUMPUNTIL
8637 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8638 last ? last-start : 0,plast ? plast-start : 0);
8639#endif
8640
8641 if (plast && plast < last)
8642 last= plast;
8643
8644 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8645 /* While that wasn't END last time... */
8646
8647 NODE_ALIGN(node);
8648 op = OP(node);
8649 if (op == CLOSE)
786e8c11 8650 indent--;
b5a2f8d8 8651 next = regnext((regnode *)node);
07be1b83 8652
a28509cc 8653 /* Where, what. */
8e11feef 8654 if (OP(node) == OPTIMIZED) {
e68ec53f 8655 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8656 optstart = node;
3dab1dad 8657 else
8e11feef 8658 goto after_print;
3dab1dad
YO
8659 } else
8660 CLEAR_OPTSTART;
07be1b83 8661
32fc9b6a 8662 regprop(r, sv, node);
a28509cc 8663 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8664 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8665
8666 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8667 if (next == NULL) /* Next ptr. */
8668 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8669 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8670 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8671 else
8672 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8673
1de06328 8674 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8675 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8676 }
8677
a28509cc
AL
8678 after_print:
8679 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8680 assert(next);
8681 {
8682 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8683 ? regnext((regnode *)next)
8684 : next);
be8e71aa
YO
8685 if (last && nnode > last)
8686 nnode = last;
786e8c11 8687 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8688 }
a28509cc
AL
8689 }
8690 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8691 assert(next);
786e8c11 8692 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8693 }
8694 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 8695 const regnode *this_trie = node;
1de06328 8696 const char op = OP(node);
a28509cc 8697 const I32 n = ARG(node);
1de06328
YO
8698 const reg_ac_data * const ac = op>=AHOCORASICK ?
8699 (reg_ac_data *)r->data->data[n] :
8700 NULL;
8701 const reg_trie_data * const trie = op<AHOCORASICK ?
8702 (reg_trie_data*)r->data->data[n] :
8703 ac->trie;
786e8c11 8704 const regnode *nextbranch= NULL;
a28509cc 8705 I32 word_idx;
1de06328 8706 sv_setpvn(sv, "", 0);
786e8c11 8707 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8708 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8709
8710 PerlIO_printf(Perl_debug_log, "%*s%s ",
8711 (int)(2*(indent+3)), "",
8712 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8713 PL_colors[0], PL_colors[1],
8714 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8715 PERL_PV_PRETTY_ELIPSES |
7f69552c 8716 PERL_PV_PRETTY_LTGT
786e8c11
YO
8717 )
8718 : "???"
8719 );
8720 if (trie->jump) {
40d049e4 8721 U16 dist= trie->jump[word_idx+1];
7f69552c 8722 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 8723 (dist ? this_trie + dist : next) - start);
786e8c11
YO
8724 if (dist) {
8725 if (!nextbranch)
24b23f37 8726 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
8727 DUMPUNTIL(this_trie + dist, nextbranch);
8728 }
786e8c11
YO
8729 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8730 nextbranch= regnext((regnode *)nextbranch);
8731 } else {
8732 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8733 }
786e8c11
YO
8734 }
8735 if (last && next > last)
8736 node= last;
8737 else
8738 node= next;
a28509cc 8739 }
786e8c11
YO
8740 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8741 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8742 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8743 }
8744 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8745 assert(next);
786e8c11 8746 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8747 }
8748 else if ( op == PLUS || op == STAR) {
786e8c11 8749 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8750 }
8751 else if (op == ANYOF) {
8752 /* arglen 1 + class block */
8753 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8754 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8755 node = NEXTOPER(node);
8756 }
8757 else if (PL_regkind[(U8)op] == EXACT) {
8758 /* Literal string, where present. */
8759 node += NODE_SZ_STR(node) - 1;
8760 node = NEXTOPER(node);
8761 }
8762 else {
8763 node = NEXTOPER(node);
8764 node += regarglen[(U8)op];
8765 }
8766 if (op == CURLYX || op == OPEN)
786e8c11 8767 indent++;
a28509cc 8768 else if (op == WHILEM)
786e8c11 8769 indent--;
a28509cc 8770 }
3dab1dad 8771 CLEAR_OPTSTART;
786e8c11
YO
8772#ifdef DEBUG_DUMPUNTIL
8773 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8774#endif
1de06328 8775 return node;
a28509cc
AL
8776}
8777
8778#endif /* DEBUGGING */
8779
241d1a3b
NC
8780/*
8781 * Local variables:
8782 * c-indentation-style: bsd
8783 * c-basic-offset: 4
8784 * indent-tabs-mode: t
8785 * End:
8786 *
37442d52
RGS
8787 * ex: set ts=8 sts=4 sw=4 noet:
8788 */