This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[patch at 21983] factor out $^O eq 'darwin' from XSLoader.pm
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
b94e2f88 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4
IZ
104 char *precomp; /* uncompiled string. */
105 regexp *rx;
fac92740 106 char *start; /* Start of input for compile */
830247a4
IZ
107 char *end; /* End of input for compile */
108 char *parse; /* Input-scan pointer. */
109 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 110 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 111 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
112 I32 naughty; /* How bad is this pattern? */
113 I32 sawback; /* Did we see \1, ...? */
114 U32 seen;
115 I32 size; /* Code size. */
116 I32 npar; /* () count. */
117 I32 extralen;
118 I32 seen_zerolen;
119 I32 seen_evals;
40d049e4
YO
120 regnode **open_parens; /* pointers to open parens */
121 regnode **close_parens; /* pointers to close parens */
122 regnode *opend; /* END node in program */
1aa99e6b 123 I32 utf8;
6bda09f9 124 HV *charnames; /* cache of named sequences */
81714fb9 125 HV *paren_names; /* Paren names */
40d049e4
YO
126 regnode **recurse; /* Recurse regops */
127 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
128#if ADD_TO_REGEXEC
129 char *starttry; /* -Dr: where regtry was called. */
130#define RExC_starttry (pRExC_state->starttry)
131#endif
3dab1dad 132#ifdef DEBUGGING
be8e71aa 133 const char *lastparse;
3dab1dad
YO
134 I32 lastnum;
135#define RExC_lastparse (pRExC_state->lastparse)
136#define RExC_lastnum (pRExC_state->lastnum)
137#endif
830247a4
IZ
138} RExC_state_t;
139
e2509266 140#define RExC_flags (pRExC_state->flags)
830247a4
IZ
141#define RExC_precomp (pRExC_state->precomp)
142#define RExC_rx (pRExC_state->rx)
fac92740 143#define RExC_start (pRExC_state->start)
830247a4
IZ
144#define RExC_end (pRExC_state->end)
145#define RExC_parse (pRExC_state->parse)
146#define RExC_whilem_seen (pRExC_state->whilem_seen)
fac92740 147#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
830247a4 148#define RExC_emit (pRExC_state->emit)
fac92740 149#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
150#define RExC_naughty (pRExC_state->naughty)
151#define RExC_sawback (pRExC_state->sawback)
152#define RExC_seen (pRExC_state->seen)
153#define RExC_size (pRExC_state->size)
154#define RExC_npar (pRExC_state->npar)
155#define RExC_extralen (pRExC_state->extralen)
156#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
157#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 158#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 159#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
160#define RExC_open_parens (pRExC_state->open_parens)
161#define RExC_close_parens (pRExC_state->close_parens)
162#define RExC_opend (pRExC_state->opend)
81714fb9 163#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
164#define RExC_recurse (pRExC_state->recurse)
165#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 166
a687059c
LW
167#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
168#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
169 ((*s) == '{' && regcurly(s)))
a687059c 170
35c8bce7
LW
171#ifdef SPSTART
172#undef SPSTART /* dratted cpp namespace... */
173#endif
a687059c
LW
174/*
175 * Flags to be passed up and down.
176 */
a687059c 177#define WORST 0 /* Worst case. */
821b33a5 178#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
179#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
180#define SPSTART 0x4 /* Starts with * or +. */
181#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 182
3dab1dad
YO
183#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
184
07be1b83
YO
185/* whether trie related optimizations are enabled */
186#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
187#define TRIE_STUDY_OPT
786e8c11 188#define FULL_TRIE_STUDY
07be1b83
YO
189#define TRIE_STCLASS
190#endif
1de06328
YO
191
192
40d049e4
YO
193
194#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
195#define PBITVAL(paren) (1 << ((paren) & 7))
196#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
197#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
198#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
199
200
1de06328
YO
201/* About scan_data_t.
202
203 During optimisation we recurse through the regexp program performing
204 various inplace (keyhole style) optimisations. In addition study_chunk
205 and scan_commit populate this data structure with information about
206 what strings MUST appear in the pattern. We look for the longest
207 string that must appear for at a fixed location, and we look for the
208 longest string that may appear at a floating location. So for instance
209 in the pattern:
210
211 /FOO[xX]A.*B[xX]BAR/
212
213 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
214 strings (because they follow a .* construct). study_chunk will identify
215 both FOO and BAR as being the longest fixed and floating strings respectively.
216
217 The strings can be composites, for instance
218
219 /(f)(o)(o)/
220
221 will result in a composite fixed substring 'foo'.
222
223 For each string some basic information is maintained:
224
225 - offset or min_offset
226 This is the position the string must appear at, or not before.
227 It also implicitly (when combined with minlenp) tells us how many
228 character must match before the string we are searching.
229 Likewise when combined with minlenp and the length of the string
230 tells us how many characters must appear after the string we have
231 found.
232
233 - max_offset
234 Only used for floating strings. This is the rightmost point that
235 the string can appear at. Ifset to I32 max it indicates that the
236 string can occur infinitely far to the right.
237
238 - minlenp
239 A pointer to the minimum length of the pattern that the string
240 was found inside. This is important as in the case of positive
241 lookahead or positive lookbehind we can have multiple patterns
242 involved. Consider
243
244 /(?=FOO).*F/
245
246 The minimum length of the pattern overall is 3, the minimum length
247 of the lookahead part is 3, but the minimum length of the part that
248 will actually match is 1. So 'FOO's minimum length is 3, but the
249 minimum length for the F is 1. This is important as the minimum length
250 is used to determine offsets in front of and behind the string being
251 looked for. Since strings can be composites this is the length of the
252 pattern at the time it was commited with a scan_commit. Note that
253 the length is calculated by study_chunk, so that the minimum lengths
254 are not known until the full pattern has been compiled, thus the
255 pointer to the value.
256
257 - lookbehind
258
259 In the case of lookbehind the string being searched for can be
260 offset past the start point of the final matching string.
261 If this value was just blithely removed from the min_offset it would
262 invalidate some of the calculations for how many chars must match
263 before or after (as they are derived from min_offset and minlen and
264 the length of the string being searched for).
265 When the final pattern is compiled and the data is moved from the
266 scan_data_t structure into the regexp structure the information
267 about lookbehind is factored in, with the information that would
268 have been lost precalculated in the end_shift field for the
269 associated string.
270
271 The fields pos_min and pos_delta are used to store the minimum offset
272 and the delta to the maximum offset at the current point in the pattern.
273
274*/
2c2d71f5
JH
275
276typedef struct scan_data_t {
1de06328
YO
277 /*I32 len_min; unused */
278 /*I32 len_delta; unused */
2c2d71f5
JH
279 I32 pos_min;
280 I32 pos_delta;
281 SV *last_found;
1de06328 282 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
283 I32 last_start_min;
284 I32 last_start_max;
1de06328
YO
285 SV **longest; /* Either &l_fixed, or &l_float. */
286 SV *longest_fixed; /* longest fixed string found in pattern */
287 I32 offset_fixed; /* offset where it starts */
288 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
289 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
290 SV *longest_float; /* longest floating string found in pattern */
291 I32 offset_float_min; /* earliest point in string it can appear */
292 I32 offset_float_max; /* latest point in string it can appear */
293 I32 *minlen_float; /* pointer to the minlen relevent to the string */
294 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
295 I32 flags;
296 I32 whilem_c;
cb434fcc 297 I32 *last_closep;
653099ff 298 struct regnode_charclass_class *start_class;
2c2d71f5
JH
299} scan_data_t;
300
a687059c 301/*
e50aee73 302 * Forward declarations for pregcomp()'s friends.
a687059c 303 */
a0d0e21e 304
27da23d5 305static const scan_data_t zero_scan_data =
1de06328 306 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
307
308#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
309#define SF_BEFORE_SEOL 0x0001
310#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
311#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
312#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
313
09b7f37c
CB
314#ifdef NO_UNARY_PLUS
315# define SF_FIX_SHIFT_EOL (0+2)
316# define SF_FL_SHIFT_EOL (0+4)
317#else
318# define SF_FIX_SHIFT_EOL (+2)
319# define SF_FL_SHIFT_EOL (+4)
320#endif
c277df42
IZ
321
322#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
323#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
324
325#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
326#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
327#define SF_IS_INF 0x0040
328#define SF_HAS_PAR 0x0080
329#define SF_IN_PAR 0x0100
330#define SF_HAS_EVAL 0x0200
331#define SCF_DO_SUBSTR 0x0400
653099ff
GS
332#define SCF_DO_STCLASS_AND 0x0800
333#define SCF_DO_STCLASS_OR 0x1000
334#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 335#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 336
786e8c11
YO
337#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
338
07be1b83 339
eb160463 340#define UTF (RExC_utf8 != 0)
e2509266
JH
341#define LOC ((RExC_flags & PMf_LOCALE) != 0)
342#define FOLD ((RExC_flags & PMf_FOLD) != 0)
a0ed51b3 343
ffc61ed2 344#define OOB_UNICODE 12345678
93733859 345#define OOB_NAMEDCLASS -1
b8c5462f 346
a0ed51b3
LW
347#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
348#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
349
8615cb43 350
b45f050a
JF
351/* length of regex to show in messages that don't mark a position within */
352#define RegexLengthToShowInErrorMessages 127
353
354/*
355 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
356 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
357 * op/pragma/warn/regcomp.
358 */
7253e4e3
RK
359#define MARKER1 "<-- HERE" /* marker as it appears in the description */
360#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 361
7253e4e3 362#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
363
364/*
365 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
366 * arg. Show regex, up to a maximum length. If it's too long, chop and add
367 * "...".
368 */
ccb2c380 369#define FAIL(msg) STMT_START { \
bfed75c6 370 const char *ellipses = ""; \
ccb2c380
MP
371 IV len = RExC_end - RExC_precomp; \
372 \
373 if (!SIZE_ONLY) \
374 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
375 if (len > RegexLengthToShowInErrorMessages) { \
376 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
377 len = RegexLengthToShowInErrorMessages - 10; \
378 ellipses = "..."; \
379 } \
380 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
381 msg, (int)len, RExC_precomp, ellipses); \
382} STMT_END
8615cb43 383
b45f050a 384/*
b45f050a
JF
385 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
386 */
ccb2c380 387#define Simple_vFAIL(m) STMT_START { \
a28509cc 388 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
389 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
391} STMT_END
b45f050a
JF
392
393/*
394 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
395 */
ccb2c380
MP
396#define vFAIL(m) STMT_START { \
397 if (!SIZE_ONLY) \
398 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
399 Simple_vFAIL(m); \
400} STMT_END
b45f050a
JF
401
402/*
403 * Like Simple_vFAIL(), but accepts two arguments.
404 */
ccb2c380 405#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
408 (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
413 */
ccb2c380
MP
414#define vFAIL2(m,a1) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL2(m, a1); \
418} STMT_END
b45f050a
JF
419
420
421/*
422 * Like Simple_vFAIL(), but accepts three arguments.
423 */
ccb2c380 424#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
432 */
ccb2c380
MP
433#define vFAIL3(m,a1,a2) STMT_START { \
434 if (!SIZE_ONLY) \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
436 Simple_vFAIL3(m, a1, a2); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Like Simple_vFAIL(), but accepts four arguments.
441 */
ccb2c380 442#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a 447
ccb2c380 448#define vWARN(loc,m) STMT_START { \
a28509cc 449 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
450 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
451 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
452} STMT_END
453
454#define vWARNdep(loc,m) STMT_START { \
a28509cc 455 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
456 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
457 "%s" REPORT_LOCATION, \
458 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
459} STMT_END
460
461
462#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 463 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
464 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
465 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
466} STMT_END
467
468#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 469 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
470 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
471 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
472} STMT_END
473
474#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 475 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
476 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
477 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
9d1d55b5 485
8615cb43 486
cd439c50 487/* Allow for side effects in s */
ccb2c380
MP
488#define REGC(c,s) STMT_START { \
489 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
490} STMT_END
cd439c50 491
fac92740
MJD
492/* Macros for recording node offsets. 20001227 mjd@plover.com
493 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
494 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
495 * Element 0 holds the number n.
07be1b83 496 * Position is 1 indexed.
fac92740
MJD
497 */
498
ccb2c380
MP
499#define Set_Node_Offset_To_R(node,byte) STMT_START { \
500 if (! SIZE_ONLY) { \
501 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
07be1b83 502 __LINE__, (node), (int)(byte))); \
ccb2c380 503 if((node) < 0) { \
551405c4 504 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
505 } else { \
506 RExC_offsets[2*(node)-1] = (byte); \
507 } \
508 } \
509} STMT_END
510
511#define Set_Node_Offset(node,byte) \
512 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
513#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
514
515#define Set_Node_Length_To_R(node,len) STMT_START { \
516 if (! SIZE_ONLY) { \
517 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 518 __LINE__, (int)(node), (int)(len))); \
ccb2c380 519 if((node) < 0) { \
551405c4 520 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
521 } else { \
522 RExC_offsets[2*(node)] = (len); \
523 } \
524 } \
525} STMT_END
526
527#define Set_Node_Length(node,len) \
528 Set_Node_Length_To_R((node)-RExC_emit_start, len)
529#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
530#define Set_Node_Cur_Length(node) \
531 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
532
533/* Get offsets and lengths */
534#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
535#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
536
07be1b83
YO
537#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
538 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
539 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
540} STMT_END
541
542
543#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
544#define EXPERIMENTAL_INPLACESCAN
545#endif
546
1de06328 547#define DEBUG_STUDYDATA(data,depth) \
a5ca303d 548DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328
YO
549 PerlIO_printf(Perl_debug_log, \
550 "%*s"/* Len:%"IVdf"/%"IVdf" */" Pos:%"IVdf"/%"IVdf \
551 " Flags: %"IVdf" Whilem_c: %"IVdf" Lcp: %"IVdf" ", \
552 (int)(depth)*2, "", \
553 (IV)((data)->pos_min), \
554 (IV)((data)->pos_delta), \
555 (IV)((data)->flags), \
556 (IV)((data)->whilem_c), \
557 (IV)((data)->last_closep ? *((data)->last_closep) : -1) \
558 ); \
559 if ((data)->last_found) \
560 PerlIO_printf(Perl_debug_log, \
561 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
562 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
563 SvPVX_const((data)->last_found), \
564 (IV)((data)->last_end), \
565 (IV)((data)->last_start_min), \
566 (IV)((data)->last_start_max), \
567 ((data)->longest && \
568 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
569 SvPVX_const((data)->longest_fixed), \
570 (IV)((data)->offset_fixed), \
571 ((data)->longest && \
572 (data)->longest==&((data)->longest_float)) ? "*" : "", \
573 SvPVX_const((data)->longest_float), \
574 (IV)((data)->offset_float_min), \
575 (IV)((data)->offset_float_max) \
576 ); \
577 PerlIO_printf(Perl_debug_log,"\n"); \
578});
579
acfe0abc 580static void clear_re(pTHX_ void *r);
4327152a 581
653099ff 582/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 583 Update the longest found anchored substring and the longest found
653099ff
GS
584 floating substrings if needed. */
585
4327152a 586STATIC void
1de06328 587S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp)
c277df42 588{
e1ec3a88
AL
589 const STRLEN l = CHR_SVLEN(data->last_found);
590 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 591 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 592
c277df42 593 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 594 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
595 if (*data->longest == data->longest_fixed) {
596 data->offset_fixed = l ? data->last_start_min : data->pos_min;
597 if (data->flags & SF_BEFORE_EOL)
b81d288d 598 data->flags
c277df42
IZ
599 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
600 else
601 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
602 data->minlen_fixed=minlenp;
603 data->lookbehind_fixed=0;
a0ed51b3
LW
604 }
605 else {
c277df42 606 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
607 data->offset_float_max = (l
608 ? data->last_start_max
c277df42 609 : data->pos_min + data->pos_delta);
9051bda5
HS
610 if ((U32)data->offset_float_max > (U32)I32_MAX)
611 data->offset_float_max = I32_MAX;
c277df42 612 if (data->flags & SF_BEFORE_EOL)
b81d288d 613 data->flags
c277df42
IZ
614 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
615 else
616 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
617 data->minlen_float=minlenp;
618 data->lookbehind_float=0;
c277df42
IZ
619 }
620 }
621 SvCUR_set(data->last_found, 0);
0eda9292 622 {
a28509cc 623 SV * const sv = data->last_found;
097eb12c
AL
624 if (SvUTF8(sv) && SvMAGICAL(sv)) {
625 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
626 if (mg)
627 mg->mg_len = 0;
628 }
0eda9292 629 }
c277df42
IZ
630 data->last_end = -1;
631 data->flags &= ~SF_BEFORE_EOL;
1de06328 632 DEBUG_STUDYDATA(data,0);
c277df42
IZ
633}
634
653099ff
GS
635/* Can match anything (initialization) */
636STATIC void
097eb12c 637S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 638{
653099ff 639 ANYOF_CLASS_ZERO(cl);
f8bef550 640 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 641 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
642 if (LOC)
643 cl->flags |= ANYOF_LOCALE;
644}
645
646/* Can match anything (initialization) */
647STATIC int
5f66b61c 648S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
649{
650 int value;
651
aaa51d5e 652 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
653 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
654 return 1;
1aa99e6b
IH
655 if (!(cl->flags & ANYOF_UNICODE_ALL))
656 return 0;
10edeb5d 657 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 658 return 0;
653099ff
GS
659 return 1;
660}
661
662/* Can match anything (initialization) */
663STATIC void
097eb12c 664S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 665{
8ecf7187 666 Zero(cl, 1, struct regnode_charclass_class);
653099ff 667 cl->type = ANYOF;
830247a4 668 cl_anything(pRExC_state, cl);
653099ff
GS
669}
670
671STATIC void
097eb12c 672S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 673{
8ecf7187 674 Zero(cl, 1, struct regnode_charclass_class);
653099ff 675 cl->type = ANYOF;
830247a4 676 cl_anything(pRExC_state, cl);
653099ff
GS
677 if (LOC)
678 cl->flags |= ANYOF_LOCALE;
679}
680
681/* 'And' a given class with another one. Can create false positives */
682/* We assume that cl is not inverted */
683STATIC void
5f66b61c 684S_cl_and(struct regnode_charclass_class *cl,
a28509cc 685 const struct regnode_charclass_class *and_with)
653099ff 686{
40d049e4
YO
687
688 assert(and_with->type == ANYOF);
653099ff
GS
689 if (!(and_with->flags & ANYOF_CLASS)
690 && !(cl->flags & ANYOF_CLASS)
691 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
692 && !(and_with->flags & ANYOF_FOLD)
693 && !(cl->flags & ANYOF_FOLD)) {
694 int i;
695
696 if (and_with->flags & ANYOF_INVERT)
697 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
698 cl->bitmap[i] &= ~and_with->bitmap[i];
699 else
700 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
701 cl->bitmap[i] &= and_with->bitmap[i];
702 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
703 if (!(and_with->flags & ANYOF_EOS))
704 cl->flags &= ~ANYOF_EOS;
1aa99e6b 705
14ebb1a2
JH
706 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
707 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
708 cl->flags &= ~ANYOF_UNICODE_ALL;
709 cl->flags |= ANYOF_UNICODE;
710 ARG_SET(cl, ARG(and_with));
711 }
14ebb1a2
JH
712 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
713 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 714 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
715 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
716 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 717 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
718}
719
720/* 'OR' a given class with another one. Can create false positives */
721/* We assume that cl is not inverted */
722STATIC void
097eb12c 723S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 724{
653099ff
GS
725 if (or_with->flags & ANYOF_INVERT) {
726 /* We do not use
727 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
728 * <= (B1 | !B2) | (CL1 | !CL2)
729 * which is wasteful if CL2 is small, but we ignore CL2:
730 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
731 * XXXX Can we handle case-fold? Unclear:
732 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
733 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
734 */
735 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
736 && !(or_with->flags & ANYOF_FOLD)
737 && !(cl->flags & ANYOF_FOLD) ) {
738 int i;
739
740 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
741 cl->bitmap[i] |= ~or_with->bitmap[i];
742 } /* XXXX: logic is complicated otherwise */
743 else {
830247a4 744 cl_anything(pRExC_state, cl);
653099ff
GS
745 }
746 } else {
747 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
748 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 749 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
750 || (cl->flags & ANYOF_FOLD)) ) {
751 int i;
752
753 /* OR char bitmap and class bitmap separately */
754 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
755 cl->bitmap[i] |= or_with->bitmap[i];
756 if (or_with->flags & ANYOF_CLASS) {
757 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
758 cl->classflags[i] |= or_with->classflags[i];
759 cl->flags |= ANYOF_CLASS;
760 }
761 }
762 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 763 cl_anything(pRExC_state, cl);
653099ff
GS
764 }
765 }
766 if (or_with->flags & ANYOF_EOS)
767 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
768
769 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
770 ARG(cl) != ARG(or_with)) {
771 cl->flags |= ANYOF_UNICODE_ALL;
772 cl->flags &= ~ANYOF_UNICODE;
773 }
774 if (or_with->flags & ANYOF_UNICODE_ALL) {
775 cl->flags |= ANYOF_UNICODE_ALL;
776 cl->flags &= ~ANYOF_UNICODE;
777 }
653099ff
GS
778}
779
a3621e74
YO
780#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
781#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
782#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
783#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
784
3dab1dad
YO
785
786#ifdef DEBUGGING
07be1b83 787/*
3dab1dad
YO
788 dump_trie(trie)
789 dump_trie_interim_list(trie,next_alloc)
790 dump_trie_interim_table(trie,next_alloc)
791
792 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
793 The _interim_ variants are used for debugging the interim
794 tables that are used to generate the final compressed
795 representation which is what dump_trie expects.
796
3dab1dad
YO
797 Part of the reason for their existance is to provide a form
798 of documentation as to how the different representations function.
07be1b83
YO
799
800*/
3dab1dad
YO
801
802/*
803 dump_trie(trie)
804 Dumps the final compressed table form of the trie to Perl_debug_log.
805 Used for debugging make_trie().
806*/
807
808STATIC void
809S_dump_trie(pTHX_ const struct _reg_trie_data *trie,U32 depth)
810{
811 U32 state;
ab3bbdeb
YO
812 SV *sv=sv_newmortal();
813 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
814 GET_RE_DEBUG_FLAGS_DECL;
815
ab3bbdeb 816
3dab1dad
YO
817 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
818 (int)depth * 2 + 2,"",
819 "Match","Base","Ofs" );
820
821 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
be8e71aa 822 SV ** const tmp = av_fetch( trie->revcharmap, state, 0);
3dab1dad 823 if ( tmp ) {
ab3bbdeb
YO
824 PerlIO_printf( Perl_debug_log, "%*s",
825 colwidth,
ddc5bc0f 826 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
827 PL_colors[0], PL_colors[1],
828 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
829 PERL_PV_ESCAPE_FIRSTCHAR
830 )
831 );
3dab1dad
YO
832 }
833 }
834 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
835 (int)depth * 2 + 2,"");
836
837 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 838 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
839 PerlIO_printf( Perl_debug_log, "\n");
840
1e2e3d02 841 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 842 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
843
844 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
845
846 if ( trie->states[ state ].wordnum ) {
847 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
848 } else {
849 PerlIO_printf( Perl_debug_log, "%6s", "" );
850 }
851
852 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
853
854 if ( base ) {
855 U32 ofs = 0;
856
857 while( ( base + ofs < trie->uniquecharcount ) ||
858 ( base + ofs - trie->uniquecharcount < trie->lasttrans
859 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
860 ofs++;
861
862 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
863
864 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
865 if ( ( base + ofs >= trie->uniquecharcount ) &&
866 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
867 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
868 {
ab3bbdeb
YO
869 PerlIO_printf( Perl_debug_log, "%*"UVXf,
870 colwidth,
3dab1dad
YO
871 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
872 } else {
ab3bbdeb 873 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
874 }
875 }
876
877 PerlIO_printf( Perl_debug_log, "]");
878
879 }
880 PerlIO_printf( Perl_debug_log, "\n" );
881 }
882}
883/*
884 dump_trie_interim_list(trie,next_alloc)
885 Dumps a fully constructed but uncompressed trie in list form.
886 List tries normally only are used for construction when the number of
887 possible chars (trie->uniquecharcount) is very high.
888 Used for debugging make_trie().
889*/
890STATIC void
891S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc,U32 depth)
892{
893 U32 state;
ab3bbdeb
YO
894 SV *sv=sv_newmortal();
895 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
896 GET_RE_DEBUG_FLAGS_DECL;
897 /* print out the table precompression. */
ab3bbdeb
YO
898 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
899 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
900 "------:-----+-----------------\n" );
3dab1dad
YO
901
902 for( state=1 ; state < next_alloc ; state ++ ) {
903 U16 charid;
904
ab3bbdeb 905 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
906 (int)depth * 2 + 2,"", (UV)state );
907 if ( ! trie->states[ state ].wordnum ) {
908 PerlIO_printf( Perl_debug_log, "%5s| ","");
909 } else {
910 PerlIO_printf( Perl_debug_log, "W%4x| ",
911 trie->states[ state ].wordnum
912 );
913 }
914 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
be8e71aa 915 SV ** const tmp = av_fetch( trie->revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
916 if ( tmp ) {
917 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
918 colwidth,
ddc5bc0f 919 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
920 PL_colors[0], PL_colors[1],
921 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
922 PERL_PV_ESCAPE_FIRSTCHAR
923 ) ,
1e2e3d02
YO
924 TRIE_LIST_ITEM(state,charid).forid,
925 (UV)TRIE_LIST_ITEM(state,charid).newstate
926 );
927 if (!(charid % 10))
664e119d
RGS
928 PerlIO_printf(Perl_debug_log, "\n%*s| ",
929 (int)((depth * 2) + 14), "");
1e2e3d02 930 }
ab3bbdeb
YO
931 }
932 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
933 }
934}
935
936/*
937 dump_trie_interim_table(trie,next_alloc)
938 Dumps a fully constructed but uncompressed trie in table form.
939 This is the normal DFA style state transition table, with a few
940 twists to facilitate compression later.
941 Used for debugging make_trie().
942*/
943STATIC void
944S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, U32 next_alloc, U32 depth)
945{
946 U32 state;
947 U16 charid;
ab3bbdeb
YO
948 SV *sv=sv_newmortal();
949 int colwidth= trie->widecharmap ? 6 : 4;
3dab1dad
YO
950 GET_RE_DEBUG_FLAGS_DECL;
951
952 /*
953 print out the table precompression so that we can do a visual check
954 that they are identical.
955 */
956
957 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
958
959 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
be8e71aa 960 SV ** const tmp = av_fetch( trie->revcharmap, charid, 0);
3dab1dad 961 if ( tmp ) {
ab3bbdeb
YO
962 PerlIO_printf( Perl_debug_log, "%*s",
963 colwidth,
ddc5bc0f 964 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
965 PL_colors[0], PL_colors[1],
966 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
967 PERL_PV_ESCAPE_FIRSTCHAR
968 )
969 );
3dab1dad
YO
970 }
971 }
972
973 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
974
975 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 976 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
977 }
978
979 PerlIO_printf( Perl_debug_log, "\n" );
980
981 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
982
983 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
984 (int)depth * 2 + 2,"",
985 (UV)TRIE_NODENUM( state ) );
986
987 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
988 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
989 if (v)
990 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
991 else
992 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
993 }
994 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
995 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
996 } else {
997 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
998 trie->states[ TRIE_NODENUM( state ) ].wordnum );
999 }
1000 }
07be1b83 1001}
3dab1dad
YO
1002
1003#endif
1004
786e8c11
YO
1005/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1006 startbranch: the first branch in the whole branch sequence
1007 first : start branch of sequence of branch-exact nodes.
1008 May be the same as startbranch
1009 last : Thing following the last branch.
1010 May be the same as tail.
1011 tail : item following the branch sequence
1012 count : words in the sequence
1013 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1014 depth : indent depth
3dab1dad 1015
786e8c11 1016Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1017
786e8c11
YO
1018A trie is an N'ary tree where the branches are determined by digital
1019decomposition of the key. IE, at the root node you look up the 1st character and
1020follow that branch repeat until you find the end of the branches. Nodes can be
1021marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1022
786e8c11 1023 /he|she|his|hers/
72f13be8 1024
786e8c11
YO
1025would convert into the following structure. Numbers represent states, letters
1026following numbers represent valid transitions on the letter from that state, if
1027the number is in square brackets it represents an accepting state, otherwise it
1028will be in parenthesis.
07be1b83 1029
786e8c11
YO
1030 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1031 | |
1032 | (2)
1033 | |
1034 (1) +-i->(6)-+-s->[7]
1035 |
1036 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1037
786e8c11
YO
1038 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1039
1040This shows that when matching against the string 'hers' we will begin at state 1
1041read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1042then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1043is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1044single traverse. We store a mapping from accepting to state to which word was
1045matched, and then when we have multiple possibilities we try to complete the
1046rest of the regex in the order in which they occured in the alternation.
1047
1048The only prior NFA like behaviour that would be changed by the TRIE support is
1049the silent ignoring of duplicate alternations which are of the form:
1050
1051 / (DUPE|DUPE) X? (?{ ... }) Y /x
1052
1053Thus EVAL blocks follwing a trie may be called a different number of times with
1054and without the optimisation. With the optimisations dupes will be silently
1055ignored. This inconsistant behaviour of EVAL type nodes is well established as
1056the following demonstrates:
1057
1058 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1059
1060which prints out 'word' three times, but
1061
1062 'words'=~/(word|word|word)(?{ print $1 })S/
1063
1064which doesnt print it out at all. This is due to other optimisations kicking in.
1065
1066Example of what happens on a structural level:
1067
1068The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1069
1070 1: CURLYM[1] {1,32767}(18)
1071 5: BRANCH(8)
1072 6: EXACT <ac>(16)
1073 8: BRANCH(11)
1074 9: EXACT <ad>(16)
1075 11: BRANCH(14)
1076 12: EXACT <ab>(16)
1077 16: SUCCEED(0)
1078 17: NOTHING(18)
1079 18: END(0)
1080
1081This would be optimizable with startbranch=5, first=5, last=16, tail=16
1082and should turn into:
1083
1084 1: CURLYM[1] {1,32767}(18)
1085 5: TRIE(16)
1086 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1087 <ac>
1088 <ad>
1089 <ab>
1090 16: SUCCEED(0)
1091 17: NOTHING(18)
1092 18: END(0)
1093
1094Cases where tail != last would be like /(?foo|bar)baz/:
1095
1096 1: BRANCH(4)
1097 2: EXACT <foo>(8)
1098 4: BRANCH(7)
1099 5: EXACT <bar>(8)
1100 7: TAIL(8)
1101 8: EXACT <baz>(10)
1102 10: END(0)
1103
1104which would be optimizable with startbranch=1, first=1, last=7, tail=8
1105and would end up looking like:
1106
1107 1: TRIE(8)
1108 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1109 <foo>
1110 <bar>
1111 7: TAIL(8)
1112 8: EXACT <baz>(10)
1113 10: END(0)
1114
1115 d = uvuni_to_utf8_flags(d, uv, 0);
1116
1117is the recommended Unicode-aware way of saying
1118
1119 *(d++) = uv;
1120*/
1121
1e2e3d02 1122#define TRIE_STORE_REVCHAR \
786e8c11 1123 STMT_START { \
1e2e3d02 1124 SV *tmp = newSVpvs(""); \
786e8c11 1125 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1126 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
786e8c11
YO
1127 av_push( TRIE_REVCHARMAP(trie), tmp ); \
1128 } STMT_END
1129
1130#define TRIE_READ_CHAR STMT_START { \
1131 wordlen++; \
1132 if ( UTF ) { \
1133 if ( folder ) { \
1134 if ( foldlen > 0 ) { \
1135 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1136 foldlen -= len; \
1137 scan += len; \
1138 len = 0; \
1139 } else { \
1140 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1141 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1142 foldlen -= UNISKIP( uvc ); \
1143 scan = foldbuf + UNISKIP( uvc ); \
1144 } \
1145 } else { \
1146 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1147 } \
1148 } else { \
1149 uvc = (U32)*uc; \
1150 len = 1; \
1151 } \
1152} STMT_END
1153
1154
1155
1156#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1157 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1158 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1159 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1160 } \
1161 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1162 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1163 TRIE_LIST_CUR( state )++; \
1164} STMT_END
07be1b83 1165
786e8c11
YO
1166#define TRIE_LIST_NEW(state) STMT_START { \
1167 Newxz( trie->states[ state ].trans.list, \
1168 4, reg_trie_trans_le ); \
1169 TRIE_LIST_CUR( state ) = 1; \
1170 TRIE_LIST_LEN( state ) = 4; \
1171} STMT_END
07be1b83 1172
786e8c11
YO
1173#define TRIE_HANDLE_WORD(state) STMT_START { \
1174 U16 dupe= trie->states[ state ].wordnum; \
1175 regnode * const noper_next = regnext( noper ); \
1176 \
1177 if (trie->wordlen) \
1178 trie->wordlen[ curword ] = wordlen; \
1179 DEBUG_r({ \
1180 /* store the word for dumping */ \
1181 SV* tmp; \
1182 if (OP(noper) != NOTHING) \
1183 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1184 else \
1185 tmp = newSVpvn( "", 0 ); \
1186 if ( UTF ) SvUTF8_on( tmp ); \
1187 av_push( trie->words, tmp ); \
1188 }); \
1189 \
1190 curword++; \
1191 \
1192 if ( noper_next < tail ) { \
1193 if (!trie->jump) \
1194 Newxz( trie->jump, word_count + 1, U16); \
7f69552c 1195 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1196 if (!jumper) \
1197 jumper = noper_next; \
1198 if (!nextbranch) \
1199 nextbranch= regnext(cur); \
1200 } \
1201 \
1202 if ( dupe ) { \
1203 /* So it's a dupe. This means we need to maintain a */\
1204 /* linked-list from the first to the next. */\
1205 /* we only allocate the nextword buffer when there */\
1206 /* a dupe, so first time we have to do the allocation */\
1207 if (!trie->nextword) \
1208 Newxz( trie->nextword, word_count + 1, U16); \
1209 while ( trie->nextword[dupe] ) \
1210 dupe= trie->nextword[dupe]; \
1211 trie->nextword[dupe]= curword; \
1212 } else { \
1213 /* we haven't inserted this word yet. */ \
1214 trie->states[ state ].wordnum = curword; \
1215 } \
1216} STMT_END
07be1b83 1217
3dab1dad 1218
786e8c11
YO
1219#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1220 ( ( base + charid >= ucharcount \
1221 && base + charid < ubound \
1222 && state == trie->trans[ base - ucharcount + charid ].check \
1223 && trie->trans[ base - ucharcount + charid ].next ) \
1224 ? trie->trans[ base - ucharcount + charid ].next \
1225 : ( state==1 ? special : 0 ) \
1226 )
3dab1dad 1227
786e8c11
YO
1228#define MADE_TRIE 1
1229#define MADE_JUMP_TRIE 2
1230#define MADE_EXACT_TRIE 4
3dab1dad 1231
a3621e74 1232STATIC I32
786e8c11 1233S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1234{
27da23d5 1235 dVAR;
a3621e74
YO
1236 /* first pass, loop through and scan words */
1237 reg_trie_data *trie;
1238 regnode *cur;
9f7f3913 1239 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1240 STRLEN len = 0;
1241 UV uvc = 0;
1242 U16 curword = 0;
1243 U32 next_alloc = 0;
786e8c11
YO
1244 regnode *jumper = NULL;
1245 regnode *nextbranch = NULL;
7f69552c 1246 regnode *convert = NULL;
a3621e74 1247 /* we just use folder as a flag in utf8 */
e1ec3a88 1248 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1249 ? PL_fold
1250 : ( flags == EXACTFL
1251 ? PL_fold_locale
1252 : NULL
1253 )
1254 );
1255
e1ec3a88 1256 const U32 data_slot = add_data( pRExC_state, 1, "t" );
a3621e74 1257 SV *re_trie_maxbuff;
3dab1dad
YO
1258#ifndef DEBUGGING
1259 /* these are only used during construction but are useful during
8e11feef 1260 * debugging so we store them in the struct when debugging.
8e11feef 1261 */
3dab1dad 1262 STRLEN trie_charcount=0;
3dab1dad
YO
1263 AV *trie_revcharmap;
1264#endif
a3621e74 1265 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1266#ifndef DEBUGGING
1267 PERL_UNUSED_ARG(depth);
1268#endif
a3621e74 1269
a02a5408 1270 Newxz( trie, 1, reg_trie_data );
a3621e74 1271 trie->refcount = 1;
3dab1dad 1272 trie->startstate = 1;
786e8c11 1273 trie->wordcount = word_count;
a3621e74 1274 RExC_rx->data->data[ data_slot ] = (void*)trie;
a02a5408 1275 Newxz( trie->charmap, 256, U16 );
3dab1dad
YO
1276 if (!(UTF && folder))
1277 Newxz( trie->bitmap, ANYOF_BITMAP_SIZE, char );
a3621e74
YO
1278 DEBUG_r({
1279 trie->words = newAV();
a3621e74 1280 });
3dab1dad 1281 TRIE_REVCHARMAP(trie) = newAV();
a3621e74 1282
0111c4fd 1283 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1284 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1285 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1286 }
3dab1dad
YO
1287 DEBUG_OPTIMISE_r({
1288 PerlIO_printf( Perl_debug_log,
786e8c11 1289 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1290 (int)depth * 2 + 2, "",
1291 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1292 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1293 (int)depth);
3dab1dad 1294 });
7f69552c
YO
1295
1296 /* Find the node we are going to overwrite */
1297 if ( first == startbranch && OP( last ) != BRANCH ) {
1298 /* whole branch chain */
1299 convert = first;
1300 } else {
1301 /* branch sub-chain */
1302 convert = NEXTOPER( first );
1303 }
1304
a3621e74
YO
1305 /* -- First loop and Setup --
1306
1307 We first traverse the branches and scan each word to determine if it
1308 contains widechars, and how many unique chars there are, this is
1309 important as we have to build a table with at least as many columns as we
1310 have unique chars.
1311
1312 We use an array of integers to represent the character codes 0..255
1313 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1314 native representation of the character value as the key and IV's for the
1315 coded index.
1316
1317 *TODO* If we keep track of how many times each character is used we can
1318 remap the columns so that the table compression later on is more
1319 efficient in terms of memory by ensuring most common value is in the
1320 middle and the least common are on the outside. IMO this would be better
1321 than a most to least common mapping as theres a decent chance the most
1322 common letter will share a node with the least common, meaning the node
1323 will not be compressable. With a middle is most common approach the worst
1324 case is when we have the least common nodes twice.
1325
1326 */
1327
a3621e74 1328 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1329 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1330 const U8 *uc = (U8*)STRING( noper );
a28509cc 1331 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1332 STRLEN foldlen = 0;
1333 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1334 const U8 *scan = (U8*)NULL;
07be1b83 1335 U32 wordlen = 0; /* required init */
3dab1dad 1336 STRLEN chars=0;
a3621e74 1337
3dab1dad
YO
1338 if (OP(noper) == NOTHING) {
1339 trie->minlen= 0;
1340 continue;
1341 }
1342 if (trie->bitmap) {
1343 TRIE_BITMAP_SET(trie,*uc);
1344 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1345 }
a3621e74 1346 for ( ; uc < e ; uc += len ) {
3dab1dad 1347 TRIE_CHARCOUNT(trie)++;
a3621e74 1348 TRIE_READ_CHAR;
3dab1dad 1349 chars++;
a3621e74
YO
1350 if ( uvc < 256 ) {
1351 if ( !trie->charmap[ uvc ] ) {
1352 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1353 if ( folder )
1354 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1355 TRIE_STORE_REVCHAR;
a3621e74
YO
1356 }
1357 } else {
1358 SV** svpp;
1359 if ( !trie->widecharmap )
1360 trie->widecharmap = newHV();
1361
1362 svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1363
1364 if ( !svpp )
e4584336 1365 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1366
1367 if ( !SvTRUE( *svpp ) ) {
1368 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1369 TRIE_STORE_REVCHAR;
a3621e74
YO
1370 }
1371 }
1372 }
3dab1dad
YO
1373 if( cur == first ) {
1374 trie->minlen=chars;
1375 trie->maxlen=chars;
1376 } else if (chars < trie->minlen) {
1377 trie->minlen=chars;
1378 } else if (chars > trie->maxlen) {
1379 trie->maxlen=chars;
1380 }
1381
a3621e74
YO
1382 } /* end first pass */
1383 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1384 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1385 (int)depth * 2 + 2,"",
85c3142d 1386 ( trie->widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1387 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1388 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1389 );
786e8c11 1390 Newxz( trie->wordlen, word_count, U32 );
a3621e74
YO
1391
1392 /*
1393 We now know what we are dealing with in terms of unique chars and
1394 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1395 representation using a flat table will take. If it's over a reasonable
1396 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1397 conservative but potentially much slower representation using an array
1398 of lists.
1399
1400 At the end we convert both representations into the same compressed
1401 form that will be used in regexec.c for matching with. The latter
1402 is a form that cannot be used to construct with but has memory
1403 properties similar to the list form and access properties similar
1404 to the table form making it both suitable for fast searches and
1405 small enough that its feasable to store for the duration of a program.
1406
1407 See the comment in the code where the compressed table is produced
1408 inplace from the flat tabe representation for an explanation of how
1409 the compression works.
1410
1411 */
1412
1413
3dab1dad 1414 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1415 /*
1416 Second Pass -- Array Of Lists Representation
1417
1418 Each state will be represented by a list of charid:state records
1419 (reg_trie_trans_le) the first such element holds the CUR and LEN
1420 points of the allocated array. (See defines above).
1421
1422 We build the initial structure using the lists, and then convert
1423 it into the compressed table form which allows faster lookups
1424 (but cant be modified once converted).
a3621e74
YO
1425 */
1426
a3621e74
YO
1427 STRLEN transcount = 1;
1428
1e2e3d02
YO
1429 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1430 "%*sCompiling trie using list compiler\n",
1431 (int)depth * 2 + 2, ""));
1432
3dab1dad 1433 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1434 TRIE_LIST_NEW(1);
1435 next_alloc = 2;
1436
1437 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1438
c445ea15
AL
1439 regnode * const noper = NEXTOPER( cur );
1440 U8 *uc = (U8*)STRING( noper );
1441 const U8 * const e = uc + STR_LEN( noper );
1442 U32 state = 1; /* required init */
1443 U16 charid = 0; /* sanity init */
1444 U8 *scan = (U8*)NULL; /* sanity init */
1445 STRLEN foldlen = 0; /* required init */
07be1b83 1446 U32 wordlen = 0; /* required init */
c445ea15
AL
1447 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1448
3dab1dad 1449 if (OP(noper) != NOTHING) {
786e8c11 1450 for ( ; uc < e ; uc += len ) {
c445ea15 1451
786e8c11 1452 TRIE_READ_CHAR;
c445ea15 1453
786e8c11
YO
1454 if ( uvc < 256 ) {
1455 charid = trie->charmap[ uvc ];
c445ea15 1456 } else {
786e8c11
YO
1457 SV** const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1458 if ( !svpp ) {
1459 charid = 0;
1460 } else {
1461 charid=(U16)SvIV( *svpp );
1462 }
c445ea15 1463 }
786e8c11
YO
1464 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1465 if ( charid ) {
a3621e74 1466
786e8c11
YO
1467 U16 check;
1468 U32 newstate = 0;
a3621e74 1469
786e8c11
YO
1470 charid--;
1471 if ( !trie->states[ state ].trans.list ) {
1472 TRIE_LIST_NEW( state );
c445ea15 1473 }
786e8c11
YO
1474 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1475 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1476 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1477 break;
1478 }
1479 }
1480 if ( ! newstate ) {
1481 newstate = next_alloc++;
1482 TRIE_LIST_PUSH( state, charid, newstate );
1483 transcount++;
1484 }
1485 state = newstate;
1486 } else {
1487 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1488 }
a28509cc 1489 }
c445ea15 1490 }
3dab1dad 1491 TRIE_HANDLE_WORD(state);
a3621e74
YO
1492
1493 } /* end second pass */
1494
1e2e3d02
YO
1495 /* next alloc is the NEXT state to be allocated */
1496 trie->statecount = next_alloc;
a3621e74
YO
1497 Renew( trie->states, next_alloc, reg_trie_state );
1498
3dab1dad
YO
1499 /* and now dump it out before we compress it */
1500 DEBUG_TRIE_COMPILE_MORE_r(
1501 dump_trie_interim_list(trie,next_alloc,depth+1)
1e2e3d02 1502 );
a3621e74 1503
a02a5408 1504 Newxz( trie->trans, transcount ,reg_trie_trans );
a3621e74
YO
1505 {
1506 U32 state;
a3621e74
YO
1507 U32 tp = 0;
1508 U32 zp = 0;
1509
1510
1511 for( state=1 ; state < next_alloc ; state ++ ) {
1512 U32 base=0;
1513
1514 /*
1515 DEBUG_TRIE_COMPILE_MORE_r(
1516 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1517 );
1518 */
1519
1520 if (trie->states[state].trans.list) {
1521 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1522 U16 maxid=minid;
a28509cc 1523 U16 idx;
a3621e74
YO
1524
1525 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1526 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1527 if ( forid < minid ) {
1528 minid=forid;
1529 } else if ( forid > maxid ) {
1530 maxid=forid;
1531 }
a3621e74
YO
1532 }
1533 if ( transcount < tp + maxid - minid + 1) {
1534 transcount *= 2;
1535 Renew( trie->trans, transcount, reg_trie_trans );
1536 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1537 }
1538 base = trie->uniquecharcount + tp - minid;
1539 if ( maxid == minid ) {
1540 U32 set = 0;
1541 for ( ; zp < tp ; zp++ ) {
1542 if ( ! trie->trans[ zp ].next ) {
1543 base = trie->uniquecharcount + zp - minid;
1544 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1545 trie->trans[ zp ].check = state;
1546 set = 1;
1547 break;
1548 }
1549 }
1550 if ( !set ) {
1551 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1552 trie->trans[ tp ].check = state;
1553 tp++;
1554 zp = tp;
1555 }
1556 } else {
1557 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1558 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1559 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1560 trie->trans[ tid ].check = state;
1561 }
1562 tp += ( maxid - minid + 1 );
1563 }
1564 Safefree(trie->states[ state ].trans.list);
1565 }
1566 /*
1567 DEBUG_TRIE_COMPILE_MORE_r(
1568 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1569 );
1570 */
1571 trie->states[ state ].trans.base=base;
1572 }
cc601c31 1573 trie->lasttrans = tp + 1;
a3621e74
YO
1574 }
1575 } else {
1576 /*
1577 Second Pass -- Flat Table Representation.
1578
1579 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1580 We know that we will need Charcount+1 trans at most to store the data
1581 (one row per char at worst case) So we preallocate both structures
1582 assuming worst case.
1583
1584 We then construct the trie using only the .next slots of the entry
1585 structs.
1586
1587 We use the .check field of the first entry of the node temporarily to
1588 make compression both faster and easier by keeping track of how many non
1589 zero fields are in the node.
1590
1591 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1592 transition.
1593
1594 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1595 number representing the first entry of the node, and state as a
1596 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1597 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1598 are 2 entrys per node. eg:
1599
1600 A B A B
1601 1. 2 4 1. 3 7
1602 2. 0 3 3. 0 5
1603 3. 0 0 5. 0 0
1604 4. 0 0 7. 0 0
1605
1606 The table is internally in the right hand, idx form. However as we also
1607 have to deal with the states array which is indexed by nodenum we have to
1608 use TRIE_NODENUM() to convert.
1609
1610 */
1e2e3d02
YO
1611 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1612 "%*sCompiling trie using table compiler\n",
1613 (int)depth * 2 + 2, ""));
3dab1dad
YO
1614
1615 Newxz( trie->trans, ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1,
a3621e74 1616 reg_trie_trans );
3dab1dad 1617 Newxz( trie->states, TRIE_CHARCOUNT(trie) + 2, reg_trie_state );
a3621e74
YO
1618 next_alloc = trie->uniquecharcount + 1;
1619
3dab1dad 1620
a3621e74
YO
1621 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1622
c445ea15 1623 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1624 const U8 *uc = (U8*)STRING( noper );
1625 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1626
1627 U32 state = 1; /* required init */
1628
1629 U16 charid = 0; /* sanity init */
1630 U32 accept_state = 0; /* sanity init */
1631 U8 *scan = (U8*)NULL; /* sanity init */
1632
1633 STRLEN foldlen = 0; /* required init */
07be1b83 1634 U32 wordlen = 0; /* required init */
a3621e74
YO
1635 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1636
3dab1dad 1637 if ( OP(noper) != NOTHING ) {
786e8c11 1638 for ( ; uc < e ; uc += len ) {
a3621e74 1639
786e8c11 1640 TRIE_READ_CHAR;
a3621e74 1641
786e8c11
YO
1642 if ( uvc < 256 ) {
1643 charid = trie->charmap[ uvc ];
1644 } else {
1645 SV* const * const svpp = hv_fetch( trie->widecharmap, (char*)&uvc, sizeof( UV ), 0);
1646 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1647 }
786e8c11
YO
1648 if ( charid ) {
1649 charid--;
1650 if ( !trie->trans[ state + charid ].next ) {
1651 trie->trans[ state + charid ].next = next_alloc;
1652 trie->trans[ state ].check++;
1653 next_alloc += trie->uniquecharcount;
1654 }
1655 state = trie->trans[ state + charid ].next;
1656 } else {
1657 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1658 }
1659 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1660 }
a3621e74 1661 }
3dab1dad
YO
1662 accept_state = TRIE_NODENUM( state );
1663 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1664
1665 } /* end second pass */
1666
3dab1dad
YO
1667 /* and now dump it out before we compress it */
1668 DEBUG_TRIE_COMPILE_MORE_r(
1669 dump_trie_interim_table(trie,next_alloc,depth+1)
1670 );
a3621e74 1671
a3621e74
YO
1672 {
1673 /*
1674 * Inplace compress the table.*
1675
1676 For sparse data sets the table constructed by the trie algorithm will
1677 be mostly 0/FAIL transitions or to put it another way mostly empty.
1678 (Note that leaf nodes will not contain any transitions.)
1679
1680 This algorithm compresses the tables by eliminating most such
1681 transitions, at the cost of a modest bit of extra work during lookup:
1682
1683 - Each states[] entry contains a .base field which indicates the
1684 index in the state[] array wheres its transition data is stored.
1685
1686 - If .base is 0 there are no valid transitions from that node.
1687
1688 - If .base is nonzero then charid is added to it to find an entry in
1689 the trans array.
1690
1691 -If trans[states[state].base+charid].check!=state then the
1692 transition is taken to be a 0/Fail transition. Thus if there are fail
1693 transitions at the front of the node then the .base offset will point
1694 somewhere inside the previous nodes data (or maybe even into a node
1695 even earlier), but the .check field determines if the transition is
1696 valid.
1697
786e8c11 1698 XXX - wrong maybe?
a3621e74
YO
1699 The following process inplace converts the table to the compressed
1700 table: We first do not compress the root node 1,and mark its all its
1701 .check pointers as 1 and set its .base pointer as 1 as well. This
1702 allows to do a DFA construction from the compressed table later, and
1703 ensures that any .base pointers we calculate later are greater than
1704 0.
1705
1706 - We set 'pos' to indicate the first entry of the second node.
1707
1708 - We then iterate over the columns of the node, finding the first and
1709 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1710 and set the .check pointers accordingly, and advance pos
1711 appropriately and repreat for the next node. Note that when we copy
1712 the next pointers we have to convert them from the original
1713 NODEIDX form to NODENUM form as the former is not valid post
1714 compression.
1715
1716 - If a node has no transitions used we mark its base as 0 and do not
1717 advance the pos pointer.
1718
1719 - If a node only has one transition we use a second pointer into the
1720 structure to fill in allocated fail transitions from other states.
1721 This pointer is independent of the main pointer and scans forward
1722 looking for null transitions that are allocated to a state. When it
1723 finds one it writes the single transition into the "hole". If the
786e8c11 1724 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1725
1726 - Once compressed we can Renew/realloc the structures to release the
1727 excess space.
1728
1729 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1730 specifically Fig 3.47 and the associated pseudocode.
1731
1732 demq
1733 */
a3b680e6 1734 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1735 U32 state, charid;
a3621e74 1736 U32 pos = 0, zp=0;
1e2e3d02 1737 trie->statecount = laststate;
a3621e74
YO
1738
1739 for ( state = 1 ; state < laststate ; state++ ) {
1740 U8 flag = 0;
a28509cc
AL
1741 const U32 stateidx = TRIE_NODEIDX( state );
1742 const U32 o_used = trie->trans[ stateidx ].check;
1743 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1744 trie->trans[ stateidx ].check = 0;
1745
1746 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1747 if ( flag || trie->trans[ stateidx + charid ].next ) {
1748 if ( trie->trans[ stateidx + charid ].next ) {
1749 if (o_used == 1) {
1750 for ( ; zp < pos ; zp++ ) {
1751 if ( ! trie->trans[ zp ].next ) {
1752 break;
1753 }
1754 }
1755 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1756 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1757 trie->trans[ zp ].check = state;
1758 if ( ++zp > pos ) pos = zp;
1759 break;
1760 }
1761 used--;
1762 }
1763 if ( !flag ) {
1764 flag = 1;
1765 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1766 }
1767 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1768 trie->trans[ pos ].check = state;
1769 pos++;
1770 }
1771 }
1772 }
cc601c31 1773 trie->lasttrans = pos + 1;
1e2e3d02 1774 Renew( trie->states, laststate, reg_trie_state);
a3621e74 1775 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1776 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1777 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1778 (int)depth * 2 + 2,"",
1779 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1780 (IV)next_alloc,
1781 (IV)pos,
a3621e74
YO
1782 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1783 );
1784
1785 } /* end table compress */
1786 }
1e2e3d02
YO
1787 DEBUG_TRIE_COMPILE_MORE_r(
1788 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1789 (int)depth * 2 + 2, "",
1790 (UV)trie->statecount,
1791 (UV)trie->lasttrans)
1792 );
cc601c31
YO
1793 /* resize the trans array to remove unused space */
1794 Renew( trie->trans, trie->lasttrans, reg_trie_trans);
a3621e74 1795
3dab1dad
YO
1796 /* and now dump out the compressed format */
1797 DEBUG_TRIE_COMPILE_r(
1798 dump_trie(trie,depth+1)
1799 );
07be1b83 1800
3dab1dad 1801 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1802 U8 nodetype =(U8)(flags & 0xFF);
1803 char *str=NULL;
786e8c11 1804
07be1b83 1805#ifdef DEBUGGING
e62cc96a 1806 regnode *optimize = NULL;
b57a0404
JH
1807 U32 mjd_offset = 0;
1808 U32 mjd_nodelen = 0;
07be1b83 1809#endif
a3621e74 1810 /*
3dab1dad
YO
1811 This means we convert either the first branch or the first Exact,
1812 depending on whether the thing following (in 'last') is a branch
1813 or not and whther first is the startbranch (ie is it a sub part of
1814 the alternation or is it the whole thing.)
1815 Assuming its a sub part we conver the EXACT otherwise we convert
1816 the whole branch sequence, including the first.
a3621e74 1817 */
3dab1dad 1818 /* Find the node we are going to overwrite */
7f69552c 1819 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1820 /* branch sub-chain */
3dab1dad 1821 NEXT_OFF( first ) = (U16)(last - first);
07be1b83
YO
1822 DEBUG_r({
1823 mjd_offset= Node_Offset((convert));
1824 mjd_nodelen= Node_Length((convert));
1825 });
7f69552c
YO
1826 /* whole branch chain */
1827 } else {
1828 DEBUG_r({
1829 const regnode *nop = NEXTOPER( convert );
1830 mjd_offset= Node_Offset((nop));
1831 mjd_nodelen= Node_Length((nop));
1832 });
07be1b83 1833 }
7f69552c 1834
07be1b83
YO
1835 DEBUG_OPTIMISE_r(
1836 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1837 (int)depth * 2 + 2, "",
786e8c11 1838 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1839 );
a3621e74 1840
3dab1dad
YO
1841 /* But first we check to see if there is a common prefix we can
1842 split out as an EXACT and put in front of the TRIE node. */
1843 trie->startstate= 1;
786e8c11 1844 if ( trie->bitmap && !trie->widecharmap && !trie->jump ) {
3dab1dad 1845 U32 state;
1e2e3d02 1846 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1847 U32 ofs = 0;
8e11feef
RGS
1848 I32 idx = -1;
1849 U32 count = 0;
1850 const U32 base = trie->states[ state ].trans.base;
a3621e74 1851
3dab1dad 1852 if ( trie->states[state].wordnum )
8e11feef 1853 count = 1;
a3621e74 1854
8e11feef 1855 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1856 if ( ( base + ofs >= trie->uniquecharcount ) &&
1857 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1858 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1859 {
3dab1dad 1860 if ( ++count > 1 ) {
8e11feef 1861 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), ofs, 0);
07be1b83 1862 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1863 if ( state == 1 ) break;
3dab1dad
YO
1864 if ( count == 2 ) {
1865 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1866 DEBUG_OPTIMISE_r(
8e11feef
RGS
1867 PerlIO_printf(Perl_debug_log,
1868 "%*sNew Start State=%"UVuf" Class: [",
1869 (int)depth * 2 + 2, "",
786e8c11 1870 (UV)state));
be8e71aa
YO
1871 if (idx >= 0) {
1872 SV ** const tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
1873 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1874
3dab1dad 1875 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1876 if ( folder )
1877 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1878 DEBUG_OPTIMISE_r(
07be1b83 1879 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1880 );
8e11feef
RGS
1881 }
1882 }
1883 TRIE_BITMAP_SET(trie,*ch);
1884 if ( folder )
1885 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1886 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1887 }
1888 idx = ofs;
1889 }
3dab1dad
YO
1890 }
1891 if ( count == 1 ) {
1892 SV **tmp = av_fetch( TRIE_REVCHARMAP(trie), idx, 0);
8e11feef 1893 const char *ch = SvPV_nolen_const( *tmp );
3dab1dad 1894 DEBUG_OPTIMISE_r(
8e11feef
RGS
1895 PerlIO_printf( Perl_debug_log,
1896 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1897 (int)depth * 2 + 2, "",
786e8c11 1898 (UV)state, (UV)idx, ch)
3dab1dad
YO
1899 );
1900 if ( state==1 ) {
1901 OP( convert ) = nodetype;
1902 str=STRING(convert);
1903 STR_LEN(convert)=0;
1904 }
1905 *str++=*ch;
1906 STR_LEN(convert)++;
a3621e74 1907
8e11feef 1908 } else {
f9049ba1 1909#ifdef DEBUGGING
8e11feef
RGS
1910 if (state>1)
1911 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1912#endif
8e11feef
RGS
1913 break;
1914 }
1915 }
3dab1dad 1916 if (str) {
8e11feef 1917 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1918 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1919 trie->startstate = state;
07be1b83
YO
1920 trie->minlen -= (state - 1);
1921 trie->maxlen -= (state - 1);
1922 DEBUG_r({
1923 regnode *fix = convert;
1924 mjd_nodelen++;
1925 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1926 while( ++fix < n ) {
1927 Set_Node_Offset_Length(fix, 0, 0);
1928 }
1929 });
8e11feef
RGS
1930 if (trie->maxlen) {
1931 convert = n;
1932 } else {
3dab1dad 1933 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 1934 DEBUG_r(optimize= n);
3dab1dad
YO
1935 }
1936 }
1937 }
a5ca303d
YO
1938 if (!jumper)
1939 jumper = last;
3dab1dad 1940 if ( trie->maxlen ) {
8e11feef
RGS
1941 NEXT_OFF( convert ) = (U16)(tail - convert);
1942 ARG_SET( convert, data_slot );
786e8c11
YO
1943 /* Store the offset to the first unabsorbed branch in
1944 jump[0], which is otherwise unused by the jump logic.
1945 We use this when dumping a trie and during optimisation. */
1946 if (trie->jump)
7f69552c 1947 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 1948
786e8c11
YO
1949 /* XXXX */
1950 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 1951 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
1952 {
1953 OP( convert ) = TRIEC;
1954 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1955 Safefree(trie->bitmap);
1956 trie->bitmap= NULL;
1957 } else
1958 OP( convert ) = TRIE;
a3621e74 1959
3dab1dad
YO
1960 /* store the type in the flags */
1961 convert->flags = nodetype;
a5ca303d
YO
1962 DEBUG_r({
1963 optimize = convert
1964 + NODE_STEP_REGNODE
1965 + regarglen[ OP( convert ) ];
1966 });
1967 /* XXX We really should free up the resource in trie now,
1968 as we won't use them - (which resources?) dmq */
3dab1dad 1969 }
a3621e74 1970 /* needed for dumping*/
e62cc96a 1971 DEBUG_r(if (optimize) {
07be1b83 1972 regnode *opt = convert;
e62cc96a 1973 while ( ++opt < optimize) {
07be1b83
YO
1974 Set_Node_Offset_Length(opt,0,0);
1975 }
786e8c11
YO
1976 /*
1977 Try to clean up some of the debris left after the
1978 optimisation.
a3621e74 1979 */
786e8c11 1980 while( optimize < jumper ) {
07be1b83 1981 mjd_nodelen += Node_Length((optimize));
a3621e74 1982 OP( optimize ) = OPTIMIZED;
07be1b83 1983 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
1984 optimize++;
1985 }
07be1b83 1986 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
1987 });
1988 } /* end node insert */
07be1b83 1989#ifndef DEBUGGING
6e8b4190 1990 SvREFCNT_dec(TRIE_REVCHARMAP(trie));
07be1b83 1991#endif
786e8c11
YO
1992 return trie->jump
1993 ? MADE_JUMP_TRIE
1994 : trie->startstate>1
1995 ? MADE_EXACT_TRIE
1996 : MADE_TRIE;
1997}
1998
1999STATIC void
2000S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2001{
2002/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2003
2004 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2005 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2006 ISBN 0-201-10088-6
2007
2008 We find the fail state for each state in the trie, this state is the longest proper
2009 suffix of the current states 'word' that is also a proper prefix of another word in our
2010 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2011 the DFA not to have to restart after its tried and failed a word at a given point, it
2012 simply continues as though it had been matching the other word in the first place.
2013 Consider
2014 'abcdgu'=~/abcdefg|cdgu/
2015 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2016 fail, which would bring use to the state representing 'd' in the second word where we would
2017 try 'g' and succeed, prodceding to match 'cdgu'.
2018 */
2019 /* add a fail transition */
2020 reg_trie_data *trie=(reg_trie_data *)RExC_rx->data->data[ARG(source)];
2021 U32 *q;
2022 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2023 const U32 numstates = trie->statecount;
786e8c11
YO
2024 const U32 ubound = trie->lasttrans + ucharcount;
2025 U32 q_read = 0;
2026 U32 q_write = 0;
2027 U32 charid;
2028 U32 base = trie->states[ 1 ].trans.base;
2029 U32 *fail;
2030 reg_ac_data *aho;
2031 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2032 GET_RE_DEBUG_FLAGS_DECL;
2033#ifndef DEBUGGING
2034 PERL_UNUSED_ARG(depth);
2035#endif
2036
2037
2038 ARG_SET( stclass, data_slot );
2039 Newxz( aho, 1, reg_ac_data );
2040 RExC_rx->data->data[ data_slot ] = (void*)aho;
2041 aho->trie=trie;
2042 aho->states=(reg_trie_state *)savepvn((const char*)trie->states,
1e2e3d02 2043 numstates * sizeof(reg_trie_state));
786e8c11
YO
2044 Newxz( q, numstates, U32);
2045 Newxz( aho->fail, numstates, U32 );
2046 aho->refcount = 1;
2047 fail = aho->fail;
2048 /* initialize fail[0..1] to be 1 so that we always have
2049 a valid final fail state */
2050 fail[ 0 ] = fail[ 1 ] = 1;
2051
2052 for ( charid = 0; charid < ucharcount ; charid++ ) {
2053 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2054 if ( newstate ) {
2055 q[ q_write ] = newstate;
2056 /* set to point at the root */
2057 fail[ q[ q_write++ ] ]=1;
2058 }
2059 }
2060 while ( q_read < q_write) {
2061 const U32 cur = q[ q_read++ % numstates ];
2062 base = trie->states[ cur ].trans.base;
2063
2064 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2065 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2066 if (ch_state) {
2067 U32 fail_state = cur;
2068 U32 fail_base;
2069 do {
2070 fail_state = fail[ fail_state ];
2071 fail_base = aho->states[ fail_state ].trans.base;
2072 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2073
2074 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2075 fail[ ch_state ] = fail_state;
2076 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2077 {
2078 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2079 }
2080 q[ q_write++ % numstates] = ch_state;
2081 }
2082 }
2083 }
2084 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2085 when we fail in state 1, this allows us to use the
2086 charclass scan to find a valid start char. This is based on the principle
2087 that theres a good chance the string being searched contains lots of stuff
2088 that cant be a start char.
2089 */
2090 fail[ 0 ] = fail[ 1 ] = 0;
2091 DEBUG_TRIE_COMPILE_r({
1e2e3d02
YO
2092 PerlIO_printf(Perl_debug_log, "%*sStclass Failtable (%"UVuf" states): 0",
2093 (int)(depth * 2), "", numstates
2094 );
786e8c11
YO
2095 for( q_read=1; q_read<numstates; q_read++ ) {
2096 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2097 }
2098 PerlIO_printf(Perl_debug_log, "\n");
2099 });
2100 Safefree(q);
2101 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2102}
2103
786e8c11 2104
a3621e74 2105/*
5d1c421c
JH
2106 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2107 * These need to be revisited when a newer toolchain becomes available.
2108 */
2109#if defined(__sparc64__) && defined(__GNUC__)
2110# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2111# undef SPARC64_GCC_WORKAROUND
2112# define SPARC64_GCC_WORKAROUND 1
2113# endif
2114#endif
2115
07be1b83
YO
2116#define DEBUG_PEEP(str,scan,depth) \
2117 DEBUG_OPTIMISE_r({ \
2118 SV * const mysv=sv_newmortal(); \
2119 regnode *Next = regnext(scan); \
2120 regprop(RExC_rx, mysv, scan); \
7f69552c 2121 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2122 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2123 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2124 });
2125
1de06328
YO
2126
2127
2128
2129
07be1b83
YO
2130#define JOIN_EXACT(scan,min,flags) \
2131 if (PL_regkind[OP(scan)] == EXACT) \
2132 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2133
be8e71aa 2134STATIC U32
07be1b83
YO
2135S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2136 /* Merge several consecutive EXACTish nodes into one. */
2137 regnode *n = regnext(scan);
2138 U32 stringok = 1;
2139 regnode *next = scan + NODE_SZ_STR(scan);
2140 U32 merged = 0;
2141 U32 stopnow = 0;
2142#ifdef DEBUGGING
2143 regnode *stop = scan;
72f13be8 2144 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2145#else
d47053eb
RGS
2146 PERL_UNUSED_ARG(depth);
2147#endif
2148#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2149 PERL_UNUSED_ARG(flags);
2150 PERL_UNUSED_ARG(val);
07be1b83 2151#endif
07be1b83
YO
2152 DEBUG_PEEP("join",scan,depth);
2153
2154 /* Skip NOTHING, merge EXACT*. */
2155 while (n &&
2156 ( PL_regkind[OP(n)] == NOTHING ||
2157 (stringok && (OP(n) == OP(scan))))
2158 && NEXT_OFF(n)
2159 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2160
2161 if (OP(n) == TAIL || n > next)
2162 stringok = 0;
2163 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2164 DEBUG_PEEP("skip:",n,depth);
2165 NEXT_OFF(scan) += NEXT_OFF(n);
2166 next = n + NODE_STEP_REGNODE;
2167#ifdef DEBUGGING
2168 if (stringok)
2169 stop = n;
2170#endif
2171 n = regnext(n);
2172 }
2173 else if (stringok) {
786e8c11 2174 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2175 regnode * const nnext = regnext(n);
2176
2177 DEBUG_PEEP("merg",n,depth);
2178
2179 merged++;
2180 if (oldl + STR_LEN(n) > U8_MAX)
2181 break;
2182 NEXT_OFF(scan) += NEXT_OFF(n);
2183 STR_LEN(scan) += STR_LEN(n);
2184 next = n + NODE_SZ_STR(n);
2185 /* Now we can overwrite *n : */
2186 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2187#ifdef DEBUGGING
2188 stop = next - 1;
2189#endif
2190 n = nnext;
2191 if (stopnow) break;
2192 }
2193
d47053eb
RGS
2194#ifdef EXPERIMENTAL_INPLACESCAN
2195 if (flags && !NEXT_OFF(n)) {
2196 DEBUG_PEEP("atch", val, depth);
2197 if (reg_off_by_arg[OP(n)]) {
2198 ARG_SET(n, val - n);
2199 }
2200 else {
2201 NEXT_OFF(n) = val - n;
2202 }
2203 stopnow = 1;
2204 }
07be1b83
YO
2205#endif
2206 }
2207
2208 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2209 /*
2210 Two problematic code points in Unicode casefolding of EXACT nodes:
2211
2212 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2213 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2214
2215 which casefold to
2216
2217 Unicode UTF-8
2218
2219 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2220 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2221
2222 This means that in case-insensitive matching (or "loose matching",
2223 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2224 length of the above casefolded versions) can match a target string
2225 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2226 This would rather mess up the minimum length computation.
2227
2228 What we'll do is to look for the tail four bytes, and then peek
2229 at the preceding two bytes to see whether we need to decrease
2230 the minimum length by four (six minus two).
2231
2232 Thanks to the design of UTF-8, there cannot be false matches:
2233 A sequence of valid UTF-8 bytes cannot be a subsequence of
2234 another valid sequence of UTF-8 bytes.
2235
2236 */
2237 char * const s0 = STRING(scan), *s, *t;
2238 char * const s1 = s0 + STR_LEN(scan) - 1;
2239 char * const s2 = s1 - 4;
e294cc5d
JH
2240#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2241 const char t0[] = "\xaf\x49\xaf\x42";
2242#else
07be1b83 2243 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2244#endif
07be1b83
YO
2245 const char * const t1 = t0 + 3;
2246
2247 for (s = s0 + 2;
2248 s < s2 && (t = ninstr(s, s1, t0, t1));
2249 s = t + 4) {
e294cc5d
JH
2250#ifdef EBCDIC
2251 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2252 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2253#else
07be1b83
YO
2254 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2255 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2256#endif
07be1b83
YO
2257 *min -= 4;
2258 }
2259 }
2260
2261#ifdef DEBUGGING
2262 /* Allow dumping */
2263 n = scan + NODE_SZ_STR(scan);
2264 while (n <= stop) {
2265 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2266 OP(n) = OPTIMIZED;
2267 NEXT_OFF(n) = 0;
2268 }
2269 n++;
2270 }
2271#endif
2272 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2273 return stopnow;
2274}
2275
653099ff
GS
2276/* REx optimizer. Converts nodes into quickier variants "in place".
2277 Finds fixed substrings. */
2278
a0288114 2279/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2280 to the position after last scanned or to NULL. */
2281
40d049e4
YO
2282#define INIT_AND_WITHP \
2283 assert(!and_withp); \
2284 Newx(and_withp,1,struct regnode_charclass_class); \
2285 SAVEFREEPV(and_withp)
07be1b83 2286
76e3520e 2287STATIC I32
40d049e4 2288S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2289 I32 *minlenp, I32 *deltap,
40d049e4
YO
2290 regnode *last,
2291 scan_data_t *data,
2292 I32 stopparen,
2293 U8* recursed,
2294 struct regnode_charclass_class *and_withp,
2295 U32 flags, U32 depth)
c277df42
IZ
2296 /* scanp: Start here (read-write). */
2297 /* deltap: Write maxlen-minlen here. */
2298 /* last: Stop before this one. */
40d049e4
YO
2299 /* data: string data about the pattern */
2300 /* stopparen: treat close N as END */
2301 /* recursed: which subroutines have we recursed into */
2302 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2303{
97aff369 2304 dVAR;
c277df42
IZ
2305 I32 min = 0, pars = 0, code;
2306 regnode *scan = *scanp, *next;
2307 I32 delta = 0;
2308 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2309 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2310 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2311 scan_data_t data_fake;
a3621e74 2312 SV *re_trie_maxbuff = NULL;
786e8c11 2313 regnode *first_non_open = scan;
a3621e74 2314 GET_RE_DEBUG_FLAGS_DECL;
13a24bad 2315#ifdef DEBUGGING
40d049e4 2316 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2317#endif
40d049e4 2318
786e8c11 2319 if ( depth == 0 ) {
40d049e4 2320 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2321 first_non_open=regnext(first_non_open);
2322 }
2323
b81d288d 2324
c277df42
IZ
2325 while (scan && OP(scan) != END && scan < last) {
2326 /* Peephole optimizer: */
1de06328 2327 DEBUG_STUDYDATA(data,depth);
07be1b83 2328 DEBUG_PEEP("Peep",scan,depth);
07be1b83 2329 JOIN_EXACT(scan,&min,0);
a3621e74 2330
653099ff
GS
2331 /* Follow the next-chain of the current node and optimize
2332 away all the NOTHINGs from it. */
c277df42 2333 if (OP(scan) != CURLYX) {
a3b680e6 2334 const int max = (reg_off_by_arg[OP(scan)]
048cfca1
GS
2335 ? I32_MAX
2336 /* I32 may be smaller than U16 on CRAYs! */
2337 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
c277df42
IZ
2338 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2339 int noff;
2340 regnode *n = scan;
b81d288d 2341
c277df42
IZ
2342 /* Skip NOTHING and LONGJMP. */
2343 while ((n = regnext(n))
3dab1dad 2344 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
c277df42
IZ
2345 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2346 && off + noff < max)
2347 off += noff;
2348 if (reg_off_by_arg[OP(scan)])
2349 ARG(scan) = off;
b81d288d 2350 else
c277df42
IZ
2351 NEXT_OFF(scan) = off;
2352 }
a3621e74 2353
07be1b83 2354
3dab1dad 2355
653099ff
GS
2356 /* The principal pseudo-switch. Cannot be a switch, since we
2357 look into several different things. */
b81d288d 2358 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
c277df42
IZ
2359 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
2360 next = regnext(scan);
2361 code = OP(scan);
a3621e74 2362 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
b81d288d
AB
2363
2364 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
786e8c11
YO
2365 /* NOTE - There is similar code to this block below for handling
2366 TRIE nodes on a re-study. If you change stuff here check there
2367 too. */
c277df42 2368 I32 max1 = 0, min1 = I32_MAX, num = 0;
653099ff 2369 struct regnode_charclass_class accum;
d4c19fe8 2370 regnode * const startbranch=scan;
c277df42 2371
653099ff 2372 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 2373 scan_commit(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
653099ff 2374 if (flags & SCF_DO_STCLASS)
830247a4 2375 cl_init_zero(pRExC_state, &accum);
a3621e74 2376
c277df42 2377 while (OP(scan) == code) {
830247a4 2378 I32 deltanext, minnext, f = 0, fake;
653099ff 2379 struct regnode_charclass_class this_class;
c277df42
IZ
2380
2381 num++;
2382 data_fake.flags = 0;
40d049e4 2383 if (data) {
2c2d71f5 2384 data_fake.whilem_c = data->whilem_c;
cb434fcc
IZ
2385 data_fake.last_closep = data->last_closep;
2386 }
2387 else
2388 data_fake.last_closep = &fake;
c277df42
IZ
2389 next = regnext(scan);
2390 scan = NEXTOPER(scan);
2391 if (code != BRANCH)
2392 scan = NEXTOPER(scan);
653099ff 2393 if (flags & SCF_DO_STCLASS) {
830247a4 2394 cl_init(pRExC_state, &this_class);
653099ff
GS
2395 data_fake.start_class = &this_class;
2396 f = SCF_DO_STCLASS_AND;
b81d288d 2397 }
e1901655
IZ
2398 if (flags & SCF_WHILEM_VISITED_POS)
2399 f |= SCF_WHILEM_VISITED_POS;
a3621e74 2400
653099ff 2401 /* we suppose the run is continuous, last=next...*/
1de06328 2402 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
40d049e4
YO
2403 next, &data_fake,
2404 stopparen, recursed, NULL, f,depth+1);
b81d288d 2405 if (min1 > minnext)
c277df42
IZ
2406 min1 = minnext;
2407 if (max1 < minnext + deltanext)
2408 max1 = minnext + deltanext;
2409 if (deltanext == I32_MAX)
aca2d497 2410 is_inf = is_inf_internal = 1;
c277df42
IZ
2411 scan = next;
2412 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2413 pars++;
3dab1dad
YO
2414 if (data) {
2415 if (data_fake.flags & SF_HAS_EVAL)
07be1b83 2416 data->flags |= SF_HAS_EVAL;
2c2d71f5 2417 data->whilem_c = data_fake.whilem_c;
3dab1dad 2418 }
653099ff 2419 if (flags & SCF_DO_STCLASS)
830247a4 2420 cl_or(pRExC_state, &accum, &this_class);
b81d288d 2421 if (code == SUSPEND)
c277df42
IZ
2422 break;
2423 }
2424 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2425 min1 = 0;
2426 if (flags & SCF_DO_SUBSTR) {
2427 data->pos_min += min1;
2428 data->pos_delta += max1 - min1;
2429 if (max1 != min1 || is_inf)
2430 data->longest = &(data->longest_float);
2431 }
2432 min += min1;
2433 delta += max1 - min1;
653099ff 2434 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2435 cl_or(pRExC_state, data->start_class, &accum);
653099ff 2436 if (min1) {
40d049e4 2437 cl_and(data->start_class, and_withp);
653099ff
GS
2438 flags &= ~SCF_DO_STCLASS;
2439 }
2440 }
2441 else if (flags & SCF_DO_STCLASS_AND) {
de0c8cb8
GS
2442 if (min1) {
2443 cl_and(data->start_class, &accum);
653099ff 2444 flags &= ~SCF_DO_STCLASS;
de0c8cb8
GS
2445 }
2446 else {
b81d288d 2447 /* Switch to OR mode: cache the old value of
de0c8cb8 2448 * data->start_class */
40d049e4
YO
2449 INIT_AND_WITHP;
2450 StructCopy(data->start_class, and_withp,
de0c8cb8
GS
2451 struct regnode_charclass_class);
2452 flags &= ~SCF_DO_STCLASS_AND;
2453 StructCopy(&accum, data->start_class,
2454 struct regnode_charclass_class);
2455 flags |= SCF_DO_STCLASS_OR;
2456 data->start_class->flags |= ANYOF_EOS;
2457 }
653099ff 2458 }
a3621e74 2459
786e8c11 2460 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
a3621e74
YO
2461 /* demq.
2462
2463 Assuming this was/is a branch we are dealing with: 'scan' now
2464 points at the item that follows the branch sequence, whatever
2465 it is. We now start at the beginning of the sequence and look
2466 for subsequences of
2467
786e8c11
YO
2468 BRANCH->EXACT=>x1
2469 BRANCH->EXACT=>x2
2470 tail
a3621e74
YO
2471
2472 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2473
2474 If we can find such a subseqence we need to turn the first
2475 element into a trie and then add the subsequent branch exact
2476 strings to the trie.
2477
2478 We have two cases
2479
786e8c11 2480 1. patterns where the whole set of branch can be converted.
a3621e74 2481
786e8c11 2482 2. patterns where only a subset can be converted.
a3621e74
YO
2483
2484 In case 1 we can replace the whole set with a single regop
2485 for the trie. In case 2 we need to keep the start and end
2486 branchs so
2487
2488 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2489 becomes BRANCH TRIE; BRANCH X;
2490
786e8c11
YO
2491 There is an additional case, that being where there is a
2492 common prefix, which gets split out into an EXACT like node
2493 preceding the TRIE node.
2494
2495 If x(1..n)==tail then we can do a simple trie, if not we make
2496 a "jump" trie, such that when we match the appropriate word
2497 we "jump" to the appopriate tail node. Essentailly we turn
2498 a nested if into a case structure of sorts.
a3621e74
YO
2499
2500 */
786e8c11 2501
3dab1dad 2502 int made=0;
0111c4fd
RGS
2503 if (!re_trie_maxbuff) {
2504 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2505 if (!SvIOK(re_trie_maxbuff))
2506 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2507 }
786e8c11 2508 if ( SvIV(re_trie_maxbuff)>=0 ) {
a3621e74
YO
2509 regnode *cur;
2510 regnode *first = (regnode *)NULL;
2511 regnode *last = (regnode *)NULL;
2512 regnode *tail = scan;
2513 U8 optype = 0;
2514 U32 count=0;
2515
2516#ifdef DEBUGGING
c445ea15 2517 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74
YO
2518#endif
2519 /* var tail is used because there may be a TAIL
2520 regop in the way. Ie, the exacts will point to the
2521 thing following the TAIL, but the last branch will
2522 point at the TAIL. So we advance tail. If we
2523 have nested (?:) we may have to move through several
2524 tails.
2525 */
2526
2527 while ( OP( tail ) == TAIL ) {
2528 /* this is the TAIL generated by (?:) */
2529 tail = regnext( tail );
2530 }
2531
3dab1dad 2532
a3621e74 2533 DEBUG_OPTIMISE_r({
32fc9b6a 2534 regprop(RExC_rx, mysv, tail );
3dab1dad
YO
2535 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2536 (int)depth * 2 + 2, "",
2537 "Looking for TRIE'able sequences. Tail node is: ",
2538 SvPV_nolen_const( mysv )
a3621e74
YO
2539 );
2540 });
3dab1dad 2541
a3621e74
YO
2542 /*
2543
2544 step through the branches, cur represents each
2545 branch, noper is the first thing to be matched
2546 as part of that branch and noper_next is the
2547 regnext() of that node. if noper is an EXACT
2548 and noper_next is the same as scan (our current
2549 position in the regex) then the EXACT branch is
2550 a possible optimization target. Once we have
2551 two or more consequetive such branches we can
2552 create a trie of the EXACT's contents and stich
2553 it in place. If the sequence represents all of
2554 the branches we eliminate the whole thing and
2555 replace it with a single TRIE. If it is a
2556 subsequence then we need to stitch it in. This
2557 means the first branch has to remain, and needs
2558 to be repointed at the item on the branch chain
2559 following the last branch optimized. This could
2560 be either a BRANCH, in which case the
2561 subsequence is internal, or it could be the
2562 item following the branch sequence in which
2563 case the subsequence is at the end.
2564
2565 */
2566
2567 /* dont use tail as the end marker for this traverse */
2568 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
aec46f14 2569 regnode * const noper = NEXTOPER( cur );
be981c67 2570#if defined(DEBUGGING) || defined(NOJUMPTRIE)
aec46f14 2571 regnode * const noper_next = regnext( noper );
be981c67 2572#endif
a3621e74 2573
a3621e74 2574 DEBUG_OPTIMISE_r({
32fc9b6a 2575 regprop(RExC_rx, mysv, cur);
3dab1dad
YO
2576 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2577 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
a3621e74 2578
32fc9b6a 2579 regprop(RExC_rx, mysv, noper);
a3621e74 2580 PerlIO_printf( Perl_debug_log, " -> %s",
cfd0369c 2581 SvPV_nolen_const(mysv));
a3621e74
YO
2582
2583 if ( noper_next ) {
32fc9b6a 2584 regprop(RExC_rx, mysv, noper_next );
a3621e74 2585 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
cfd0369c 2586 SvPV_nolen_const(mysv));
a3621e74 2587 }
3dab1dad
YO
2588 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2589 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
a3621e74 2590 });
3dab1dad
YO
2591 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2592 : PL_regkind[ OP( noper ) ] == EXACT )
2593 || OP(noper) == NOTHING )
786e8c11
YO
2594#ifdef NOJUMPTRIE
2595 && noper_next == tail
2596#endif
2597 && count < U16_MAX)
a3621e74
YO
2598 {
2599 count++;
3dab1dad
YO
2600 if ( !first || optype == NOTHING ) {
2601 if (!first) first = cur;
a3621e74
YO
2602 optype = OP( noper );
2603 } else {
a3621e74 2604 last = cur;
a3621e74
YO
2605 }
2606 } else {
2607 if ( last ) {
786e8c11
YO
2608 make_trie( pRExC_state,
2609 startbranch, first, cur, tail, count,
2610 optype, depth+1 );
a3621e74 2611 }
3dab1dad 2612 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11
YO
2613#ifdef NOJUMPTRIE
2614 && noper_next == tail
2615#endif
2616 ){
a3621e74
YO
2617 count = 1;
2618 first = cur;
2619 optype = OP( noper );
2620 } else {
2621 count = 0;
2622 first = NULL;
2623 optype = 0;
2624 }
2625 last = NULL;
2626 }
2627 }
2628 DEBUG_OPTIMISE_r({
32fc9b6a 2629 regprop(RExC_rx, mysv, cur);
a3621e74 2630 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2631 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2632 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
a3621e74
YO
2633
2634 });
2635 if ( last ) {
786e8c11 2636 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2637#ifdef TRIE_STUDY_OPT
786e8c11
YO
2638 if ( ((made == MADE_EXACT_TRIE &&
2639 startbranch == first)
2640 || ( first_non_open == first )) &&
2641 depth==0 )
2642 flags |= SCF_TRIE_RESTUDY;
3dab1dad 2643#endif
07be1b83 2644 }
a3621e74 2645 }
3dab1dad
YO
2646
2647 } /* do trie */
786e8c11 2648
a0ed51b3 2649 }
a3621e74 2650 else if ( code == BRANCHJ ) { /* single branch is optimized. */
c277df42 2651 scan = NEXTOPER(NEXTOPER(scan));
a3621e74 2652 } else /* single branch is optimized. */
c277df42
IZ
2653 scan = NEXTOPER(scan);
2654 continue;
a0ed51b3
LW
2655 }
2656 else if (OP(scan) == EXACT) {
cd439c50 2657 I32 l = STR_LEN(scan);
c445ea15 2658 UV uc;
a0ed51b3 2659 if (UTF) {
a3b680e6 2660 const U8 * const s = (U8*)STRING(scan);
1aa99e6b 2661 l = utf8_length(s, s + l);
9041c2e3 2662 uc = utf8_to_uvchr(s, NULL);
c445ea15
AL
2663 } else {
2664 uc = *((U8*)STRING(scan));
a0ed51b3
LW
2665 }
2666 min += l;
c277df42 2667 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
c277df42
IZ
2668 /* The code below prefers earlier match for fixed
2669 offset, later match for variable offset. */
2670 if (data->last_end == -1) { /* Update the start info. */
2671 data->last_start_min = data->pos_min;
2672 data->last_start_max = is_inf
b81d288d 2673 ? I32_MAX : data->pos_min + data->pos_delta;
c277df42 2674 }
cd439c50 2675 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
45f47268
NC
2676 if (UTF)
2677 SvUTF8_on(data->last_found);
0eda9292 2678 {
9a957fbc 2679 SV * const sv = data->last_found;
a28509cc 2680 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
0eda9292
JH
2681 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2682 if (mg && mg->mg_len >= 0)
5e43f467
JH
2683 mg->mg_len += utf8_length((U8*)STRING(scan),
2684 (U8*)STRING(scan)+STR_LEN(scan));
0eda9292 2685 }
c277df42
IZ
2686 data->last_end = data->pos_min + l;
2687 data->pos_min += l; /* As in the first entry. */
2688 data->flags &= ~SF_BEFORE_EOL;
2689 }
653099ff
GS
2690 if (flags & SCF_DO_STCLASS_AND) {
2691 /* Check whether it is compatible with what we know already! */
2692 int compat = 1;
2693
1aa99e6b 2694 if (uc >= 0x100 ||
516a5887 2695 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2696 && !ANYOF_BITMAP_TEST(data->start_class, uc)
653099ff 2697 && (!(data->start_class->flags & ANYOF_FOLD)
1aa99e6b 2698 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
516a5887 2699 )
653099ff
GS
2700 compat = 0;
2701 ANYOF_CLASS_ZERO(data->start_class);
2702 ANYOF_BITMAP_ZERO(data->start_class);
2703 if (compat)
1aa99e6b 2704 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2705 data->start_class->flags &= ~ANYOF_EOS;
9b877dbb
IH
2706 if (uc < 0x100)
2707 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
653099ff
GS
2708 }
2709 else if (flags & SCF_DO_STCLASS_OR) {
2710 /* false positive possible if the class is case-folded */
1aa99e6b 2711 if (uc < 0x100)
9b877dbb
IH
2712 ANYOF_BITMAP_SET(data->start_class, uc);
2713 else
2714 data->start_class->flags |= ANYOF_UNICODE_ALL;
653099ff 2715 data->start_class->flags &= ~ANYOF_EOS;
40d049e4 2716 cl_and(data->start_class, and_withp);
653099ff
GS
2717 }
2718 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2719 }
3dab1dad 2720 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
cd439c50 2721 I32 l = STR_LEN(scan);
1aa99e6b 2722 UV uc = *((U8*)STRING(scan));
653099ff
GS
2723
2724 /* Search for fixed substrings supports EXACT only. */
ecaa9b9c
NC
2725 if (flags & SCF_DO_SUBSTR) {
2726 assert(data);
1de06328 2727 scan_commit(pRExC_state, data, minlenp);
ecaa9b9c 2728 }
a0ed51b3 2729 if (UTF) {
6136c704 2730 const U8 * const s = (U8 *)STRING(scan);
1aa99e6b 2731 l = utf8_length(s, s + l);
9041c2e3 2732 uc = utf8_to_uvchr(s, NULL);
a0ed51b3
LW
2733 }
2734 min += l;
ecaa9b9c 2735 if (flags & SCF_DO_SUBSTR)
a0ed51b3 2736 data->pos_min += l;
653099ff
GS
2737 if (flags & SCF_DO_STCLASS_AND) {
2738 /* Check whether it is compatible with what we know already! */
2739 int compat = 1;
2740
1aa99e6b 2741 if (uc >= 0x100 ||
516a5887 2742 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1aa99e6b 2743 && !ANYOF_BITMAP_TEST(data->start_class, uc)
516a5887 2744 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
653099ff
GS
2745 compat = 0;
2746 ANYOF_CLASS_ZERO(data->start_class);
2747 ANYOF_BITMAP_ZERO(data->start_class);
2748 if (compat) {
1aa99e6b 2749 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2750 data->start_class->flags &= ~ANYOF_EOS;
2751 data->start_class->flags |= ANYOF_FOLD;
2752 if (OP(scan) == EXACTFL)
2753 data->start_class->flags |= ANYOF_LOCALE;
2754 }
2755 }
2756 else if (flags & SCF_DO_STCLASS_OR) {
2757 if (data->start_class->flags & ANYOF_FOLD) {
2758 /* false positive possible if the class is case-folded.
2759 Assume that the locale settings are the same... */
1aa99e6b
IH
2760 if (uc < 0x100)
2761 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2762 data->start_class->flags &= ~ANYOF_EOS;
2763 }
40d049e4 2764 cl_and(data->start_class, and_withp);
653099ff
GS
2765 }
2766 flags &= ~SCF_DO_STCLASS;
a0ed51b3 2767 }
bfed75c6 2768 else if (strchr((const char*)PL_varies,OP(scan))) {
9c5ffd7c 2769 I32 mincount, maxcount, minnext, deltanext, fl = 0;
aa7a4b56 2770 I32 f = flags, pos_before = 0;
d4c19fe8 2771 regnode * const oscan = scan;
653099ff
GS
2772 struct regnode_charclass_class this_class;
2773 struct regnode_charclass_class *oclass = NULL;
727f22e3 2774 I32 next_is_eval = 0;
653099ff 2775
3dab1dad 2776 switch (PL_regkind[OP(scan)]) {
653099ff 2777 case WHILEM: /* End of (?:...)* . */
c277df42
IZ
2778 scan = NEXTOPER(scan);
2779 goto finish;
2780 case PLUS:
653099ff 2781 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
c277df42 2782 next = NEXTOPER(scan);
653099ff 2783 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
b81d288d
AB
2784 mincount = 1;
2785 maxcount = REG_INFTY;
c277df42
IZ
2786 next = regnext(scan);
2787 scan = NEXTOPER(scan);
2788 goto do_curly;
2789 }
2790 }
2791 if (flags & SCF_DO_SUBSTR)
2792 data->pos_min++;
2793 min++;
2794 /* Fall through. */
2795 case STAR:
653099ff
GS
2796 if (flags & SCF_DO_STCLASS) {
2797 mincount = 0;
b81d288d 2798 maxcount = REG_INFTY;
653099ff
GS
2799 next = regnext(scan);
2800 scan = NEXTOPER(scan);
2801 goto do_curly;
2802 }
b81d288d 2803 is_inf = is_inf_internal = 1;
c277df42
IZ
2804 scan = regnext(scan);
2805 if (flags & SCF_DO_SUBSTR) {
1de06328 2806 scan_commit(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2807 data->longest = &(data->longest_float);
2808 }
2809 goto optimize_curly_tail;
2810 case CURLY:
40d049e4
YO
2811 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2812 && (scan->flags == stopparen))
2813 {
2814 mincount = 1;
2815 maxcount = 1;
2816 } else {
2817 mincount = ARG1(scan);
2818 maxcount = ARG2(scan);
2819 }
c277df42 2820 next = regnext(scan);
cb434fcc
IZ
2821 if (OP(scan) == CURLYX) {
2822 I32 lp = (data ? *(data->last_closep) : 0);
786e8c11 2823 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
cb434fcc 2824 }
c277df42 2825 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
727f22e3 2826 next_is_eval = (OP(scan) == EVAL);
c277df42
IZ
2827 do_curly:
2828 if (flags & SCF_DO_SUBSTR) {
1de06328 2829 if (mincount == 0) scan_commit(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
c277df42
IZ
2830 pos_before = data->pos_min;
2831 }
2832 if (data) {
2833 fl = data->flags;
2834 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2835 if (is_inf)
2836 data->flags |= SF_IS_INF;
2837 }
653099ff 2838 if (flags & SCF_DO_STCLASS) {
830247a4 2839 cl_init(pRExC_state, &this_class);
653099ff
GS
2840 oclass = data->start_class;
2841 data->start_class = &this_class;
2842 f |= SCF_DO_STCLASS_AND;
2843 f &= ~SCF_DO_STCLASS_OR;
2844 }
e1901655
IZ
2845 /* These are the cases when once a subexpression
2846 fails at a particular position, it cannot succeed
2847 even after backtracking at the enclosing scope.
b81d288d 2848
e1901655
IZ
2849 XXXX what if minimal match and we are at the
2850 initial run of {n,m}? */
2851 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
2852 f &= ~SCF_WHILEM_VISITED_POS;
653099ff 2853
c277df42 2854 /* This will finish on WHILEM, setting scan, or on NULL: */
40d049e4
YO
2855 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2856 last, data, stopparen, recursed, NULL,
a3621e74
YO
2857 (mincount == 0
2858 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
653099ff
GS
2859
2860 if (flags & SCF_DO_STCLASS)
2861 data->start_class = oclass;
2862 if (mincount == 0 || minnext == 0) {
2863 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2864 cl_or(pRExC_state, data->start_class, &this_class);
653099ff
GS
2865 }
2866 else if (flags & SCF_DO_STCLASS_AND) {
b81d288d 2867 /* Switch to OR mode: cache the old value of
653099ff 2868 * data->start_class */
40d049e4
YO
2869 INIT_AND_WITHP;
2870 StructCopy(data->start_class, and_withp,
653099ff
GS
2871 struct regnode_charclass_class);
2872 flags &= ~SCF_DO_STCLASS_AND;
2873 StructCopy(&this_class, data->start_class,
2874 struct regnode_charclass_class);
2875 flags |= SCF_DO_STCLASS_OR;
2876 data->start_class->flags |= ANYOF_EOS;
2877 }
2878 } else { /* Non-zero len */
2879 if (flags & SCF_DO_STCLASS_OR) {
830247a4 2880 cl_or(pRExC_state, data->start_class, &this_class);
40d049e4 2881 cl_and(data->start_class, and_withp);
653099ff
GS
2882 }
2883 else if (flags & SCF_DO_STCLASS_AND)
2884 cl_and(data->start_class, &this_class);
2885 flags &= ~SCF_DO_STCLASS;
2886 }
c277df42
IZ
2887 if (!scan) /* It was not CURLYX, but CURLY. */
2888 scan = next;
041457d9
DM
2889 if ( /* ? quantifier ok, except for (?{ ... }) */
2890 (next_is_eval || !(mincount == 0 && maxcount == 1))
84037bb0 2891 && (minnext == 0) && (deltanext == 0)
99799961 2892 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
041457d9
DM
2893 && maxcount <= REG_INFTY/3 /* Complement check for big count */
2894 && ckWARN(WARN_REGEXP))
b45f050a 2895 {
830247a4 2896 vWARN(RExC_parse,
b45f050a
JF
2897 "Quantifier unexpected on zero-length expression");
2898 }
2899
c277df42 2900 min += minnext * mincount;
b81d288d 2901 is_inf_internal |= ((maxcount == REG_INFTY
155aba94
GS
2902 && (minnext + deltanext) > 0)
2903 || deltanext == I32_MAX);
aca2d497 2904 is_inf |= is_inf_internal;
c277df42
IZ
2905 delta += (minnext + deltanext) * maxcount - minnext * mincount;
2906
2907 /* Try powerful optimization CURLYX => CURLYN. */
b81d288d 2908 if ( OP(oscan) == CURLYX && data
c277df42
IZ
2909 && data->flags & SF_IN_PAR
2910 && !(data->flags & SF_HAS_EVAL)
2911 && !deltanext && minnext == 1 ) {
2912 /* Try to optimize to CURLYN. */
2913 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
d4c19fe8 2914 regnode * const nxt1 = nxt;
497b47a8
JH
2915#ifdef DEBUGGING
2916 regnode *nxt2;
2917#endif
c277df42
IZ
2918
2919 /* Skip open. */
2920 nxt = regnext(nxt);
bfed75c6 2921 if (!strchr((const char*)PL_simple,OP(nxt))
3dab1dad 2922 && !(PL_regkind[OP(nxt)] == EXACT
b81d288d 2923 && STR_LEN(nxt) == 1))
c277df42 2924 goto nogo;
497b47a8 2925#ifdef DEBUGGING
c277df42 2926 nxt2 = nxt;
497b47a8 2927#endif
c277df42 2928 nxt = regnext(nxt);
b81d288d 2929 if (OP(nxt) != CLOSE)
c277df42 2930 goto nogo;
40d049e4
YO
2931 if (RExC_open_parens) {
2932 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2933 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
2934 }
c277df42 2935 /* Now we know that nxt2 is the only contents: */
eb160463 2936 oscan->flags = (U8)ARG(nxt);
c277df42
IZ
2937 OP(oscan) = CURLYN;
2938 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 2939
c277df42
IZ
2940#ifdef DEBUGGING
2941 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2942 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
2943 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
2944 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2945 OP(nxt + 1) = OPTIMIZED; /* was count. */
2946 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 2947#endif
c277df42 2948 }
c277df42
IZ
2949 nogo:
2950
2951 /* Try optimization CURLYX => CURLYM. */
b81d288d 2952 if ( OP(oscan) == CURLYX && data
c277df42 2953 && !(data->flags & SF_HAS_PAR)
c277df42 2954 && !(data->flags & SF_HAS_EVAL)
0e788c72
HS
2955 && !deltanext /* atom is fixed width */
2956 && minnext != 0 /* CURLYM can't handle zero width */
2957 ) {
c277df42
IZ
2958 /* XXXX How to optimize if data == 0? */
2959 /* Optimize to a simpler form. */
2960 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
2961 regnode *nxt2;
2962
2963 OP(oscan) = CURLYM;
2964 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
b81d288d 2965 && (OP(nxt2) != WHILEM))
c277df42
IZ
2966 nxt = nxt2;
2967 OP(nxt2) = SUCCEED; /* Whas WHILEM */
c277df42
IZ
2968 /* Need to optimize away parenths. */
2969 if (data->flags & SF_IN_PAR) {
2970 /* Set the parenth number. */
2971 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
2972
b81d288d 2973 if (OP(nxt) != CLOSE)
b45f050a 2974 FAIL("Panic opt close");
eb160463 2975 oscan->flags = (U8)ARG(nxt);
40d049e4
YO
2976 if (RExC_open_parens) {
2977 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
2978 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
2979 }
c277df42
IZ
2980 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2981 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 2982
c277df42
IZ
2983#ifdef DEBUGGING
2984 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2985 OP(nxt + 1) = OPTIMIZED; /* was count. */
2986 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
2987 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 2988#endif
c277df42
IZ
2989#if 0
2990 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2991 regnode *nnxt = regnext(nxt1);
b81d288d 2992
c277df42
IZ
2993 if (nnxt == nxt) {
2994 if (reg_off_by_arg[OP(nxt1)])
2995 ARG_SET(nxt1, nxt2 - nxt1);
2996 else if (nxt2 - nxt1 < U16_MAX)
2997 NEXT_OFF(nxt1) = nxt2 - nxt1;
2998 else
2999 OP(nxt) = NOTHING; /* Cannot beautify */
3000 }
3001 nxt1 = nnxt;
3002 }
3003#endif
3004 /* Optimize again: */
1de06328 3005 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
40d049e4 3006 NULL, stopparen, recursed, NULL, 0,depth+1);
a0ed51b3
LW
3007 }
3008 else
c277df42 3009 oscan->flags = 0;
c277df42 3010 }
e1901655
IZ
3011 else if ((OP(oscan) == CURLYX)
3012 && (flags & SCF_WHILEM_VISITED_POS)
3013 /* See the comment on a similar expression above.
3014 However, this time it not a subexpression
3015 we care about, but the expression itself. */
3016 && (maxcount == REG_INFTY)
3017 && data && ++data->whilem_c < 16) {
3018 /* This stays as CURLYX, we can put the count/of pair. */
2c2d71f5
JH
3019 /* Find WHILEM (as in regexec.c) */
3020 regnode *nxt = oscan + NEXT_OFF(oscan);
3021
3022 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3023 nxt += ARG(nxt);
eb160463
GS
3024 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3025 | (RExC_whilem_seen << 4)); /* On WHILEM */
2c2d71f5 3026 }
b81d288d 3027 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
c277df42
IZ
3028 pars++;
3029 if (flags & SCF_DO_SUBSTR) {
c445ea15 3030 SV *last_str = NULL;
c277df42
IZ
3031 int counted = mincount != 0;
3032
3033 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
5d1c421c
JH
3034#if defined(SPARC64_GCC_WORKAROUND)
3035 I32 b = 0;
3036 STRLEN l = 0;
cfd0369c 3037 const char *s = NULL;
5d1c421c
JH
3038 I32 old = 0;
3039
3040 if (pos_before >= data->last_start_min)
3041 b = pos_before;
3042 else
3043 b = data->last_start_min;
3044
3045 l = 0;
cfd0369c 3046 s = SvPV_const(data->last_found, l);
5d1c421c
JH
3047 old = b - data->last_start_min;
3048
3049#else
b81d288d 3050 I32 b = pos_before >= data->last_start_min
c277df42
IZ
3051 ? pos_before : data->last_start_min;
3052 STRLEN l;
d4c19fe8 3053 const char * const s = SvPV_const(data->last_found, l);
a0ed51b3 3054 I32 old = b - data->last_start_min;
5d1c421c 3055#endif
a0ed51b3
LW
3056
3057 if (UTF)
3058 old = utf8_hop((U8*)s, old) - (U8*)s;
c277df42 3059
a0ed51b3 3060 l -= old;
c277df42 3061 /* Get the added string: */
79cb57f6 3062 last_str = newSVpvn(s + old, l);
0e933229
IH
3063 if (UTF)
3064 SvUTF8_on(last_str);
c277df42
IZ
3065 if (deltanext == 0 && pos_before == b) {
3066 /* What was added is a constant string */
3067 if (mincount > 1) {
3068 SvGROW(last_str, (mincount * l) + 1);
b81d288d 3069 repeatcpy(SvPVX(last_str) + l,
3f7c398e 3070 SvPVX_const(last_str), l, mincount - 1);
b162af07 3071 SvCUR_set(last_str, SvCUR(last_str) * mincount);
c277df42 3072 /* Add additional parts. */
b81d288d 3073 SvCUR_set(data->last_found,
c277df42
IZ
3074 SvCUR(data->last_found) - l);
3075 sv_catsv(data->last_found, last_str);
0eda9292
JH
3076 {
3077 SV * sv = data->last_found;
3078 MAGIC *mg =
3079 SvUTF8(sv) && SvMAGICAL(sv) ?
3080 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3081 if (mg && mg->mg_len >= 0)
3082 mg->mg_len += CHR_SVLEN(last_str);
3083 }
c277df42
IZ
3084 data->last_end += l * (mincount - 1);
3085 }
2a8d9689
HS
3086 } else {
3087 /* start offset must point into the last copy */
3088 data->last_start_min += minnext * (mincount - 1);
c152dc43
HS
3089 data->last_start_max += is_inf ? I32_MAX
3090 : (maxcount - 1) * (minnext + data->pos_delta);
c277df42
IZ
3091 }
3092 }
3093 /* It is counted once already... */
3094 data->pos_min += minnext * (mincount - counted);
3095 data->pos_delta += - counted * deltanext +
3096 (minnext + deltanext) * maxcount - minnext * mincount;
3097 if (mincount != maxcount) {
653099ff
GS
3098 /* Cannot extend fixed substrings found inside
3099 the group. */
1de06328 3100 scan_commit(pRExC_state,data,minlenp);
c277df42 3101 if (mincount && last_str) {
d4c19fe8
AL
3102 SV * const sv = data->last_found;
3103 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
45f47268
NC
3104 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3105
3106 if (mg)
3107 mg->mg_len = -1;
3108 sv_setsv(sv, last_str);
c277df42 3109 data->last_end = data->pos_min;
b81d288d 3110 data->last_start_min =
a0ed51b3 3111 data->pos_min - CHR_SVLEN(last_str);
b81d288d
AB
3112 data->last_start_max = is_inf
3113 ? I32_MAX
c277df42 3114 : data->pos_min + data->pos_delta
a0ed51b3 3115 - CHR_SVLEN(last_str);
c277df42
IZ
3116 }
3117 data->longest = &(data->longest_float);
3118 }
aca2d497 3119 SvREFCNT_dec(last_str);
c277df42 3120 }
405ff068 3121 if (data && (fl & SF_HAS_EVAL))
c277df42
IZ
3122 data->flags |= SF_HAS_EVAL;
3123 optimize_curly_tail:
c277df42 3124 if (OP(oscan) != CURLYX) {
3dab1dad 3125 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
c277df42
IZ
3126 && NEXT_OFF(next))
3127 NEXT_OFF(oscan) += NEXT_OFF(next);
3128 }
c277df42 3129 continue;
653099ff 3130 default: /* REF and CLUMP only? */
c277df42 3131 if (flags & SCF_DO_SUBSTR) {
1de06328 3132 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
c277df42
IZ
3133 data->longest = &(data->longest_float);
3134 }
aca2d497 3135 is_inf = is_inf_internal = 1;
653099ff 3136 if (flags & SCF_DO_STCLASS_OR)
830247a4 3137 cl_anything(pRExC_state, data->start_class);
653099ff 3138 flags &= ~SCF_DO_STCLASS;
c277df42
IZ
3139 break;
3140 }
a0ed51b3 3141 }
bfed75c6 3142 else if (strchr((const char*)PL_simple,OP(scan))) {
9c5ffd7c 3143 int value = 0;
653099ff 3144
c277df42 3145 if (flags & SCF_DO_SUBSTR) {
1de06328 3146 scan_commit(pRExC_state,data,minlenp);
c277df42
IZ
3147 data->pos_min++;
3148 }
3149 min++;
653099ff
GS
3150 if (flags & SCF_DO_STCLASS) {
3151 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3152
3153 /* Some of the logic below assumes that switching
3154 locale on will only add false positives. */
3dab1dad 3155 switch (PL_regkind[OP(scan)]) {
653099ff 3156 case SANY:
653099ff
GS
3157 default:
3158 do_default:
3159 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3160 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3161 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3162 break;
3163 case REG_ANY:
3164 if (OP(scan) == SANY)
3165 goto do_default;
3166 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3167 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3168 || (data->start_class->flags & ANYOF_CLASS));
830247a4 3169 cl_anything(pRExC_state, data->start_class);
653099ff
GS
3170 }
3171 if (flags & SCF_DO_STCLASS_AND || !value)
3172 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3173 break;
3174 case ANYOF:
3175 if (flags & SCF_DO_STCLASS_AND)
3176 cl_and(data->start_class,
3177 (struct regnode_charclass_class*)scan);
3178 else
830247a4 3179 cl_or(pRExC_state, data->start_class,
653099ff
GS
3180 (struct regnode_charclass_class*)scan);
3181 break;
3182 case ALNUM:
3183 if (flags & SCF_DO_STCLASS_AND) {
3184 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3185 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3186 for (value = 0; value < 256; value++)
3187 if (!isALNUM(value))
3188 ANYOF_BITMAP_CLEAR(data->start_class, value);
3189 }
3190 }
3191 else {
3192 if (data->start_class->flags & ANYOF_LOCALE)
3193 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3194 else {
3195 for (value = 0; value < 256; value++)
3196 if (isALNUM(value))
b81d288d 3197 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3198 }
3199 }
3200 break;
3201 case ALNUML:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 if (data->start_class->flags & ANYOF_LOCALE)
3204 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3205 }
3206 else {
3207 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3208 data->start_class->flags |= ANYOF_LOCALE;
3209 }
3210 break;
3211 case NALNUM:
3212 if (flags & SCF_DO_STCLASS_AND) {
3213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3215 for (value = 0; value < 256; value++)
3216 if (isALNUM(value))
3217 ANYOF_BITMAP_CLEAR(data->start_class, value);
3218 }
3219 }
3220 else {
3221 if (data->start_class->flags & ANYOF_LOCALE)
3222 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3223 else {
3224 for (value = 0; value < 256; value++)
3225 if (!isALNUM(value))
b81d288d 3226 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3227 }
3228 }
3229 break;
3230 case NALNUML:
3231 if (flags & SCF_DO_STCLASS_AND) {
3232 if (data->start_class->flags & ANYOF_LOCALE)
3233 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3234 }
3235 else {
3236 data->start_class->flags |= ANYOF_LOCALE;
3237 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3238 }
3239 break;
3240 case SPACE:
3241 if (flags & SCF_DO_STCLASS_AND) {
3242 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3243 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3244 for (value = 0; value < 256; value++)
3245 if (!isSPACE(value))
3246 ANYOF_BITMAP_CLEAR(data->start_class, value);
3247 }
3248 }
3249 else {
3250 if (data->start_class->flags & ANYOF_LOCALE)
3251 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3252 else {
3253 for (value = 0; value < 256; value++)
3254 if (isSPACE(value))
b81d288d 3255 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3256 }
3257 }
3258 break;
3259 case SPACEL:
3260 if (flags & SCF_DO_STCLASS_AND) {
3261 if (data->start_class->flags & ANYOF_LOCALE)
3262 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3263 }
3264 else {
3265 data->start_class->flags |= ANYOF_LOCALE;
3266 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3267 }
3268 break;
3269 case NSPACE:
3270 if (flags & SCF_DO_STCLASS_AND) {
3271 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3272 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3273 for (value = 0; value < 256; value++)
3274 if (isSPACE(value))
3275 ANYOF_BITMAP_CLEAR(data->start_class, value);
3276 }
3277 }
3278 else {
3279 if (data->start_class->flags & ANYOF_LOCALE)
3280 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3281 else {
3282 for (value = 0; value < 256; value++)
3283 if (!isSPACE(value))
b81d288d 3284 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3285 }
3286 }
3287 break;
3288 case NSPACEL:
3289 if (flags & SCF_DO_STCLASS_AND) {
3290 if (data->start_class->flags & ANYOF_LOCALE) {
3291 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3292 for (value = 0; value < 256; value++)
3293 if (!isSPACE(value))
3294 ANYOF_BITMAP_CLEAR(data->start_class, value);
3295 }
3296 }
3297 else {
3298 data->start_class->flags |= ANYOF_LOCALE;
3299 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3300 }
3301 break;
3302 case DIGIT:
3303 if (flags & SCF_DO_STCLASS_AND) {
3304 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3305 for (value = 0; value < 256; value++)
3306 if (!isDIGIT(value))
3307 ANYOF_BITMAP_CLEAR(data->start_class, value);
3308 }
3309 else {
3310 if (data->start_class->flags & ANYOF_LOCALE)
3311 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3312 else {
3313 for (value = 0; value < 256; value++)
3314 if (isDIGIT(value))
b81d288d 3315 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3316 }
3317 }
3318 break;
3319 case NDIGIT:
3320 if (flags & SCF_DO_STCLASS_AND) {
3321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3322 for (value = 0; value < 256; value++)
3323 if (isDIGIT(value))
3324 ANYOF_BITMAP_CLEAR(data->start_class, value);
3325 }
3326 else {
3327 if (data->start_class->flags & ANYOF_LOCALE)
3328 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3329 else {
3330 for (value = 0; value < 256; value++)
3331 if (!isDIGIT(value))
b81d288d 3332 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3333 }
3334 }
3335 break;
3336 }
3337 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3338 cl_and(data->start_class, and_withp);
653099ff
GS
3339 flags &= ~SCF_DO_STCLASS;
3340 }
a0ed51b3 3341 }
3dab1dad 3342 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
c277df42
IZ
3343 data->flags |= (OP(scan) == MEOL
3344 ? SF_BEFORE_MEOL
3345 : SF_BEFORE_SEOL);
a0ed51b3 3346 }
3dab1dad 3347 else if ( PL_regkind[OP(scan)] == BRANCHJ
653099ff
GS
3348 /* Lookbehind, or need to calculate parens/evals/stclass: */
3349 && (scan->flags || data || (flags & SCF_DO_STCLASS))
c277df42 3350 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1de06328
YO
3351 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3352 || OP(scan) == UNLESSM )
3353 {
3354 /* Negative Lookahead/lookbehind
3355 In this case we can't do fixed string optimisation.
3356 */
3357
3358 I32 deltanext, minnext, fake = 0;
3359 regnode *nscan;
3360 struct regnode_charclass_class intrnl;
3361 int f = 0;
3362
3363 data_fake.flags = 0;
3364 if (data) {
3365 data_fake.whilem_c = data->whilem_c;
3366 data_fake.last_closep = data->last_closep;
a0ed51b3 3367 }
1de06328
YO
3368 else
3369 data_fake.last_closep = &fake;
3370 if ( flags & SCF_DO_STCLASS && !scan->flags
3371 && OP(scan) == IFMATCH ) { /* Lookahead */
3372 cl_init(pRExC_state, &intrnl);
3373 data_fake.start_class = &intrnl;
3374 f |= SCF_DO_STCLASS_AND;
c277df42 3375 }
1de06328
YO
3376 if (flags & SCF_WHILEM_VISITED_POS)
3377 f |= SCF_WHILEM_VISITED_POS;
3378 next = regnext(scan);
3379 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3380 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3381 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
1de06328
YO
3382 if (scan->flags) {
3383 if (deltanext) {
3384 vFAIL("Variable length lookbehind not implemented");
3385 }
3386 else if (minnext > (I32)U8_MAX) {
3387 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3388 }
3389 scan->flags = (U8)minnext;
3390 }
3391 if (data) {
3392 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3393 pars++;
3394 if (data_fake.flags & SF_HAS_EVAL)
3395 data->flags |= SF_HAS_EVAL;
3396 data->whilem_c = data_fake.whilem_c;
3397 }
3398 if (f & SCF_DO_STCLASS_AND) {
3399 const int was = (data->start_class->flags & ANYOF_EOS);
3400
3401 cl_and(data->start_class, &intrnl);
3402 if (was)
3403 data->start_class->flags |= ANYOF_EOS;
3404 }
be8e71aa 3405 }
1de06328
YO
3406#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3407 else {
3408 /* Positive Lookahead/lookbehind
3409 In this case we can do fixed string optimisation,
3410 but we must be careful about it. Note in the case of
3411 lookbehind the positions will be offset by the minimum
3412 length of the pattern, something we won't know about
3413 until after the recurse.
3414 */
3415 I32 deltanext, fake = 0;
3416 regnode *nscan;
3417 struct regnode_charclass_class intrnl;
3418 int f = 0;
3419 /* We use SAVEFREEPV so that when the full compile
3420 is finished perl will clean up the allocated
3421 minlens when its all done. This was we don't
3422 have to worry about freeing them when we know
3423 they wont be used, which would be a pain.
3424 */
3425 I32 *minnextp;
3426 Newx( minnextp, 1, I32 );
3427 SAVEFREEPV(minnextp);
3428
3429 if (data) {
3430 StructCopy(data, &data_fake, scan_data_t);
3431 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3432 f |= SCF_DO_SUBSTR;
3433 if (scan->flags)
3434 scan_commit(pRExC_state, &data_fake,minlenp);
3435 data_fake.last_found=newSVsv(data->last_found);
3436 }
3437 }
3438 else
3439 data_fake.last_closep = &fake;
3440 data_fake.flags = 0;
3441 if (is_inf)
3442 data_fake.flags |= SF_IS_INF;
3443 if ( flags & SCF_DO_STCLASS && !scan->flags
3444 && OP(scan) == IFMATCH ) { /* Lookahead */
3445 cl_init(pRExC_state, &intrnl);
3446 data_fake.start_class = &intrnl;
3447 f |= SCF_DO_STCLASS_AND;
3448 }
3449 if (flags & SCF_WHILEM_VISITED_POS)
3450 f |= SCF_WHILEM_VISITED_POS;
3451 next = regnext(scan);
3452 nscan = NEXTOPER(NEXTOPER(scan));
40d049e4
YO
3453
3454 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3455 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
1de06328
YO
3456 if (scan->flags) {
3457 if (deltanext) {
3458 vFAIL("Variable length lookbehind not implemented");
3459 }
3460 else if (*minnextp > (I32)U8_MAX) {
3461 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3462 }
3463 scan->flags = (U8)*minnextp;
3464 }
40d049e4 3465
1de06328 3466 *minnextp += min;
40d049e4 3467
1de06328
YO
3468 if (f & SCF_DO_STCLASS_AND) {
3469 const int was = (data->start_class->flags & ANYOF_EOS);
3470
3471 cl_and(data->start_class, &intrnl);
3472 if (was)
3473 data->start_class->flags |= ANYOF_EOS;
40d049e4 3474 }
1de06328
YO
3475 if (data) {
3476 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3477 pars++;
3478 if (data_fake.flags & SF_HAS_EVAL)
3479 data->flags |= SF_HAS_EVAL;
3480 data->whilem_c = data_fake.whilem_c;
3481 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3482 if (RExC_rx->minlen<*minnextp)
3483 RExC_rx->minlen=*minnextp;
3484 scan_commit(pRExC_state, &data_fake, minnextp);
3485 SvREFCNT_dec(data_fake.last_found);
3486
3487 if ( data_fake.minlen_fixed != minlenp )
3488 {
3489 data->offset_fixed= data_fake.offset_fixed;
3490 data->minlen_fixed= data_fake.minlen_fixed;
3491 data->lookbehind_fixed+= scan->flags;
3492 }
3493 if ( data_fake.minlen_float != minlenp )
3494 {
3495 data->minlen_float= data_fake.minlen_float;
3496 data->offset_float_min=data_fake.offset_float_min;
3497 data->offset_float_max=data_fake.offset_float_max;
3498 data->lookbehind_float+= scan->flags;
3499 }
3500 }
3501 }
3502
653099ff 3503
653099ff 3504 }
1de06328 3505#endif
a0ed51b3
LW
3506 }
3507 else if (OP(scan) == OPEN) {
40d049e4
YO
3508 if (stopparen != (I32)ARG(scan))
3509 pars++;
a0ed51b3 3510 }
cb434fcc 3511 else if (OP(scan) == CLOSE) {
40d049e4
YO
3512 if (stopparen == (I32)ARG(scan)) {
3513 break;
3514 }
eb160463 3515 if ((I32)ARG(scan) == is_par) {
cb434fcc 3516 next = regnext(scan);
c277df42 3517
cb434fcc
IZ
3518 if ( next && (OP(next) != WHILEM) && next < last)
3519 is_par = 0; /* Disable optimization */
3520 }
3521 if (data)
3522 *(data->last_closep) = ARG(scan);
a0ed51b3 3523 }
1a147d38 3524 else if (OP(scan) == GOSUB || OP(scan) == GOSTART) {
40d049e4
YO
3525 /* set the pointer */
3526 I32 paren;
3527 regnode *start;
3528 regnode *end;
1a147d38 3529 if (OP(scan) == GOSUB) {
40d049e4
YO
3530 paren = ARG(scan);
3531 RExC_recurse[ARG2L(scan)] = scan;
3532 start = RExC_open_parens[paren-1];
3533 end = RExC_close_parens[paren-1];
3534 } else {
3535 paren = 0;
3536 start = RExC_rx->program + 1;
3537 end = RExC_opend;
3538 }
3539 assert(start);
3540 assert(end);
3541 if (!recursed) {
3542 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3543 SAVEFREEPV(recursed);
3544 }
3545 if (!PAREN_TEST(recursed,paren+1)) {
3546 I32 deltanext = 0;
3547 PAREN_SET(recursed,paren+1);
3548
3549 DEBUG_PEEP("goto",start,depth);
3550 min += study_chunk(
3551 pRExC_state,
3552 &start,
3553 minlenp,
3554 &deltanext,
3555 end+1,
3556 data,
3557 paren,
3558 recursed,
3559 and_withp,
3560 flags,depth+1);
3561 delta+=deltanext;
3562 if (deltanext == I32_MAX) {
3563 is_inf = is_inf_internal = 1;
3564 delta=deltanext;
3565 }
3566 DEBUG_PEEP("rtrn",end,depth);
3567 PAREN_UNSET(recursed,paren+1);
3568 } else {
3569 if (flags & SCF_DO_SUBSTR) {
3570 scan_commit(pRExC_state,data,minlenp);
3571 data->longest = &(data->longest_float);
3572 }
3573 is_inf = is_inf_internal = 1;
3574 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3575 cl_anything(pRExC_state, data->start_class);
3576 flags &= ~SCF_DO_STCLASS;
3577 }
3578 }
a0ed51b3 3579 else if (OP(scan) == EVAL) {
c277df42
IZ
3580 if (data)
3581 data->flags |= SF_HAS_EVAL;
3582 }
40d049e4
YO
3583 else if ( OP(scan)==OPFAIL ) {
3584 if (flags & SCF_DO_SUBSTR) {
3585 scan_commit(pRExC_state,data,minlenp);
3586 flags &= ~SCF_DO_SUBSTR;
3587 }
3588 }
3589 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3590 {
0f5d15d6 3591 if (flags & SCF_DO_SUBSTR) {
1de06328 3592 scan_commit(pRExC_state,data,minlenp);
0f5d15d6
IZ
3593 data->longest = &(data->longest_float);
3594 }
3595 is_inf = is_inf_internal = 1;
653099ff 3596 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3597 cl_anything(pRExC_state, data->start_class);
96776eda 3598 flags &= ~SCF_DO_STCLASS;
0f5d15d6 3599 }
786e8c11 3600#ifdef TRIE_STUDY_OPT
40d049e4 3601#ifdef FULL_TRIE_STUDY
786e8c11
YO
3602 else if (PL_regkind[OP(scan)] == TRIE) {
3603 /* NOTE - There is similar code to this block above for handling
40d049e4 3604 BRANCH nodes on the initial study. If you change stuff here
786e8c11 3605 check there too. */
7f69552c 3606 regnode *trie_node= scan;
786e8c11
YO
3607 regnode *tail= regnext(scan);
3608 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3609 I32 max1 = 0, min1 = I32_MAX;
3610 struct regnode_charclass_class accum;
3611
3612 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
1de06328 3613 scan_commit(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
786e8c11
YO
3614 if (flags & SCF_DO_STCLASS)
3615 cl_init_zero(pRExC_state, &accum);
3616
3617 if (!trie->jump) {
3618 min1= trie->minlen;
3619 max1= trie->maxlen;
3620 } else {
3621 const regnode *nextbranch= NULL;
3622 U32 word;
3623
3624 for ( word=1 ; word <= trie->wordcount ; word++)
3625 {
3626 I32 deltanext=0, minnext=0, f = 0, fake;
3627 struct regnode_charclass_class this_class;
3628
3629 data_fake.flags = 0;
3630 if (data) {
3631 data_fake.whilem_c = data->whilem_c;
3632 data_fake.last_closep = data->last_closep;
3633 }
3634 else
3635 data_fake.last_closep = &fake;
3636
3637 if (flags & SCF_DO_STCLASS) {
3638 cl_init(pRExC_state, &this_class);
3639 data_fake.start_class = &this_class;
3640 f = SCF_DO_STCLASS_AND;
3641 }
3642 if (flags & SCF_WHILEM_VISITED_POS)
3643 f |= SCF_WHILEM_VISITED_POS;
3644
3645 if (trie->jump[word]) {
3646 if (!nextbranch)
7f69552c
YO
3647 nextbranch = trie_node + trie->jump[0];
3648 scan= trie_node + trie->jump[word];
786e8c11
YO
3649 /* We go from the jump point to the branch that follows
3650 it. Note this means we need the vestigal unused branches
3651 even though they arent otherwise used.
3652 */
40d049e4
YO
3653 minnext = study_chunk(pRExC_state, &scan, minlenp,
3654 &deltanext, (regnode *)nextbranch, &data_fake,
3655 stopparen, recursed, NULL, f,depth+1);
786e8c11
YO
3656 }
3657 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3658 nextbranch= regnext((regnode*)nextbranch);
3659
3660 if (min1 > (I32)(minnext + trie->minlen))
3661 min1 = minnext + trie->minlen;
3662 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3663 max1 = minnext + deltanext + trie->maxlen;
3664 if (deltanext == I32_MAX)
3665 is_inf = is_inf_internal = 1;
3666
3667 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3668 pars++;
3669
3670 if (data) {
3671 if (data_fake.flags & SF_HAS_EVAL)
3672 data->flags |= SF_HAS_EVAL;
3673 data->whilem_c = data_fake.whilem_c;
3674 }
3675 if (flags & SCF_DO_STCLASS)
3676 cl_or(pRExC_state, &accum, &this_class);
3677 }
3678 }
3679 if (flags & SCF_DO_SUBSTR) {
3680 data->pos_min += min1;
3681 data->pos_delta += max1 - min1;
3682 if (max1 != min1 || is_inf)
3683 data->longest = &(data->longest_float);
3684 }
3685 min += min1;
3686 delta += max1 - min1;
3687 if (flags & SCF_DO_STCLASS_OR) {
3688 cl_or(pRExC_state, data->start_class, &accum);
3689 if (min1) {
40d049e4 3690 cl_and(data->start_class, and_withp);
786e8c11
YO
3691 flags &= ~SCF_DO_STCLASS;
3692 }
3693 }
3694 else if (flags & SCF_DO_STCLASS_AND) {
3695 if (min1) {
3696 cl_and(data->start_class, &accum);
3697 flags &= ~SCF_DO_STCLASS;
3698 }
3699 else {
3700 /* Switch to OR mode: cache the old value of
3701 * data->start_class */
40d049e4
YO
3702 INIT_AND_WITHP;
3703 StructCopy(data->start_class, and_withp,
786e8c11
YO
3704 struct regnode_charclass_class);
3705 flags &= ~SCF_DO_STCLASS_AND;
3706 StructCopy(&accum, data->start_class,
3707 struct regnode_charclass_class);
3708 flags |= SCF_DO_STCLASS_OR;
3709 data->start_class->flags |= ANYOF_EOS;
3710 }
3711 }
3712 scan= tail;
3713 continue;
3714 }
3715#else
3716 else if (PL_regkind[OP(scan)] == TRIE) {
3717 reg_trie_data *trie = (reg_trie_data*)RExC_rx->data->data[ ARG(scan) ];
3718 U8*bang=NULL;
3719
3720 min += trie->minlen;
3721 delta += (trie->maxlen - trie->minlen);
3722 flags &= ~SCF_DO_STCLASS; /* xxx */
3723 if (flags & SCF_DO_SUBSTR) {
1de06328 3724 scan_commit(pRExC_state,data,minlenp); /* Cannot expect anything... */
786e8c11
YO
3725 data->pos_min += trie->minlen;
3726 data->pos_delta += (trie->maxlen - trie->minlen);
3727 if (trie->maxlen != trie->minlen)
3728 data->longest = &(data->longest_float);
3729 }
3730 if (trie->jump) /* no more substrings -- for now /grr*/
3731 flags &= ~SCF_DO_SUBSTR;
3732 }
3733#endif /* old or new */
3734#endif /* TRIE_STUDY_OPT */
c277df42
IZ
3735 /* Else: zero-length, ignore. */
3736 scan = regnext(scan);
3737 }
3738
3739 finish:
3740 *scanp = scan;
aca2d497 3741 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3742 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3743 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3744 if (is_par > (I32)U8_MAX)
c277df42
IZ
3745 is_par = 0;
3746 if (is_par && pars==1 && data) {
3747 data->flags |= SF_IN_PAR;
3748 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3749 }
3750 else if (pars && data) {
c277df42
IZ
3751 data->flags |= SF_HAS_PAR;
3752 data->flags &= ~SF_IN_PAR;
3753 }
653099ff 3754 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3755 cl_and(data->start_class, and_withp);
786e8c11
YO
3756 if (flags & SCF_TRIE_RESTUDY)
3757 data->flags |= SCF_TRIE_RESTUDY;
1de06328
YO
3758
3759 DEBUG_STUDYDATA(data,depth);
3760
c277df42
IZ
3761 return min;
3762}
3763
76e3520e 3764STATIC I32
5f66b61c 3765S_add_data(RExC_state_t *pRExC_state, I32 n, const char *s)
c277df42 3766{
830247a4 3767 if (RExC_rx->data) {
2eb97020 3768 const U32 count = RExC_rx->data->count;
b81d288d 3769 Renewc(RExC_rx->data,
2eb97020 3770 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
c277df42 3771 char, struct reg_data);
2eb97020 3772 Renew(RExC_rx->data->what, count + n, U8);
830247a4 3773 RExC_rx->data->count += n;
a0ed51b3
LW
3774 }
3775 else {
a02a5408 3776 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
c277df42 3777 char, struct reg_data);
a02a5408 3778 Newx(RExC_rx->data->what, n, U8);
830247a4 3779 RExC_rx->data->count = n;
c277df42 3780 }
830247a4
IZ
3781 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
3782 return RExC_rx->data->count - n;
c277df42
IZ
3783}
3784
76234dfb 3785#ifndef PERL_IN_XSUB_RE
d88dccdf 3786void
864dbfa3 3787Perl_reginitcolors(pTHX)
d88dccdf 3788{
97aff369 3789 dVAR;
1df70142 3790 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3791 if (s) {
1df70142
AL
3792 char *t = savepv(s);
3793 int i = 0;
3794 PL_colors[0] = t;
d88dccdf 3795 while (++i < 6) {
1df70142
AL
3796 t = strchr(t, '\t');
3797 if (t) {
3798 *t = '\0';
3799 PL_colors[i] = ++t;
d88dccdf
IZ
3800 }
3801 else
1df70142 3802 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3803 }
3804 } else {
1df70142 3805 int i = 0;
b81d288d 3806 while (i < 6)
06b5626a 3807 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3808 }
3809 PL_colorset = 1;
3810}
76234dfb 3811#endif
8615cb43 3812
07be1b83 3813
786e8c11
YO
3814#ifdef TRIE_STUDY_OPT
3815#define CHECK_RESTUDY_GOTO \
3816 if ( \
3817 (data.flags & SCF_TRIE_RESTUDY) \
3818 && ! restudied++ \
3819 ) goto reStudy
3820#else
3821#define CHECK_RESTUDY_GOTO
3822#endif
f9f4320a 3823
a687059c 3824/*
e50aee73 3825 - pregcomp - compile a regular expression into internal code
a687059c
LW
3826 *
3827 * We can't allocate space until we know how big the compiled form will be,
3828 * but we can't compile it (and thus know how big it is) until we've got a
3829 * place to put the code. So we cheat: we compile it twice, once with code
3830 * generation turned off and size counting turned on, and once "for real".
3831 * This also means that we don't allocate space until we are sure that the
3832 * thing really will compile successfully, and we never have to move the
3833 * code and thus invalidate pointers into it. (Note that it has to be in
3834 * one piece because free() must be able to free it all.) [NB: not true in perl]
3835 *
3836 * Beware that the optimization-preparation code in here knows about some
3837 * of the structure of the compiled regexp. [I'll say.]
3838 */
b9b4dddf
YO
3839
3840
3841
f9f4320a 3842#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3843#define RE_ENGINE_PTR &PL_core_reg_engine
3844#else
f9f4320a
YO
3845extern const struct regexp_engine my_reg_engine;
3846#define RE_ENGINE_PTR &my_reg_engine
3847#endif
b9b4dddf
YO
3848/* these make a few things look better, to avoid indentation */
3849#define BEGIN_BLOCK {
f9f4320a
YO
3850#define END_BLOCK }
3851
a687059c 3852regexp *
864dbfa3 3853Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 3854{
97aff369 3855 dVAR;
f9f4320a
YO
3856 GET_RE_DEBUG_FLAGS_DECL;
3857 DEBUG_r(if (!PL_colorset) reginitcolors());
b9b4dddf
YO
3858#ifndef PERL_IN_XSUB_RE
3859 BEGIN_BLOCK
f9f4320a
YO
3860 /* Dispatch a request to compile a regexp to correct
3861 regexp engine. */
3862 HV * const table = GvHV(PL_hintgv);
3863 if (table) {
3864 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
1e2e3d02 3865 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
3866 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
3867 DEBUG_COMPILE_r({
8d8756e7 3868 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
3869 SvIV(*ptr));
3870 });
f2f78491 3871 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 3872 }
b9b4dddf
YO
3873 }
3874 END_BLOCK
3875#endif
3876 BEGIN_BLOCK
a0d0e21e 3877 register regexp *r;
c277df42 3878 regnode *scan;
c277df42 3879 regnode *first;
a0d0e21e 3880 I32 flags;
a0d0e21e
LW
3881 I32 minlen = 0;
3882 I32 sawplus = 0;
3883 I32 sawopen = 0;
2c2d71f5 3884 scan_data_t data;
830247a4 3885 RExC_state_t RExC_state;
be8e71aa 3886 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
3887#ifdef TRIE_STUDY_OPT
3888 int restudied= 0;
3889 RExC_state_t copyRExC_state;
3890#endif
a0d0e21e 3891 if (exp == NULL)
c277df42 3892 FAIL("NULL regexp argument");
a0d0e21e 3893
a5961de5 3894 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 3895
5cfc7842 3896 RExC_precomp = exp;
a3621e74 3897 DEBUG_COMPILE_r({
ab3bbdeb
YO
3898 SV *dsv= sv_newmortal();
3899 RE_PV_QUOTED_DECL(s, RExC_utf8,
3900 dsv, RExC_precomp, (xend - exp), 60);
3901 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
3902 PL_colors[4],PL_colors[5],s);
a5961de5 3903 });
e2509266 3904 RExC_flags = pm->op_pmflags;
830247a4 3905 RExC_sawback = 0;
bbce6d69 3906
830247a4
IZ
3907 RExC_seen = 0;
3908 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
3909 RExC_seen_evals = 0;
3910 RExC_extralen = 0;
c277df42 3911
bbce6d69 3912 /* First pass: determine size, legality. */
830247a4 3913 RExC_parse = exp;
fac92740 3914 RExC_start = exp;
830247a4
IZ
3915 RExC_end = xend;
3916 RExC_naughty = 0;
3917 RExC_npar = 1;
3918 RExC_size = 0L;
3919 RExC_emit = &PL_regdummy;
3920 RExC_whilem_seen = 0;
fc8cd66c 3921 RExC_charnames = NULL;
40d049e4
YO
3922 RExC_open_parens = NULL;
3923 RExC_close_parens = NULL;
3924 RExC_opend = NULL;
81714fb9 3925 RExC_paren_names = NULL;
40d049e4
YO
3926 RExC_recurse = NULL;
3927 RExC_recurse_count = 0;
81714fb9 3928
85ddcde9
JH
3929#if 0 /* REGC() is (currently) a NOP at the first pass.
3930 * Clever compilers notice this and complain. --jhi */
830247a4 3931 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 3932#endif
3dab1dad
YO
3933 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
3934 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 3935 RExC_precomp = NULL;
a0d0e21e
LW
3936 return(NULL);
3937 }
07be1b83 3938 DEBUG_PARSE_r({
81714fb9
YO
3939 PerlIO_printf(Perl_debug_log,
3940 "Required size %"IVdf" nodes\n"
3941 "Starting second pass (creation)\n",
3942 (IV)RExC_size);
07be1b83
YO
3943 RExC_lastnum=0;
3944 RExC_lastparse=NULL;
3945 });
c277df42
IZ
3946 /* Small enough for pointer-storage convention?
3947 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
3948 if (RExC_size >= 0x10000L && RExC_extralen)
3949 RExC_size += RExC_extralen;
c277df42 3950 else
830247a4
IZ
3951 RExC_extralen = 0;
3952 if (RExC_whilem_seen > 15)
3953 RExC_whilem_seen = 15;
a0d0e21e 3954
f9f4320a
YO
3955 /* Allocate space and zero-initialize. Note, the two step process
3956 of zeroing when in debug mode, thus anything assigned has to
3957 happen after that */
a02a5408 3958 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
c277df42 3959 char, regexp);
a0d0e21e 3960 if (r == NULL)
b45f050a 3961 FAIL("Regexp out of space");
0f79a09d
GS
3962#ifdef DEBUGGING
3963 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
830247a4 3964 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
0f79a09d 3965#endif
f9f4320a
YO
3966 /* initialization begins here */
3967 r->engine= RE_ENGINE_PTR;
c277df42 3968 r->refcnt = 1;
bbce6d69 3969 r->prelen = xend - exp;
5cfc7842 3970 r->precomp = savepvn(RExC_precomp, r->prelen);
cf93c79d 3971 r->subbeg = NULL;
f8c7b90f 3972#ifdef PERL_OLD_COPY_ON_WRITE
c445ea15 3973 r->saved_copy = NULL;
ed252734 3974#endif
cf93c79d 3975 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
830247a4 3976 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
38d1b06f 3977 r->lastparen = 0; /* mg.c reads this. */
4327152a
IZ
3978
3979 r->substrs = 0; /* Useful during FAIL. */
3980 r->startp = 0; /* Useful during FAIL. */
6bda09f9 3981 r->endp = 0;
81714fb9
YO
3982 r->paren_names = 0;
3983
6bda09f9 3984 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
3985 Newxz(RExC_open_parens, RExC_npar,regnode *);
3986 SAVEFREEPV(RExC_open_parens);
3987 Newxz(RExC_close_parens,RExC_npar,regnode *);
3988 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
3989 }
3990
3991 /* Useful during FAIL. */
a02a5408 3992 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
fac92740 3993 if (r->offsets) {
2af232bd 3994 r->offsets[0] = RExC_size;
fac92740 3995 }
a3621e74 3996 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd
SS
3997 "%s %"UVuf" bytes for offset annotations.\n",
3998 r->offsets ? "Got" : "Couldn't get",
392fbf5d 3999 (UV)((2*RExC_size+1) * sizeof(U32))));
fac92740 4000
830247a4 4001 RExC_rx = r;
bbce6d69 4002
4003 /* Second pass: emit code. */
e2509266 4004 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4005 RExC_parse = exp;
4006 RExC_end = xend;
4007 RExC_naughty = 0;
4008 RExC_npar = 1;
fac92740 4009 RExC_emit_start = r->program;
830247a4 4010 RExC_emit = r->program;
2cd61cdb 4011 /* Store the count of eval-groups for security checks: */
786e8c11 4012 RExC_emit->next_off = (RExC_seen_evals > (I32)U16_MAX) ? U16_MAX : (U16)RExC_seen_evals;
830247a4 4013 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
c277df42 4014 r->data = 0;
3dab1dad 4015 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4016 return(NULL);
6bda09f9 4017
07be1b83
YO
4018 /* XXXX To minimize changes to RE engine we always allocate
4019 3-units-long substrs field. */
4020 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4021 if (RExC_recurse_count) {
4022 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4023 SAVEFREEPV(RExC_recurse);
4024 }
a0d0e21e 4025
07be1b83 4026reStudy:
1de06328 4027 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4028 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4029
07be1b83
YO
4030#ifdef TRIE_STUDY_OPT
4031 if ( restudied ) {
4032 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4033 RExC_state=copyRExC_state;
1de06328 4034 if (data.last_found) {
07be1b83 4035 SvREFCNT_dec(data.longest_fixed);
07be1b83 4036 SvREFCNT_dec(data.longest_float);
07be1b83 4037 SvREFCNT_dec(data.last_found);
1de06328 4038 }
40d049e4 4039 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4040 } else {
40d049e4 4041 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83
YO
4042 copyRExC_state=RExC_state;
4043 }
40d049e4
YO
4044#else
4045 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4046#endif
fc8cd66c 4047
a0d0e21e 4048 /* Dig out information for optimizations. */
cf93c79d 4049 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
e2509266 4050 pm->op_pmflags = RExC_flags;
a0ed51b3 4051 if (UTF)
5ff6fc6d 4052 r->reganch |= ROPT_UTF8; /* Unicode in it? */
c277df42 4053 r->regstclass = NULL;
830247a4 4054 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
a0ed51b3 4055 r->reganch |= ROPT_NAUGHTY;
c277df42 4056 scan = r->program + 1; /* First BRANCH. */
2779dcf1 4057
1de06328
YO
4058 /* testing for BRANCH here tells us whether there is "must appear"
4059 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4060 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4061 I32 fake;
c5254dd6 4062 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4063 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4064 int stclass_flag;
07be1b83 4065 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4066
4067 first = scan;
c277df42 4068 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4069 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4070 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4071 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4072 /* for now we can't handle lookbehind IFMATCH*/
4073 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4074 (OP(first) == PLUS) ||
4075 (OP(first) == MINMOD) ||
653099ff 4076 /* An {n,m} with n>0 */
07be1b83
YO
4077 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4078 {
786e8c11 4079
a0d0e21e
LW
4080 if (OP(first) == PLUS)
4081 sawplus = 1;
4082 else
3dab1dad 4083 first += regarglen[OP(first)];
07be1b83
YO
4084 if (OP(first) == IFMATCH) {
4085 first = NEXTOPER(first);
4086 first += EXTRA_STEP_2ARGS;
7c167cea 4087 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4088 first = NEXTOPER(first);
a687059c
LW
4089 }
4090
a0d0e21e
LW
4091 /* Starting-point info. */
4092 again:
786e8c11 4093 DEBUG_PEEP("first:",first,0);
07be1b83 4094 /* Ignore EXACT as we deal with it later. */
3dab1dad 4095 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4096 if (OP(first) == EXACT)
6f207bd3 4097 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4098 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
b3c9acc1
IZ
4099 r->regstclass = first;
4100 }
07be1b83 4101#ifdef TRIE_STCLASS
786e8c11 4102 else if (PL_regkind[OP(first)] == TRIE &&
07be1b83
YO
4103 ((reg_trie_data *)r->data->data[ ARG(first) ])->minlen>0)
4104 {
786e8c11 4105 regnode *trie_op;
07be1b83 4106 /* this can happen only on restudy */
786e8c11
YO
4107 if ( OP(first) == TRIE ) {
4108 struct regnode_1 *trieop;
4109 Newxz(trieop,1,struct regnode_1);
4110 StructCopy(first,trieop,struct regnode_1);
4111 trie_op=(regnode *)trieop;
4112 } else {
4113 struct regnode_charclass *trieop;
4114 Newxz(trieop,1,struct regnode_charclass);
4115 StructCopy(first,trieop,struct regnode_charclass);
4116 trie_op=(regnode *)trieop;
4117 }
1de06328 4118 OP(trie_op)+=2;
786e8c11
YO
4119 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4120 r->regstclass = trie_op;
07be1b83
YO
4121 }
4122#endif
bfed75c6 4123 else if (strchr((const char*)PL_simple,OP(first)))
a0d0e21e 4124 r->regstclass = first;
3dab1dad
YO
4125 else if (PL_regkind[OP(first)] == BOUND ||
4126 PL_regkind[OP(first)] == NBOUND)
a0d0e21e 4127 r->regstclass = first;
3dab1dad 4128 else if (PL_regkind[OP(first)] == BOL) {
cad2e5aa
JH
4129 r->reganch |= (OP(first) == MBOL
4130 ? ROPT_ANCH_MBOL
4131 : (OP(first) == SBOL
4132 ? ROPT_ANCH_SBOL
4133 : ROPT_ANCH_BOL));
a0d0e21e 4134 first = NEXTOPER(first);
774d564b 4135 goto again;
4136 }
4137 else if (OP(first) == GPOS) {
4138 r->reganch |= ROPT_ANCH_GPOS;
4139 first = NEXTOPER(first);
4140 goto again;
a0d0e21e 4141 }
e09294f4 4142 else if (!sawopen && (OP(first) == STAR &&
3dab1dad 4143 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
a0d0e21e
LW
4144 !(r->reganch & ROPT_ANCH) )
4145 {
4146 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4147 const int type =
4148 (OP(NEXTOPER(first)) == REG_ANY)
4149 ? ROPT_ANCH_MBOL
4150 : ROPT_ANCH_SBOL;
cad2e5aa 4151 r->reganch |= type | ROPT_IMPLICIT;
a0d0e21e 4152 first = NEXTOPER(first);
774d564b 4153 goto again;
a0d0e21e 4154 }
b81d288d 4155 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4156 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa
JH
4157 /* x+ must match at the 1st pos of run of x's */
4158 r->reganch |= ROPT_SKIP;
a0d0e21e 4159
c277df42 4160 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4161#ifdef TRIE_STUDY_OPT
81714fb9 4162 DEBUG_PARSE_r(
be8e71aa
YO
4163 if (!restudied)
4164 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4165 (IV)(first - scan + 1))
4166 );
4167#else
81714fb9 4168 DEBUG_PARSE_r(
be8e71aa
YO
4169 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4170 (IV)(first - scan + 1))
4171 );
4172#endif
4173
4174
a0d0e21e
LW
4175 /*
4176 * If there's something expensive in the r.e., find the
4177 * longest literal string that must appear and make it the
4178 * regmust. Resolve ties in favor of later strings, since
4179 * the regstart check works with the beginning of the r.e.
4180 * and avoiding duplication strengthens checking. Not a
4181 * strong reason, but sufficient in the absence of others.
4182 * [Now we resolve ties in favor of the earlier string if
c277df42 4183 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4184 * earlier string may buy us something the later one won't.]
4185 */
a0d0e21e 4186 minlen = 0;
a687059c 4187
396482e1
GA
4188 data.longest_fixed = newSVpvs("");
4189 data.longest_float = newSVpvs("");
4190 data.last_found = newSVpvs("");
c277df42
IZ
4191 data.longest = &(data.longest_fixed);
4192 first = scan;
653099ff 4193 if (!r->regstclass) {
830247a4 4194 cl_init(pRExC_state, &ch_class);
653099ff
GS
4195 data.start_class = &ch_class;
4196 stclass_flag = SCF_DO_STCLASS_AND;
4197 } else /* XXXX Check for BOUND? */
4198 stclass_flag = 0;
cb434fcc 4199 data.last_closep = &last_close;
653099ff 4200
1de06328 4201 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4202 &data, -1, NULL, NULL,
4203 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4204
07be1b83 4205
786e8c11
YO
4206 CHECK_RESTUDY_GOTO;
4207
4208
830247a4 4209 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4210 && data.last_start_min == 0 && data.last_end > 0
830247a4
IZ
4211 && !RExC_seen_zerolen
4212 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
c277df42 4213 r->reganch |= ROPT_CHECK_ALL;
1de06328 4214 scan_commit(pRExC_state, &data,&minlen);
c277df42
IZ
4215 SvREFCNT_dec(data.last_found);
4216
1de06328
YO
4217 /* Note that code very similar to this but for anchored string
4218 follows immediately below, changes may need to be made to both.
4219 Be careful.
4220 */
a0ed51b3 4221 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4222 if (longest_float_length
c277df42
IZ
4223 || (data.flags & SF_FL_BEFORE_EOL
4224 && (!(data.flags & SF_FL_BEFORE_MEOL)
1de06328
YO
4225 || (RExC_flags & PMf_MULTILINE))))
4226 {
1182767e 4227 I32 t,ml;
cf93c79d 4228
1de06328 4229 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4230 && data.offset_fixed == data.offset_float_min
4231 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4232 goto remove_float; /* As in (a)+. */
4233
1de06328
YO
4234 /* copy the information about the longest float from the reg_scan_data
4235 over to the program. */
33b8afdf
JH
4236 if (SvUTF8(data.longest_float)) {
4237 r->float_utf8 = data.longest_float;
c445ea15 4238 r->float_substr = NULL;
33b8afdf
JH
4239 } else {
4240 r->float_substr = data.longest_float;
c445ea15 4241 r->float_utf8 = NULL;
33b8afdf 4242 }
1de06328
YO
4243 /* float_end_shift is how many chars that must be matched that
4244 follow this item. We calculate it ahead of time as once the
4245 lookbehind offset is added in we lose the ability to correctly
4246 calculate it.*/
4247 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4248 : (I32)longest_float_length;
1de06328
YO
4249 r->float_end_shift = ml - data.offset_float_min
4250 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4251 + data.lookbehind_float;
4252 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4253 r->float_max_offset = data.offset_float_max;
1182767e 4254 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4255 r->float_max_offset -= data.lookbehind_float;
4256
cf93c79d
IZ
4257 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4258 && (!(data.flags & SF_FL_BEFORE_MEOL)
e2509266 4259 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4260 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4261 }
4262 else {
aca2d497 4263 remove_float:
c445ea15 4264 r->float_substr = r->float_utf8 = NULL;
c277df42 4265 SvREFCNT_dec(data.longest_float);
c5254dd6 4266 longest_float_length = 0;
a0d0e21e 4267 }
c277df42 4268
1de06328
YO
4269 /* Note that code very similar to this but for floating string
4270 is immediately above, changes may need to be made to both.
4271 Be careful.
4272 */
a0ed51b3 4273 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4274 if (longest_fixed_length
c277df42
IZ
4275 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4276 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1de06328
YO
4277 || (RExC_flags & PMf_MULTILINE))))
4278 {
1182767e 4279 I32 t,ml;
cf93c79d 4280
1de06328
YO
4281 /* copy the information about the longest fixed
4282 from the reg_scan_data over to the program. */
33b8afdf
JH
4283 if (SvUTF8(data.longest_fixed)) {
4284 r->anchored_utf8 = data.longest_fixed;
c445ea15 4285 r->anchored_substr = NULL;
33b8afdf
JH
4286 } else {
4287 r->anchored_substr = data.longest_fixed;
c445ea15 4288 r->anchored_utf8 = NULL;
33b8afdf 4289 }
1de06328
YO
4290 /* fixed_end_shift is how many chars that must be matched that
4291 follow this item. We calculate it ahead of time as once the
4292 lookbehind offset is added in we lose the ability to correctly
4293 calculate it.*/
4294 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4295 : (I32)longest_fixed_length;
1de06328
YO
4296 r->anchored_end_shift = ml - data.offset_fixed
4297 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4298 + data.lookbehind_fixed;
4299 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4300
cf93c79d
IZ
4301 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4302 && (!(data.flags & SF_FIX_BEFORE_MEOL)
e2509266 4303 || (RExC_flags & PMf_MULTILINE)));
33b8afdf 4304 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4305 }
4306 else {
c445ea15 4307 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4308 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4309 longest_fixed_length = 0;
a0d0e21e 4310 }
b81d288d 4311 if (r->regstclass
ffc61ed2 4312 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
653099ff 4313 r->regstclass = NULL;
33b8afdf
JH
4314 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4315 && stclass_flag
653099ff 4316 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4317 && !cl_is_anything(data.start_class))
4318 {
1df70142 4319 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4320
a02a5408 4321 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4322 struct regnode_charclass_class);
4323 StructCopy(data.start_class,
830247a4 4324 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4325 struct regnode_charclass_class);
830247a4 4326 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4327 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4328 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4329 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4330 PerlIO_printf(Perl_debug_log,
a0288114 4331 "synthetic stclass \"%s\".\n",
3f7c398e 4332 SvPVX_const(sv));});
653099ff 4333 }
c277df42
IZ
4334
4335 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4336 if (longest_fixed_length > longest_float_length) {
1de06328 4337 r->check_end_shift = r->anchored_end_shift;
c277df42 4338 r->check_substr = r->anchored_substr;
33b8afdf 4339 r->check_utf8 = r->anchored_utf8;
c277df42
IZ
4340 r->check_offset_min = r->check_offset_max = r->anchored_offset;
4341 if (r->reganch & ROPT_ANCH_SINGLE)
4342 r->reganch |= ROPT_NOSCAN;
a0ed51b3
LW
4343 }
4344 else {
1de06328 4345 r->check_end_shift = r->float_end_shift;
c277df42 4346 r->check_substr = r->float_substr;
33b8afdf 4347 r->check_utf8 = r->float_utf8;
1de06328
YO
4348 r->check_offset_min = r->float_min_offset;
4349 r->check_offset_max = r->float_max_offset;
a0d0e21e 4350 }
30382c73
IZ
4351 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4352 This should be changed ASAP! */
33b8afdf 4353 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
cad2e5aa 4354 r->reganch |= RE_USE_INTUIT;
33b8afdf 4355 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
cad2e5aa
JH
4356 r->reganch |= RE_INTUIT_TAIL;
4357 }
1de06328
YO
4358 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4359 if ( (STRLEN)minlen < longest_float_length )
4360 minlen= longest_float_length;
4361 if ( (STRLEN)minlen < longest_fixed_length )
4362 minlen= longest_fixed_length;
4363 */
a0ed51b3
LW
4364 }
4365 else {
c277df42
IZ
4366 /* Several toplevels. Best we can is to set minlen. */
4367 I32 fake;
653099ff 4368 struct regnode_charclass_class ch_class;
cb434fcc 4369 I32 last_close = 0;
c277df42 4370
a3621e74 4371 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "\n"));
07be1b83 4372
c277df42 4373 scan = r->program + 1;
830247a4 4374 cl_init(pRExC_state, &ch_class);
653099ff 4375 data.start_class = &ch_class;
cb434fcc 4376 data.last_closep = &last_close;
07be1b83 4377
1de06328 4378 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4379 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
07be1b83 4380
786e8c11 4381 CHECK_RESTUDY_GOTO;
07be1b83 4382
33b8afdf 4383 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4384 = r->float_substr = r->float_utf8 = NULL;
653099ff 4385 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4386 && !cl_is_anything(data.start_class))
4387 {
1df70142 4388 const I32 n = add_data(pRExC_state, 1, "f");
653099ff 4389
a02a5408 4390 Newx(RExC_rx->data->data[n], 1,
653099ff
GS
4391 struct regnode_charclass_class);
4392 StructCopy(data.start_class,
830247a4 4393 (struct regnode_charclass_class*)RExC_rx->data->data[n],
653099ff 4394 struct regnode_charclass_class);
830247a4 4395 r->regstclass = (regnode*)RExC_rx->data->data[n];
653099ff 4396 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
a3621e74 4397 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4398 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4399 PerlIO_printf(Perl_debug_log,
a0288114 4400 "synthetic stclass \"%s\".\n",
3f7c398e 4401 SvPVX_const(sv));});
653099ff 4402 }
a0d0e21e
LW
4403 }
4404
1de06328
YO
4405 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4406 the "real" pattern. */
4407 if (r->minlen < minlen)
4408 r->minlen = minlen;
4409
b81d288d 4410 if (RExC_seen & REG_SEEN_GPOS)
c277df42 4411 r->reganch |= ROPT_GPOS_SEEN;
830247a4 4412 if (RExC_seen & REG_SEEN_LOOKBEHIND)
c277df42 4413 r->reganch |= ROPT_LOOKBEHIND_SEEN;
830247a4 4414 if (RExC_seen & REG_SEEN_EVAL)
ce862d02 4415 r->reganch |= ROPT_EVAL_SEEN;
f33976b4
DB
4416 if (RExC_seen & REG_SEEN_CANY)
4417 r->reganch |= ROPT_CANY_SEEN;
81714fb9
YO
4418 if (RExC_paren_names)
4419 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4420 else
4421 r->paren_names = NULL;
4422
40d049e4
YO
4423 if (RExC_recurse_count) {
4424 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4425 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4426 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4427 }
4428 }
a02a5408
JC
4429 Newxz(r->startp, RExC_npar, I32);
4430 Newxz(r->endp, RExC_npar, I32);
f9f4320a 4431
f2278c82 4432 DEBUG_r( RX_DEBUG_on(r) );
be8e71aa
YO
4433 DEBUG_DUMP_r({
4434 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4435 regdump(r);
4436 });
8e9a8a48
YO
4437 DEBUG_OFFSETS_r(if (r->offsets) {
4438 const U32 len = r->offsets[0];
4439 U32 i;
4440 GET_RE_DEBUG_FLAGS_DECL;
4441 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4442 for (i = 1; i <= len; i++) {
4443 if (r->offsets[i*2-1] || r->offsets[i*2])
4444 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
786e8c11 4445 (UV)i, (UV)r->offsets[i*2-1], (UV)r->offsets[i*2]);
8e9a8a48
YO
4446 }
4447 PerlIO_printf(Perl_debug_log, "\n");
4448 });
a0d0e21e 4449 return(r);
f9f4320a 4450 END_BLOCK
a687059c
LW
4451}
4452
f9f4320a
YO
4453#undef CORE_ONLY_BLOCK
4454#undef END_BLOCK
4455#undef RE_ENGINE_PTR
3dab1dad 4456
9af228c6 4457#ifndef PERL_IN_XSUB_RE
81714fb9
YO
4458SV*
4459Perl_reg_named_buff_sv(pTHX_ SV* namesv)
4460{
4461 I32 parno = 0; /* no match */
4462 if (PL_curpm) {
4463 const REGEXP * const rx = PM_GETRE(PL_curpm);
4464 if (rx && rx->paren_names) {
4465 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4466 if (he_str) {
4467 IV i;
4468 SV* sv_dat=HeVAL(he_str);
4469 I32 *nums=(I32*)SvPVX(sv_dat);
4470 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4471 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4472 rx->endp[nums[i]] != -1)
4473 {
4474 parno = nums[i];
4475 break;
4476 }
4477 }
4478 }
4479 }
4480 }
4481 if ( !parno ) {
4482 return 0;
4483 } else {
4484 GV *gv_paren;
4485 SV *sv= sv_newmortal();
4486 Perl_sv_setpvf(aTHX_ sv, "%"IVdf,(IV)parno);
4487 gv_paren= Perl_gv_fetchsv(aTHX_ sv, GV_ADD, SVt_PVGV);
4488 return GvSVn(gv_paren);
4489 }
4490}
9af228c6 4491#endif
0a4db386 4492
894be9b7 4493/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4494 * If flags is REG_RSN_RETURN_NULL returns null.
4495 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4496 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4497 * to the parsed name as looked up in the RExC_paren_names hash.
4498 * If there is an error throws a vFAIL().. type exception.
894be9b7 4499 */
0a4db386
YO
4500
4501#define REG_RSN_RETURN_NULL 0
4502#define REG_RSN_RETURN_NAME 1
4503#define REG_RSN_RETURN_DATA 2
4504
894be9b7
YO
4505STATIC SV*
4506S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4507 char *name_start = RExC_parse;
0a4db386 4508 if ( UTF ) {
894be9b7 4509 STRLEN numlen;
0a4db386
YO
4510 while( isIDFIRST_uni(utf8n_to_uvchr((U8*)RExC_parse,
4511 RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT)))
4512 {
4513 RExC_parse += numlen;
4514 }
4515 } else {
4516 while( isIDFIRST(*RExC_parse) )
894be9b7
YO
4517 RExC_parse++;
4518 }
0a4db386
YO
4519 if ( flags ) {
4520 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4521 (int)(RExC_parse - name_start)));
894be9b7 4522 if (UTF)
0a4db386
YO
4523 SvUTF8_on(sv_name);
4524 if ( flags == REG_RSN_RETURN_NAME)
4525 return sv_name;
4526 else if (flags==REG_RSN_RETURN_DATA) {
4527 HE *he_str = NULL;
4528 SV *sv_dat = NULL;
4529 if ( ! sv_name ) /* should not happen*/
4530 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4531 if (RExC_paren_names)
4532 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4533 if ( he_str )
4534 sv_dat = HeVAL(he_str);
4535 if ( ! sv_dat )
4536 vFAIL("Reference to nonexistent named group");
4537 return sv_dat;
4538 }
4539 else {
4540 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4541 }
4542 /* NOT REACHED */
894be9b7 4543 }
0a4db386 4544 return NULL;
894be9b7
YO
4545}
4546
3dab1dad
YO
4547#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4548 int rem=(int)(RExC_end - RExC_parse); \
4549 int cut; \
4550 int num; \
4551 int iscut=0; \
4552 if (rem>10) { \
4553 rem=10; \
4554 iscut=1; \
4555 } \
4556 cut=10-rem; \
4557 if (RExC_lastparse!=RExC_parse) \
4558 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4559 rem, RExC_parse, \
4560 cut + 4, \
4561 iscut ? "..." : "<" \
4562 ); \
4563 else \
4564 PerlIO_printf(Perl_debug_log,"%16s",""); \
4565 \
4566 if (SIZE_ONLY) \
4567 num=RExC_size; \
4568 else \
4569 num=REG_NODE_NUM(RExC_emit); \
4570 if (RExC_lastnum!=num) \
0a4db386 4571 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4572 else \
0a4db386 4573 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4574 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4575 (int)((depth*2)), "", \
3dab1dad
YO
4576 (funcname) \
4577 ); \
4578 RExC_lastnum=num; \
4579 RExC_lastparse=RExC_parse; \
4580})
4581
07be1b83
YO
4582
4583
3dab1dad
YO
4584#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4585 DEBUG_PARSE_MSG((funcname)); \
4586 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4587})
6bda09f9
YO
4588#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4589 DEBUG_PARSE_MSG((funcname)); \
4590 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4591})
a687059c
LW
4592/*
4593 - reg - regular expression, i.e. main body or parenthesized thing
4594 *
4595 * Caller must absorb opening parenthesis.
4596 *
4597 * Combining parenthesis handling with the base level of regular expression
4598 * is a trifle forced, but the need to tie the tails of the branches to what
4599 * follows makes it hard to avoid.
4600 */
07be1b83
YO
4601#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4602#ifdef DEBUGGING
4603#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4604#else
4605#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4606#endif
3dab1dad 4607
76e3520e 4608STATIC regnode *
3dab1dad 4609S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4610 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4611{
27da23d5 4612 dVAR;
c277df42
IZ
4613 register regnode *ret; /* Will be the head of the group. */
4614 register regnode *br;
4615 register regnode *lastbr;
cbbf8932 4616 register regnode *ender = NULL;
a0d0e21e 4617 register I32 parno = 0;
cbbf8932
AL
4618 I32 flags;
4619 const I32 oregflags = RExC_flags;
6136c704
AL
4620 bool have_branch = 0;
4621 bool is_open = 0;
9d1d55b5
JP
4622
4623 /* for (?g), (?gc), and (?o) warnings; warning
4624 about (?c) will warn about (?g) -- japhy */
4625
6136c704
AL
4626#define WASTED_O 0x01
4627#define WASTED_G 0x02
4628#define WASTED_C 0x04
4629#define WASTED_GC (0x02|0x04)
cbbf8932 4630 I32 wastedflags = 0x00;
9d1d55b5 4631
fac92740 4632 char * parse_start = RExC_parse; /* MJD */
a28509cc 4633 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4634
3dab1dad
YO
4635 GET_RE_DEBUG_FLAGS_DECL;
4636 DEBUG_PARSE("reg ");
4637
4638
821b33a5 4639 *flagp = 0; /* Tentatively. */
a0d0e21e 4640
9d1d55b5 4641
a0d0e21e
LW
4642 /* Make an OPEN node, if parenthesized. */
4643 if (paren) {
fac92740 4644 if (*RExC_parse == '?') { /* (?...) */
2b36a5a0
JH
4645 U32 posflags = 0, negflags = 0;
4646 U32 *flagsp = &posflags;
6136c704 4647 bool is_logical = 0;
a28509cc 4648 const char * const seqstart = RExC_parse;
ca9dfc88 4649
830247a4
IZ
4650 RExC_parse++;
4651 paren = *RExC_parse++;
c277df42 4652 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 4653 switch (paren) {
894be9b7 4654
fac92740 4655 case '<': /* (?<...) */
b81d288d 4656 if (*RExC_parse == '!')
c277df42 4657 paren = ',';
0a4db386
YO
4658 else if (*RExC_parse != '=')
4659 { /* (?<...>) */
81714fb9 4660 char *name_start;
894be9b7 4661 SV *svname;
81714fb9
YO
4662 paren= '>';
4663 case '\'': /* (?'...') */
4664 name_start= RExC_parse;
0a4db386
YO
4665 svname = reg_scan_name(pRExC_state,
4666 SIZE_ONLY ? /* reverse test from the others */
4667 REG_RSN_RETURN_NAME :
4668 REG_RSN_RETURN_NULL);
81714fb9
YO
4669 if (RExC_parse == name_start)
4670 goto unknown;
4671 if (*RExC_parse != paren)
4672 vFAIL2("Sequence (?%c... not terminated",
4673 paren=='>' ? '<' : paren);
4674 if (SIZE_ONLY) {
e62cc96a
YO
4675 HE *he_str;
4676 SV *sv_dat = NULL;
894be9b7
YO
4677 if (!svname) /* shouldnt happen */
4678 Perl_croak(aTHX_
4679 "panic: reg_scan_name returned NULL");
81714fb9
YO
4680 if (!RExC_paren_names) {
4681 RExC_paren_names= newHV();
4682 sv_2mortal((SV*)RExC_paren_names);
4683 }
4684 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 4685 if ( he_str )
81714fb9 4686 sv_dat = HeVAL(he_str);
e62cc96a 4687 if ( ! sv_dat ) {
81714fb9 4688 /* croak baby croak */
e62cc96a
YO
4689 Perl_croak(aTHX_
4690 "panic: paren_name hash element allocation failed");
4691 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
4692 IV count=SvIV(sv_dat);
4693 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
4694 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
4695 pv[count]=RExC_npar;
4696 SvIVX(sv_dat)++;
4697 } else {
4698 (void)SvUPGRADE(sv_dat,SVt_PVNV);
4699 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
4700 SvIOK_on(sv_dat);
4701 SvIVX(sv_dat)= 1;
e62cc96a
YO
4702 }
4703
81714fb9
YO
4704 /*sv_dump(sv_dat);*/
4705 }
4706 nextchar(pRExC_state);
4707 paren = 1;
4708 goto capturing_parens;
4709 }
4710 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 4711 RExC_parse++;
fac92740
MJD
4712 case '=': /* (?=...) */
4713 case '!': /* (?!...) */
7f69552c
YO
4714 if (*RExC_parse == ')')
4715 goto do_op_fail;
830247a4 4716 RExC_seen_zerolen++;
fac92740
MJD
4717 case ':': /* (?:...) */
4718 case '>': /* (?>...) */
a0d0e21e 4719 break;
54612592
YO
4720 case 'C':
4721 if (RExC_parse[0] == 'O' &&
4722 RExC_parse[1] == 'M' &&
4723 RExC_parse[2] == 'M' &&
4724 RExC_parse[3] == 'I' &&
4725 RExC_parse[4] == 'T' &&
4726 RExC_parse[5] == ')')
4727 {
4728 RExC_parse+=5;
4729 ret = reg_node(pRExC_state, COMMIT);
4730 } else {
4731 vFAIL("Sequence (?C... not terminated");
4732 }
4733 nextchar(pRExC_state);
4734 return ret;
4735 break;
7f69552c
YO
4736 case 'F':
4737 if (RExC_parse[0] == 'A' &&
4738 RExC_parse[1] == 'I' &&
4739 RExC_parse[2] == 'L')
4740 RExC_parse+=3;
4741 if (*RExC_parse != ')')
4742 vFAIL("Sequence (?FAIL) or (?F) not terminated");
4743 do_op_fail:
4744 ret = reg_node(pRExC_state, OPFAIL);
4745 nextchar(pRExC_state);
4746 return ret;
4747 break;
fac92740
MJD
4748 case '$': /* (?$...) */
4749 case '@': /* (?@...) */
8615cb43 4750 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 4751 break;
fac92740 4752 case '#': /* (?#...) */
830247a4
IZ
4753 while (*RExC_parse && *RExC_parse != ')')
4754 RExC_parse++;
4755 if (*RExC_parse != ')')
c277df42 4756 FAIL("Sequence (?#... not terminated");
830247a4 4757 nextchar(pRExC_state);
a0d0e21e
LW
4758 *flagp = TRYAGAIN;
4759 return NULL;
894be9b7
YO
4760 case '0' : /* (?0) */
4761 case 'R' : /* (?R) */
4762 if (*RExC_parse != ')')
6bda09f9 4763 FAIL("Sequence (?R) not terminated");
1a147d38 4764 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
4765 nextchar(pRExC_state);
4766 return ret;
4767 /*notreached*/
894be9b7
YO
4768 { /* named and numeric backreferences */
4769 I32 num;
4770 char * parse_start;
4771 case '&': /* (?&NAME) */
4772 parse_start = RExC_parse - 1;
4773 {
0a4db386
YO
4774 SV *sv_dat = reg_scan_name(pRExC_state,
4775 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4776 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
4777 }
4778 goto gen_recurse_regop;
4779 /* NOT REACHED */
6bda09f9
YO
4780 case '1': case '2': case '3': case '4': /* (?1) */
4781 case '5': case '6': case '7': case '8': case '9':
4782 RExC_parse--;
894be9b7
YO
4783 num = atoi(RExC_parse);
4784 parse_start = RExC_parse - 1; /* MJD */
6bda09f9
YO
4785 while (isDIGIT(*RExC_parse))
4786 RExC_parse++;
4787 if (*RExC_parse!=')')
4788 vFAIL("Expecting close bracket");
894be9b7
YO
4789
4790 gen_recurse_regop:
1a147d38 4791 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
4792 if (!SIZE_ONLY) {
4793 if (num > (I32)RExC_rx->nparens) {
4794 RExC_parse++;
4795 vFAIL("Reference to nonexistent group");
4796 }
40d049e4 4797 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 4798 RExC_emit++;
226de585 4799 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 4800 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 4801 } else {
6bda09f9 4802 RExC_size++;
6bda09f9 4803 }
0a4db386 4804 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 4805 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
4806 Set_Node_Offset(ret, parse_start); /* MJD */
4807
6bda09f9
YO
4808 nextchar(pRExC_state);
4809 return ret;
894be9b7
YO
4810 } /* named and numeric backreferences */
4811 /* NOT REACHED */
4812
fac92740 4813 case 'p': /* (?p...) */
9014280d 4814 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 4815 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 4816 /* FALL THROUGH*/
fac92740 4817 case '?': /* (??...) */
6136c704 4818 is_logical = 1;
438a3801
YST
4819 if (*RExC_parse != '{')
4820 goto unknown;
830247a4 4821 paren = *RExC_parse++;
0f5d15d6 4822 /* FALL THROUGH */
fac92740 4823 case '{': /* (?{...}) */
c277df42 4824 {
c277df42
IZ
4825 I32 count = 1, n = 0;
4826 char c;
830247a4 4827 char *s = RExC_parse;
c277df42 4828
830247a4
IZ
4829 RExC_seen_zerolen++;
4830 RExC_seen |= REG_SEEN_EVAL;
4831 while (count && (c = *RExC_parse)) {
6136c704
AL
4832 if (c == '\\') {
4833 if (RExC_parse[1])
4834 RExC_parse++;
4835 }
b81d288d 4836 else if (c == '{')
c277df42 4837 count++;
b81d288d 4838 else if (c == '}')
c277df42 4839 count--;
830247a4 4840 RExC_parse++;
c277df42 4841 }
6136c704 4842 if (*RExC_parse != ')') {
b81d288d 4843 RExC_parse = s;
b45f050a
JF
4844 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
4845 }
c277df42 4846 if (!SIZE_ONLY) {
f3548bdc 4847 PAD *pad;
6136c704
AL
4848 OP_4tree *sop, *rop;
4849 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 4850
569233ed
SB
4851 ENTER;
4852 Perl_save_re_context(aTHX);
f3548bdc 4853 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
4854 sop->op_private |= OPpREFCOUNTED;
4855 /* re_dup will OpREFCNT_inc */
4856 OpREFCNT_set(sop, 1);
569233ed 4857 LEAVE;
c277df42 4858
830247a4
IZ
4859 n = add_data(pRExC_state, 3, "nop");
4860 RExC_rx->data->data[n] = (void*)rop;
4861 RExC_rx->data->data[n+1] = (void*)sop;
f3548bdc 4862 RExC_rx->data->data[n+2] = (void*)pad;
c277df42 4863 SvREFCNT_dec(sv);
a0ed51b3 4864 }
e24b16f9 4865 else { /* First pass */
830247a4 4866 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 4867 && IN_PERL_RUNTIME)
2cd61cdb
IZ
4868 /* No compiled RE interpolated, has runtime
4869 components ===> unsafe. */
4870 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 4871 if (PL_tainting && PL_tainted)
cc6b7395 4872 FAIL("Eval-group in insecure regular expression");
54df2634 4873#if PERL_VERSION > 8
923e4eb5 4874 if (IN_PERL_COMPILETIME)
b5c19bd7 4875 PL_cv_has_eval = 1;
54df2634 4876#endif
c277df42 4877 }
b5c19bd7 4878
830247a4 4879 nextchar(pRExC_state);
6136c704 4880 if (is_logical) {
830247a4 4881 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4882 if (!SIZE_ONLY)
4883 ret->flags = 2;
3dab1dad 4884 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 4885 /* deal with the length of this later - MJD */
0f5d15d6
IZ
4886 return ret;
4887 }
ccb2c380
MP
4888 ret = reganode(pRExC_state, EVAL, n);
4889 Set_Node_Length(ret, RExC_parse - parse_start + 1);
4890 Set_Node_Offset(ret, parse_start);
4891 return ret;
c277df42 4892 }
fac92740 4893 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 4894 {
0a4db386 4895 int is_define= 0;
fac92740 4896 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
4897 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
4898 || RExC_parse[1] == '<'
830247a4 4899 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
4900 I32 flag;
4901
830247a4 4902 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
4903 if (!SIZE_ONLY)
4904 ret->flags = 1;
3dab1dad 4905 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 4906 goto insert_if;
b81d288d 4907 }
a0ed51b3 4908 }
0a4db386
YO
4909 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
4910 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
4911 {
4912 char ch = RExC_parse[0] == '<' ? '>' : '\'';
4913 char *name_start= RExC_parse++;
4914 I32 num = 0;
4915 SV *sv_dat=reg_scan_name(pRExC_state,
4916 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4917 if (RExC_parse == name_start || *RExC_parse != ch)
4918 vFAIL2("Sequence (?(%c... not terminated",
4919 (ch == '>' ? '<' : ch));
4920 RExC_parse++;
4921 if (!SIZE_ONLY) {
4922 num = add_data( pRExC_state, 1, "S" );
4923 RExC_rx->data->data[num]=(void*)sv_dat;
4924 SvREFCNT_inc(sv_dat);
4925 }
4926 ret = reganode(pRExC_state,NGROUPP,num);
4927 goto insert_if_check_paren;
4928 }
4929 else if (RExC_parse[0] == 'D' &&
4930 RExC_parse[1] == 'E' &&
4931 RExC_parse[2] == 'F' &&
4932 RExC_parse[3] == 'I' &&
4933 RExC_parse[4] == 'N' &&
4934 RExC_parse[5] == 'E')
4935 {
4936 ret = reganode(pRExC_state,DEFINEP,0);
4937 RExC_parse +=6 ;
4938 is_define = 1;
4939 goto insert_if_check_paren;
4940 }
4941 else if (RExC_parse[0] == 'R') {
4942 RExC_parse++;
4943 parno = 0;
4944 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
4945 parno = atoi(RExC_parse++);
4946 while (isDIGIT(*RExC_parse))
4947 RExC_parse++;
4948 } else if (RExC_parse[0] == '&') {
4949 SV *sv_dat;
4950 RExC_parse++;
4951 sv_dat = reg_scan_name(pRExC_state,
4952 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
4953 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
4954 }
1a147d38 4955 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
4956 goto insert_if_check_paren;
4957 }
830247a4 4958 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 4959 /* (?(1)...) */
6136c704 4960 char c;
830247a4 4961 parno = atoi(RExC_parse++);
c277df42 4962
830247a4
IZ
4963 while (isDIGIT(*RExC_parse))
4964 RExC_parse++;
fac92740 4965 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 4966
0a4db386 4967 insert_if_check_paren:
830247a4 4968 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 4969 vFAIL("Switch condition not recognized");
c277df42 4970 insert_if:
3dab1dad
YO
4971 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
4972 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 4973 if (br == NULL)
830247a4 4974 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 4975 else
3dab1dad 4976 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 4977 c = *nextchar(pRExC_state);
d1b80229
IZ
4978 if (flags&HASWIDTH)
4979 *flagp |= HASWIDTH;
c277df42 4980 if (c == '|') {
0a4db386
YO
4981 if (is_define)
4982 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 4983 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
4984 regbranch(pRExC_state, &flags, 1,depth+1);
4985 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
4986 if (flags&HASWIDTH)
4987 *flagp |= HASWIDTH;
830247a4 4988 c = *nextchar(pRExC_state);
a0ed51b3
LW
4989 }
4990 else
c277df42
IZ
4991 lastbr = NULL;
4992 if (c != ')')
8615cb43 4993 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 4994 ender = reg_node(pRExC_state, TAIL);
3dab1dad 4995 REGTAIL(pRExC_state, br, ender);
c277df42 4996 if (lastbr) {
3dab1dad
YO
4997 REGTAIL(pRExC_state, lastbr, ender);
4998 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
4999 }
5000 else
3dab1dad 5001 REGTAIL(pRExC_state, ret, ender);
c277df42 5002 return ret;
a0ed51b3
LW
5003 }
5004 else {
830247a4 5005 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5006 }
5007 }
1b1626e4 5008 case 0:
830247a4 5009 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5010 vFAIL("Sequence (? incomplete");
1b1626e4 5011 break;
a0d0e21e 5012 default:
830247a4 5013 --RExC_parse;
fac92740 5014 parse_flags: /* (?i) */
830247a4 5015 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
9d1d55b5
JP
5016 /* (?g), (?gc) and (?o) are useless here
5017 and must be globally applied -- japhy */
5018
5019 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
5020 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5021 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5022 if (! (wastedflags & wflagbit) ) {
5023 wastedflags |= wflagbit;
5024 vWARN5(
5025 RExC_parse + 1,
5026 "Useless (%s%c) - %suse /%c modifier",
5027 flagsp == &negflags ? "?-" : "?",
5028 *RExC_parse,
5029 flagsp == &negflags ? "don't " : "",
5030 *RExC_parse
5031 );
5032 }
5033 }
5034 }
5035 else if (*RExC_parse == 'c') {
5036 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5037 if (! (wastedflags & WASTED_C) ) {
5038 wastedflags |= WASTED_GC;
9d1d55b5
JP
5039 vWARN3(
5040 RExC_parse + 1,
5041 "Useless (%sc) - %suse /gc modifier",
5042 flagsp == &negflags ? "?-" : "?",
5043 flagsp == &negflags ? "don't " : ""
5044 );
5045 }
5046 }
5047 }
5048 else { pmflag(flagsp, *RExC_parse); }
5049
830247a4 5050 ++RExC_parse;
ca9dfc88 5051 }
830247a4 5052 if (*RExC_parse == '-') {
ca9dfc88 5053 flagsp = &negflags;
9d1d55b5 5054 wastedflags = 0; /* reset so (?g-c) warns twice */
830247a4 5055 ++RExC_parse;
ca9dfc88 5056 goto parse_flags;
48c036b1 5057 }
e2509266
JH
5058 RExC_flags |= posflags;
5059 RExC_flags &= ~negflags;
830247a4
IZ
5060 if (*RExC_parse == ':') {
5061 RExC_parse++;
ca9dfc88
IZ
5062 paren = ':';
5063 break;
5064 }
c277df42 5065 unknown:
830247a4
IZ
5066 if (*RExC_parse != ')') {
5067 RExC_parse++;
5068 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
b45f050a 5069 }
830247a4 5070 nextchar(pRExC_state);
a0d0e21e
LW
5071 *flagp = TRYAGAIN;
5072 return NULL;
5073 }
5074 }
fac92740 5075 else { /* (...) */
81714fb9 5076 capturing_parens:
830247a4
IZ
5077 parno = RExC_npar;
5078 RExC_npar++;
5079 ret = reganode(pRExC_state, OPEN, parno);
6bda09f9 5080 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
226de585 5081 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5082 "Setting open paren #%"IVdf" to %d\n",
5083 (IV)parno, REG_NODE_NUM(ret)));
5084 RExC_open_parens[parno-1]= ret;
6bda09f9 5085 }
fac92740
MJD
5086 Set_Node_Length(ret, 1); /* MJD */
5087 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5088 is_open = 1;
a0d0e21e 5089 }
a0ed51b3 5090 }
fac92740 5091 else /* ! paren */
a0d0e21e
LW
5092 ret = NULL;
5093
5094 /* Pick up the branches, linking them together. */
fac92740 5095 parse_start = RExC_parse; /* MJD */
3dab1dad 5096 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5097 /* branch_len = (paren != 0); */
2af232bd 5098
a0d0e21e
LW
5099 if (br == NULL)
5100 return(NULL);
830247a4
IZ
5101 if (*RExC_parse == '|') {
5102 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5103 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5104 }
fac92740 5105 else { /* MJD */
6bda09f9 5106 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5107 Set_Node_Length(br, paren != 0);
5108 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5109 }
c277df42
IZ
5110 have_branch = 1;
5111 if (SIZE_ONLY)
830247a4 5112 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5113 }
5114 else if (paren == ':') {
c277df42
IZ
5115 *flagp |= flags&SIMPLE;
5116 }
6136c704 5117 if (is_open) { /* Starts with OPEN. */
3dab1dad 5118 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5119 }
5120 else if (paren != '?') /* Not Conditional */
a0d0e21e 5121 ret = br;
32a0ca98 5122 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5123 lastbr = br;
830247a4
IZ
5124 while (*RExC_parse == '|') {
5125 if (!SIZE_ONLY && RExC_extralen) {
5126 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5127 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5128 }
5129 if (SIZE_ONLY)
830247a4
IZ
5130 RExC_extralen += 2; /* Account for LONGJMP. */
5131 nextchar(pRExC_state);
3dab1dad 5132 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5133
a687059c 5134 if (br == NULL)
a0d0e21e 5135 return(NULL);
3dab1dad 5136 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5137 lastbr = br;
821b33a5
IZ
5138 if (flags&HASWIDTH)
5139 *flagp |= HASWIDTH;
a687059c 5140 *flagp |= flags&SPSTART;
a0d0e21e
LW
5141 }
5142
c277df42
IZ
5143 if (have_branch || paren != ':') {
5144 /* Make a closing node, and hook it on the end. */
5145 switch (paren) {
5146 case ':':
830247a4 5147 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5148 break;
5149 case 1:
830247a4 5150 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5151 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5152 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5153 "Setting close paren #%"IVdf" to %d\n",
5154 (IV)parno, REG_NODE_NUM(ender)));
5155 RExC_close_parens[parno-1]= ender;
5156 }
fac92740
MJD
5157 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5158 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5159 break;
5160 case '<':
c277df42
IZ
5161 case ',':
5162 case '=':
5163 case '!':
c277df42 5164 *flagp &= ~HASWIDTH;
821b33a5
IZ
5165 /* FALL THROUGH */
5166 case '>':
830247a4 5167 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5168 break;
5169 case 0:
830247a4 5170 ender = reg_node(pRExC_state, END);
40d049e4
YO
5171 if (!SIZE_ONLY) {
5172 assert(!RExC_opend); /* there can only be one! */
5173 RExC_opend = ender;
5174 }
c277df42
IZ
5175 break;
5176 }
eaf3ca90 5177 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5178
9674d46a 5179 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5180 if (depth==1)
5181 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5182
c277df42 5183 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5184 for (br = ret; br; br = regnext(br)) {
5185 const U8 op = PL_regkind[OP(br)];
5186 if (op == BRANCH) {
07be1b83 5187 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5188 }
5189 else if (op == BRANCHJ) {
07be1b83 5190 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5191 }
c277df42
IZ
5192 }
5193 }
a0d0e21e 5194 }
c277df42
IZ
5195
5196 {
e1ec3a88
AL
5197 const char *p;
5198 static const char parens[] = "=!<,>";
c277df42
IZ
5199
5200 if (paren && (p = strchr(parens, paren))) {
eb160463 5201 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5202 int flag = (p - parens) > 1;
5203
5204 if (paren == '>')
5205 node = SUSPEND, flag = 0;
6bda09f9 5206 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5207 Set_Node_Cur_Length(ret);
5208 Set_Node_Offset(ret, parse_start + 1);
c277df42 5209 ret->flags = flag;
07be1b83 5210 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5211 }
a0d0e21e
LW
5212 }
5213
5214 /* Check for proper termination. */
ce3e6498 5215 if (paren) {
e2509266 5216 RExC_flags = oregflags;
830247a4
IZ
5217 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5218 RExC_parse = oregcomp_parse;
380a0633 5219 vFAIL("Unmatched (");
ce3e6498 5220 }
a0ed51b3 5221 }
830247a4
IZ
5222 else if (!paren && RExC_parse < RExC_end) {
5223 if (*RExC_parse == ')') {
5224 RExC_parse++;
380a0633 5225 vFAIL("Unmatched )");
a0ed51b3
LW
5226 }
5227 else
b45f050a 5228 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5229 /* NOTREACHED */
5230 }
a687059c 5231
a0d0e21e 5232 return(ret);
a687059c
LW
5233}
5234
5235/*
5236 - regbranch - one alternative of an | operator
5237 *
5238 * Implements the concatenation operator.
5239 */
76e3520e 5240STATIC regnode *
3dab1dad 5241S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5242{
97aff369 5243 dVAR;
c277df42
IZ
5244 register regnode *ret;
5245 register regnode *chain = NULL;
5246 register regnode *latest;
5247 I32 flags = 0, c = 0;
3dab1dad
YO
5248 GET_RE_DEBUG_FLAGS_DECL;
5249 DEBUG_PARSE("brnc");
b81d288d 5250 if (first)
c277df42
IZ
5251 ret = NULL;
5252 else {
b81d288d 5253 if (!SIZE_ONLY && RExC_extralen)
830247a4 5254 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5255 else {
830247a4 5256 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5257 Set_Node_Length(ret, 1);
5258 }
c277df42
IZ
5259 }
5260
b81d288d 5261 if (!first && SIZE_ONLY)
830247a4 5262 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5263
c277df42 5264 *flagp = WORST; /* Tentatively. */
a0d0e21e 5265
830247a4
IZ
5266 RExC_parse--;
5267 nextchar(pRExC_state);
5268 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5269 flags &= ~TRYAGAIN;
3dab1dad 5270 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5271 if (latest == NULL) {
5272 if (flags & TRYAGAIN)
5273 continue;
5274 return(NULL);
a0ed51b3
LW
5275 }
5276 else if (ret == NULL)
c277df42 5277 ret = latest;
a0d0e21e 5278 *flagp |= flags&HASWIDTH;
c277df42 5279 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5280 *flagp |= flags&SPSTART;
5281 else {
830247a4 5282 RExC_naughty++;
3dab1dad 5283 REGTAIL(pRExC_state, chain, latest);
a687059c 5284 }
a0d0e21e 5285 chain = latest;
c277df42
IZ
5286 c++;
5287 }
5288 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5289 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5290 if (ret == NULL)
5291 ret = chain;
5292 }
5293 if (c == 1) {
5294 *flagp |= flags&SIMPLE;
a0d0e21e 5295 }
a687059c 5296
d4c19fe8 5297 return ret;
a687059c
LW
5298}
5299
5300/*
5301 - regpiece - something followed by possible [*+?]
5302 *
5303 * Note that the branching code sequences used for ? and the general cases
5304 * of * and + are somewhat optimized: they use the same NOTHING node as
5305 * both the endmarker for their branch list and the body of the last branch.
5306 * It might seem that this node could be dispensed with entirely, but the
5307 * endmarker role is not redundant.
5308 */
76e3520e 5309STATIC regnode *
3dab1dad 5310S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5311{
97aff369 5312 dVAR;
c277df42 5313 register regnode *ret;
a0d0e21e
LW
5314 register char op;
5315 register char *next;
5316 I32 flags;
1df70142 5317 const char * const origparse = RExC_parse;
a0d0e21e 5318 I32 min;
c277df42 5319 I32 max = REG_INFTY;
fac92740 5320 char *parse_start;
10edeb5d 5321 const char *maxpos = NULL;
3dab1dad
YO
5322 GET_RE_DEBUG_FLAGS_DECL;
5323 DEBUG_PARSE("piec");
a0d0e21e 5324
3dab1dad 5325 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5326 if (ret == NULL) {
5327 if (flags & TRYAGAIN)
5328 *flagp |= TRYAGAIN;
5329 return(NULL);
5330 }
5331
830247a4 5332 op = *RExC_parse;
a0d0e21e 5333
830247a4 5334 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5335 maxpos = NULL;
fac92740 5336 parse_start = RExC_parse; /* MJD */
830247a4 5337 next = RExC_parse + 1;
a0d0e21e
LW
5338 while (isDIGIT(*next) || *next == ',') {
5339 if (*next == ',') {
5340 if (maxpos)
5341 break;
5342 else
5343 maxpos = next;
a687059c 5344 }
a0d0e21e
LW
5345 next++;
5346 }
5347 if (*next == '}') { /* got one */
5348 if (!maxpos)
5349 maxpos = next;
830247a4
IZ
5350 RExC_parse++;
5351 min = atoi(RExC_parse);
a0d0e21e
LW
5352 if (*maxpos == ',')
5353 maxpos++;
5354 else
830247a4 5355 maxpos = RExC_parse;
a0d0e21e
LW
5356 max = atoi(maxpos);
5357 if (!max && *maxpos != '0')
c277df42
IZ
5358 max = REG_INFTY; /* meaning "infinity" */
5359 else if (max >= REG_INFTY)
8615cb43 5360 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5361 RExC_parse = next;
5362 nextchar(pRExC_state);
a0d0e21e
LW
5363
5364 do_curly:
5365 if ((flags&SIMPLE)) {
830247a4 5366 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5367 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5368 Set_Node_Offset(ret, parse_start+1); /* MJD */
5369 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5370 }
5371 else {
3dab1dad 5372 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5373
5374 w->flags = 0;
3dab1dad 5375 REGTAIL(pRExC_state, ret, w);
830247a4 5376 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5377 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5378 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5379 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5380 }
6bda09f9 5381 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5382 /* MJD hk */
5383 Set_Node_Offset(ret, parse_start+1);
2af232bd 5384 Set_Node_Length(ret,
fac92740 5385 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5386
830247a4 5387 if (!SIZE_ONLY && RExC_extralen)
c277df42 5388 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5389 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5390 if (SIZE_ONLY)
830247a4
IZ
5391 RExC_whilem_seen++, RExC_extralen += 3;
5392 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5393 }
c277df42 5394 ret->flags = 0;
a0d0e21e
LW
5395
5396 if (min > 0)
821b33a5
IZ
5397 *flagp = WORST;
5398 if (max > 0)
5399 *flagp |= HASWIDTH;
a0d0e21e 5400 if (max && max < min)
8615cb43 5401 vFAIL("Can't do {n,m} with n > m");
c277df42 5402 if (!SIZE_ONLY) {
eb160463
GS
5403 ARG1_SET(ret, (U16)min);
5404 ARG2_SET(ret, (U16)max);
a687059c 5405 }
a687059c 5406
a0d0e21e 5407 goto nest_check;
a687059c 5408 }
a0d0e21e 5409 }
a687059c 5410
a0d0e21e
LW
5411 if (!ISMULT1(op)) {
5412 *flagp = flags;
a687059c 5413 return(ret);
a0d0e21e 5414 }
bb20fd44 5415
c277df42 5416#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5417
5418 /* if this is reinstated, don't forget to put this back into perldiag:
5419
5420 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5421
5422 (F) The part of the regexp subject to either the * or + quantifier
5423 could match an empty string. The {#} shows in the regular
5424 expression about where the problem was discovered.
5425
5426 */
5427
bb20fd44 5428 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5429 vFAIL("Regexp *+ operand could be empty");
b81d288d 5430#endif
bb20fd44 5431
fac92740 5432 parse_start = RExC_parse;
830247a4 5433 nextchar(pRExC_state);
a0d0e21e 5434
821b33a5 5435 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5436
5437 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5438 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5439 ret->flags = 0;
830247a4 5440 RExC_naughty += 4;
a0d0e21e
LW
5441 }
5442 else if (op == '*') {
5443 min = 0;
5444 goto do_curly;
a0ed51b3
LW
5445 }
5446 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5447 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5448 ret->flags = 0;
830247a4 5449 RExC_naughty += 3;
a0d0e21e
LW
5450 }
5451 else if (op == '+') {
5452 min = 1;
5453 goto do_curly;
a0ed51b3
LW
5454 }
5455 else if (op == '?') {
a0d0e21e
LW
5456 min = 0; max = 1;
5457 goto do_curly;
5458 }
5459 nest_check:
041457d9 5460 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5461 vWARN3(RExC_parse,
b45f050a 5462 "%.*s matches null string many times",
afd78fd5 5463 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5464 origparse);
a0d0e21e
LW
5465 }
5466
b9b4dddf 5467 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5468 nextchar(pRExC_state);
6bda09f9 5469 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5470 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5471 }
b9b4dddf
YO
5472#ifndef REG_ALLOW_MINMOD_SUSPEND
5473 else
5474#endif
5475 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5476 regnode *ender;
5477 nextchar(pRExC_state);
5478 ender = reg_node(pRExC_state, SUCCEED);
5479 REGTAIL(pRExC_state, ret, ender);
5480 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5481 ret->flags = 0;
5482 ender = reg_node(pRExC_state, TAIL);
5483 REGTAIL(pRExC_state, ret, ender);
5484 /*ret= ender;*/
5485 }
5486
5487 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5488 RExC_parse++;
b45f050a
JF
5489 vFAIL("Nested quantifiers");
5490 }
a0d0e21e
LW
5491
5492 return(ret);
a687059c
LW
5493}
5494
fc8cd66c
YO
5495
5496/* reg_namedseq(pRExC_state,UVp)
5497
5498 This is expected to be called by a parser routine that has
5499 recognized'\N' and needs to handle the rest. RExC_parse is
5500 expected to point at the first char following the N at the time
5501 of the call.
5502
5503 If valuep is non-null then it is assumed that we are parsing inside
5504 of a charclass definition and the first codepoint in the resolved
5505 string is returned via *valuep and the routine will return NULL.
5506 In this mode if a multichar string is returned from the charnames
5507 handler a warning will be issued, and only the first char in the
5508 sequence will be examined. If the string returned is zero length
5509 then the value of *valuep is undefined and NON-NULL will
5510 be returned to indicate failure. (This will NOT be a valid pointer
5511 to a regnode.)
5512
5513 If value is null then it is assumed that we are parsing normal text
5514 and inserts a new EXACT node into the program containing the resolved
5515 string and returns a pointer to the new node. If the string is
5516 zerolength a NOTHING node is emitted.
5517
5518 On success RExC_parse is set to the char following the endbrace.
5519 Parsing failures will generate a fatal errorvia vFAIL(...)
5520
5521 NOTE: We cache all results from the charnames handler locally in
5522 the RExC_charnames hash (created on first use) to prevent a charnames
5523 handler from playing silly-buggers and returning a short string and
5524 then a long string for a given pattern. Since the regexp program
5525 size is calculated during an initial parse this would result
5526 in a buffer overrun so we cache to prevent the charname result from
5527 changing during the course of the parse.
5528
5529 */
5530STATIC regnode *
5531S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
5532{
5533 char * name; /* start of the content of the name */
5534 char * endbrace; /* endbrace following the name */
5535 SV *sv_str = NULL;
5536 SV *sv_name = NULL;
5537 STRLEN len; /* this has various purposes throughout the code */
5538 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
5539 regnode *ret = NULL;
5540
5541 if (*RExC_parse != '{') {
5542 vFAIL("Missing braces on \\N{}");
5543 }
5544 name = RExC_parse+1;
5545 endbrace = strchr(RExC_parse, '}');
5546 if ( ! endbrace ) {
5547 RExC_parse++;
5548 vFAIL("Missing right brace on \\N{}");
5549 }
5550 RExC_parse = endbrace + 1;
5551
5552
5553 /* RExC_parse points at the beginning brace,
5554 endbrace points at the last */
5555 if ( name[0]=='U' && name[1]=='+' ) {
5556 /* its a "unicode hex" notation {U+89AB} */
5557 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
5558 | PERL_SCAN_DISALLOW_PREFIX
5559 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
5560 UV cp;
196f1508 5561 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 5562 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 5563 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
5564 cp = 0xFFFD;
5565 }
5566 if (cp > 0xff)
5567 RExC_utf8 = 1;
5568 if ( valuep ) {
5569 *valuep = cp;
5570 return NULL;
5571 }
5572 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
5573 } else {
5574 /* fetch the charnames handler for this scope */
5575 HV * const table = GvHV(PL_hintgv);
5576 SV **cvp= table ?
5577 hv_fetchs(table, "charnames", FALSE) :
5578 NULL;
5579 SV *cv= cvp ? *cvp : NULL;
5580 HE *he_str;
5581 int count;
5582 /* create an SV with the name as argument */
5583 sv_name = newSVpvn(name, endbrace - name);
5584
5585 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
5586 vFAIL2("Constant(\\N{%s}) unknown: "
5587 "(possibly a missing \"use charnames ...\")",
5588 SvPVX(sv_name));
5589 }
5590 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
5591 vFAIL2("Constant(\\N{%s}): "
5592 "$^H{charnames} is not defined",SvPVX(sv_name));
5593 }
5594
5595
5596
5597 if (!RExC_charnames) {
5598 /* make sure our cache is allocated */
5599 RExC_charnames = newHV();
6bda09f9 5600 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
5601 }
5602 /* see if we have looked this one up before */
5603 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
5604 if ( he_str ) {
5605 sv_str = HeVAL(he_str);
5606 cached = 1;
5607 } else {
5608 dSP ;
5609
5610 ENTER ;
5611 SAVETMPS ;
5612 PUSHMARK(SP) ;
5613
5614 XPUSHs(sv_name);
5615
5616 PUTBACK ;
5617
5618 count= call_sv(cv, G_SCALAR);
5619
5620 if (count == 1) { /* XXXX is this right? dmq */
5621 sv_str = POPs;
5622 SvREFCNT_inc_simple_void(sv_str);
5623 }
5624
5625 SPAGAIN ;
5626 PUTBACK ;
5627 FREETMPS ;
5628 LEAVE ;
5629
5630 if ( !sv_str || !SvOK(sv_str) ) {
5631 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
5632 "did not return a defined value",SvPVX(sv_name));
5633 }
5634 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
5635 cached = 1;
5636 }
5637 }
5638 if (valuep) {
5639 char *p = SvPV(sv_str, len);
5640 if (len) {
5641 STRLEN numlen = 1;
5642 if ( SvUTF8(sv_str) ) {
196f1508 5643 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
5644 if (*valuep > 0x7F)
5645 RExC_utf8 = 1;
5646 /* XXXX
5647 We have to turn on utf8 for high bit chars otherwise
5648 we get failures with
5649
5650 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5651 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
5652
5653 This is different from what \x{} would do with the same
5654 codepoint, where the condition is > 0xFF.
5655 - dmq
5656 */
5657
5658
5659 } else {
5660 *valuep = (UV)*p;
5661 /* warn if we havent used the whole string? */
5662 }
5663 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5664 vWARN2(RExC_parse,
5665 "Ignoring excess chars from \\N{%s} in character class",
5666 SvPVX(sv_name)
5667 );
5668 }
5669 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
5670 vWARN2(RExC_parse,
5671 "Ignoring zero length \\N{%s} in character class",
5672 SvPVX(sv_name)
5673 );
5674 }
5675 if (sv_name)
5676 SvREFCNT_dec(sv_name);
5677 if (!cached)
5678 SvREFCNT_dec(sv_str);
5679 return len ? NULL : (regnode *)&len;
5680 } else if(SvCUR(sv_str)) {
5681
5682 char *s;
5683 char *p, *pend;
5684 STRLEN charlen = 1;
5685 char * parse_start = name-3; /* needed for the offsets */
5686 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
5687
5688 ret = reg_node(pRExC_state,
5689 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
5690 s= STRING(ret);
5691
5692 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
5693 sv_utf8_upgrade(sv_str);
5694 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
5695 RExC_utf8= 1;
5696 }
5697
5698 p = SvPV(sv_str, len);
5699 pend = p + len;
5700 /* len is the length written, charlen is the size the char read */
5701 for ( len = 0; p < pend; p += charlen ) {
5702 if (UTF) {
196f1508 5703 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
5704 if (FOLD) {
5705 STRLEN foldlen,numlen;
5706 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
5707 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
5708 /* Emit all the Unicode characters. */
5709
5710 for (foldbuf = tmpbuf;
5711 foldlen;
5712 foldlen -= numlen)
5713 {
5714 uvc = utf8_to_uvchr(foldbuf, &numlen);
5715 if (numlen > 0) {
5716 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5717 s += unilen;
5718 len += unilen;
5719 /* In EBCDIC the numlen
5720 * and unilen can differ. */
5721 foldbuf += numlen;
5722 if (numlen >= foldlen)
5723 break;
5724 }
5725 else
5726 break; /* "Can't happen." */
5727 }
5728 } else {
5729 const STRLEN unilen = reguni(pRExC_state, uvc, s);
5730 if (unilen > 0) {
5731 s += unilen;
5732 len += unilen;
5733 }
5734 }
5735 } else {
5736 len++;
5737 REGC(*p, s++);
5738 }
5739 }
5740 if (SIZE_ONLY) {
5741 RExC_size += STR_SZ(len);
5742 } else {
5743 STR_LEN(ret) = len;
5744 RExC_emit += STR_SZ(len);
5745 }
5746 Set_Node_Cur_Length(ret); /* MJD */
5747 RExC_parse--;
5748 nextchar(pRExC_state);
5749 } else {
5750 ret = reg_node(pRExC_state,NOTHING);
5751 }
5752 if (!cached) {
5753 SvREFCNT_dec(sv_str);
5754 }
5755 if (sv_name) {
5756 SvREFCNT_dec(sv_name);
5757 }
5758 return ret;
5759
5760}
5761
5762
5763
a687059c
LW
5764/*
5765 - regatom - the lowest level
5766 *
5767 * Optimization: gobbles an entire sequence of ordinary characters so that
5768 * it can turn them into a single node, which is smaller to store and
5769 * faster to run. Backslashed characters are exceptions, each becoming a
5770 * separate node; the code is simpler that way and it's not worth fixing.
5771 *
7f6f358c
YO
5772 * [Yes, it is worth fixing, some scripts can run twice the speed.]
5773 * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes]
5774 */
76e3520e 5775STATIC regnode *
3dab1dad 5776S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5777{
97aff369 5778 dVAR;
cbbf8932 5779 register regnode *ret = NULL;
a0d0e21e 5780 I32 flags;
45948336 5781 char *parse_start = RExC_parse;
3dab1dad
YO
5782 GET_RE_DEBUG_FLAGS_DECL;
5783 DEBUG_PARSE("atom");
a0d0e21e
LW
5784 *flagp = WORST; /* Tentatively. */
5785
5786tryagain:
830247a4 5787 switch (*RExC_parse) {
a0d0e21e 5788 case '^':
830247a4
IZ
5789 RExC_seen_zerolen++;
5790 nextchar(pRExC_state);
e2509266 5791 if (RExC_flags & PMf_MULTILINE)
830247a4 5792 ret = reg_node(pRExC_state, MBOL);
e2509266 5793 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5794 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5795 else
830247a4 5796 ret = reg_node(pRExC_state, BOL);
fac92740 5797 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5798 break;
5799 case '$':
830247a4 5800 nextchar(pRExC_state);
b81d288d 5801 if (*RExC_parse)
830247a4 5802 RExC_seen_zerolen++;
e2509266 5803 if (RExC_flags & PMf_MULTILINE)
830247a4 5804 ret = reg_node(pRExC_state, MEOL);
e2509266 5805 else if (RExC_flags & PMf_SINGLELINE)
830247a4 5806 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5807 else
830247a4 5808 ret = reg_node(pRExC_state, EOL);
fac92740 5809 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5810 break;
5811 case '.':
830247a4 5812 nextchar(pRExC_state);
e2509266 5813 if (RExC_flags & PMf_SINGLELINE)
ffc61ed2
JH
5814 ret = reg_node(pRExC_state, SANY);
5815 else
5816 ret = reg_node(pRExC_state, REG_ANY);
5817 *flagp |= HASWIDTH|SIMPLE;
830247a4 5818 RExC_naughty++;
fac92740 5819 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
5820 break;
5821 case '[':
b45f050a 5822 {
3dab1dad
YO
5823 char * const oregcomp_parse = ++RExC_parse;
5824 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
5825 if (*RExC_parse != ']') {
5826 RExC_parse = oregcomp_parse;
b45f050a
JF
5827 vFAIL("Unmatched [");
5828 }
830247a4 5829 nextchar(pRExC_state);
a0d0e21e 5830 *flagp |= HASWIDTH|SIMPLE;
fac92740 5831 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 5832 break;
b45f050a 5833 }
a0d0e21e 5834 case '(':
830247a4 5835 nextchar(pRExC_state);
3dab1dad 5836 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 5837 if (ret == NULL) {
bf93d4cc 5838 if (flags & TRYAGAIN) {
830247a4 5839 if (RExC_parse == RExC_end) {
bf93d4cc
GS
5840 /* Make parent create an empty node if needed. */
5841 *flagp |= TRYAGAIN;
5842 return(NULL);
5843 }
a0d0e21e 5844 goto tryagain;
bf93d4cc 5845 }
a0d0e21e
LW
5846 return(NULL);
5847 }
c277df42 5848 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
5849 break;
5850 case '|':
5851 case ')':
5852 if (flags & TRYAGAIN) {
5853 *flagp |= TRYAGAIN;
5854 return NULL;
5855 }
b45f050a 5856 vFAIL("Internal urp");
a0d0e21e
LW
5857 /* Supposed to be caught earlier. */
5858 break;
85afd4ae 5859 case '{':
830247a4
IZ
5860 if (!regcurly(RExC_parse)) {
5861 RExC_parse++;
85afd4ae
CS
5862 goto defchar;
5863 }
5864 /* FALL THROUGH */
a0d0e21e
LW
5865 case '?':
5866 case '+':
5867 case '*':
830247a4 5868 RExC_parse++;
b45f050a 5869 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
5870 break;
5871 case '\\':
830247a4 5872 switch (*++RExC_parse) {
a0d0e21e 5873 case 'A':
830247a4
IZ
5874 RExC_seen_zerolen++;
5875 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 5876 *flagp |= SIMPLE;
830247a4 5877 nextchar(pRExC_state);
fac92740 5878 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5879 break;
5880 case 'G':
830247a4
IZ
5881 ret = reg_node(pRExC_state, GPOS);
5882 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 5883 *flagp |= SIMPLE;
830247a4 5884 nextchar(pRExC_state);
fac92740 5885 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5886 break;
5887 case 'Z':
830247a4 5888 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 5889 *flagp |= SIMPLE;
a1917ab9 5890 RExC_seen_zerolen++; /* Do not optimize RE away */
830247a4 5891 nextchar(pRExC_state);
a0d0e21e 5892 break;
b85d18e9 5893 case 'z':
830247a4 5894 ret = reg_node(pRExC_state, EOS);
b85d18e9 5895 *flagp |= SIMPLE;
830247a4
IZ
5896 RExC_seen_zerolen++; /* Do not optimize RE away */
5897 nextchar(pRExC_state);
fac92740 5898 Set_Node_Length(ret, 2); /* MJD */
b85d18e9 5899 break;
4a2d328f 5900 case 'C':
f33976b4
DB
5901 ret = reg_node(pRExC_state, CANY);
5902 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 5903 *flagp |= HASWIDTH|SIMPLE;
830247a4 5904 nextchar(pRExC_state);
fac92740 5905 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3
LW
5906 break;
5907 case 'X':
830247a4 5908 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 5909 *flagp |= HASWIDTH;
830247a4 5910 nextchar(pRExC_state);
fac92740 5911 Set_Node_Length(ret, 2); /* MJD */
a0ed51b3 5912 break;
a0d0e21e 5913 case 'w':
eb160463 5914 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 5915 *flagp |= HASWIDTH|SIMPLE;
830247a4 5916 nextchar(pRExC_state);
fac92740 5917 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5918 break;
5919 case 'W':
eb160463 5920 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 5921 *flagp |= HASWIDTH|SIMPLE;
830247a4 5922 nextchar(pRExC_state);
fac92740 5923 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5924 break;
5925 case 'b':
830247a4
IZ
5926 RExC_seen_zerolen++;
5927 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5928 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 5929 *flagp |= SIMPLE;
830247a4 5930 nextchar(pRExC_state);
fac92740 5931 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5932 break;
5933 case 'B':
830247a4
IZ
5934 RExC_seen_zerolen++;
5935 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 5936 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 5937 *flagp |= SIMPLE;
830247a4 5938 nextchar(pRExC_state);
fac92740 5939 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5940 break;
5941 case 's':
eb160463 5942 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 5943 *flagp |= HASWIDTH|SIMPLE;
830247a4 5944 nextchar(pRExC_state);
fac92740 5945 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5946 break;
5947 case 'S':
eb160463 5948 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 5949 *flagp |= HASWIDTH|SIMPLE;
830247a4 5950 nextchar(pRExC_state);
fac92740 5951 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5952 break;
5953 case 'd':
ffc61ed2 5954 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 5955 *flagp |= HASWIDTH|SIMPLE;
830247a4 5956 nextchar(pRExC_state);
fac92740 5957 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e
LW
5958 break;
5959 case 'D':
ffc61ed2 5960 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 5961 *flagp |= HASWIDTH|SIMPLE;
830247a4 5962 nextchar(pRExC_state);
fac92740 5963 Set_Node_Length(ret, 2); /* MJD */
a0d0e21e 5964 break;
a14b48bc
LW
5965 case 'p':
5966 case 'P':
3568d838 5967 {
3dab1dad 5968 char* const oldregxend = RExC_end;
ccb2c380 5969 char* parse_start = RExC_parse - 2;
a14b48bc 5970
830247a4 5971 if (RExC_parse[1] == '{') {
3568d838 5972 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
5973 RExC_end = strchr(RExC_parse, '}');
5974 if (!RExC_end) {
3dab1dad 5975 const U8 c = (U8)*RExC_parse;
830247a4
IZ
5976 RExC_parse += 2;
5977 RExC_end = oldregxend;
0da60cf5 5978 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 5979 }
830247a4 5980 RExC_end++;
a14b48bc 5981 }
af6f566e 5982 else {
830247a4 5983 RExC_end = RExC_parse + 2;
af6f566e
HS
5984 if (RExC_end > oldregxend)
5985 RExC_end = oldregxend;
5986 }
830247a4 5987 RExC_parse--;
a14b48bc 5988
3dab1dad 5989 ret = regclass(pRExC_state,depth+1);
a14b48bc 5990
830247a4
IZ
5991 RExC_end = oldregxend;
5992 RExC_parse--;
ccb2c380
MP
5993
5994 Set_Node_Offset(ret, parse_start + 2);
5995 Set_Node_Cur_Length(ret);
830247a4 5996 nextchar(pRExC_state);
a14b48bc
LW
5997 *flagp |= HASWIDTH|SIMPLE;
5998 }
5999 break;
fc8cd66c
YO
6000 case 'N':
6001 /* Handle \N{NAME} here and not below because it can be
6002 multicharacter. join_exact() will join them up later on.
6003 Also this makes sure that things like /\N{BLAH}+/ and
6004 \N{BLAH} being multi char Just Happen. dmq*/
6005 ++RExC_parse;
6006 ret= reg_namedseq(pRExC_state, NULL);
6007 break;
0a4db386 6008 case 'k': /* Handle \k<NAME> and \k'NAME' */
81714fb9
YO
6009 {
6010 char ch= RExC_parse[1];
6011 if (ch != '<' && ch != '\'') {
6012 if (SIZE_ONLY)
6013 vWARN( RExC_parse + 1,
6014 "Possible broken named back reference treated as literal k");
6015 parse_start--;
6016 goto defchar;
6017 } else {
6018 char* name_start = (RExC_parse += 2);
6019 I32 num = 0;
0a4db386
YO
6020 SV *sv_dat = reg_scan_name(pRExC_state,
6021 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
894be9b7
YO
6022 ch= (ch == '<') ? '>' : '\'';
6023
81714fb9
YO
6024 if (RExC_parse == name_start || *RExC_parse != ch)
6025 vFAIL2("Sequence \\k%c... not terminated",
6026 (ch == '>' ? '<' : ch));
6027
6028 RExC_sawback = 1;
6029 ret = reganode(pRExC_state,
6030 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6031 num);
6032 *flagp |= HASWIDTH;
6033
6034
6035 if (!SIZE_ONLY) {
81714fb9
YO
6036 num = add_data( pRExC_state, 1, "S" );
6037 ARG_SET(ret,num);
6038 RExC_rx->data->data[num]=(void*)sv_dat;
6039 SvREFCNT_inc(sv_dat);
6040 }
6041 /* override incorrect value set in reganode MJD */
6042 Set_Node_Offset(ret, parse_start+1);
6043 Set_Node_Cur_Length(ret); /* MJD */
6044 nextchar(pRExC_state);
6045
6046 }
6047 break;
6048 }
a0d0e21e
LW
6049 case 'n':
6050 case 'r':
6051 case 't':
6052 case 'f':
6053 case 'e':
6054 case 'a':
6055 case 'x':
6056 case 'c':
6057 case '0':
6058 goto defchar;
6059 case '1': case '2': case '3': case '4':
6060 case '5': case '6': case '7': case '8': case '9':
6061 {
1df70142 6062 const I32 num = atoi(RExC_parse);
a0d0e21e 6063
830247a4 6064 if (num > 9 && num >= RExC_npar)
a0d0e21e
LW
6065 goto defchar;
6066 else {
3dab1dad 6067 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6068 while (isDIGIT(*RExC_parse))
6069 RExC_parse++;
b45f050a 6070
eb160463 6071 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
9baa0206 6072 vFAIL("Reference to nonexistent group");
830247a4 6073 RExC_sawback = 1;
eb160463
GS
6074 ret = reganode(pRExC_state,
6075 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6076 num);
a0d0e21e 6077 *flagp |= HASWIDTH;
2af232bd 6078
fac92740 6079 /* override incorrect value set in reganode MJD */
2af232bd 6080 Set_Node_Offset(ret, parse_start+1);
fac92740 6081 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6082 RExC_parse--;
6083 nextchar(pRExC_state);
a0d0e21e
LW
6084 }
6085 }
6086 break;
6087 case '\0':
830247a4 6088 if (RExC_parse >= RExC_end)
b45f050a 6089 FAIL("Trailing \\");
a0d0e21e
LW
6090 /* FALL THROUGH */
6091 default:
a0288114 6092 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6093 back into the quick-grab loop below */
45948336 6094 parse_start--;
a0d0e21e
LW
6095 goto defchar;
6096 }
6097 break;
4633a7c4
LW
6098
6099 case '#':
e2509266 6100 if (RExC_flags & PMf_EXTENDED) {
3dab1dad
YO
6101 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6102 RExC_parse++;
830247a4 6103 if (RExC_parse < RExC_end)
4633a7c4
LW
6104 goto tryagain;
6105 }
6106 /* FALL THROUGH */
6107
a0d0e21e 6108 default: {
ba210ebe 6109 register STRLEN len;
58ae7d3f 6110 register UV ender;
a0d0e21e 6111 register char *p;
3dab1dad 6112 char *s;
80aecb99 6113 STRLEN foldlen;
89ebb4a3 6114 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6115
6116 parse_start = RExC_parse - 1;
a0d0e21e 6117
830247a4 6118 RExC_parse++;
a0d0e21e
LW
6119
6120 defchar:
58ae7d3f 6121 ender = 0;
eb160463
GS
6122 ret = reg_node(pRExC_state,
6123 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6124 s = STRING(ret);
830247a4
IZ
6125 for (len = 0, p = RExC_parse - 1;
6126 len < 127 && p < RExC_end;
a0d0e21e
LW
6127 len++)
6128 {
3dab1dad 6129 char * const oldp = p;
5b5a24f7 6130
e2509266 6131 if (RExC_flags & PMf_EXTENDED)
830247a4 6132 p = regwhite(p, RExC_end);
a0d0e21e
LW
6133 switch (*p) {
6134 case '^':
6135 case '$':
6136 case '.':
6137 case '[':
6138 case '(':
6139 case ')':
6140 case '|':
6141 goto loopdone;
6142 case '\\':
6143 switch (*++p) {
6144 case 'A':
1ed8eac0
JF
6145 case 'C':
6146 case 'X':
a0d0e21e
LW
6147 case 'G':
6148 case 'Z':
b85d18e9 6149 case 'z':
a0d0e21e
LW
6150 case 'w':
6151 case 'W':
6152 case 'b':
6153 case 'B':
6154 case 's':
6155 case 'S':
6156 case 'd':
6157 case 'D':
a14b48bc
LW
6158 case 'p':
6159 case 'P':
fc8cd66c 6160 case 'N':
a0d0e21e
LW
6161 --p;
6162 goto loopdone;
6163 case 'n':
6164 ender = '\n';
6165 p++;
a687059c 6166 break;
a0d0e21e
LW
6167 case 'r':
6168 ender = '\r';
6169 p++;
a687059c 6170 break;
a0d0e21e
LW
6171 case 't':
6172 ender = '\t';
6173 p++;
a687059c 6174 break;
a0d0e21e
LW
6175 case 'f':
6176 ender = '\f';
6177 p++;
a687059c 6178 break;
a0d0e21e 6179 case 'e':
c7f1f016 6180 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6181 p++;
a687059c 6182 break;
a0d0e21e 6183 case 'a':
c7f1f016 6184 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6185 p++;
a687059c 6186 break;
a0d0e21e 6187 case 'x':
a0ed51b3 6188 if (*++p == '{') {
1df70142 6189 char* const e = strchr(p, '}');
b81d288d 6190
b45f050a 6191 if (!e) {
830247a4 6192 RExC_parse = p + 1;
b45f050a
JF
6193 vFAIL("Missing right brace on \\x{}");
6194 }
de5f0749 6195 else {
a4c04bdc
NC
6196 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6197 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6198 STRLEN numlen = e - p - 1;
53305cf1 6199 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6200 if (ender > 0xff)
6201 RExC_utf8 = 1;
a0ed51b3
LW
6202 p = e + 1;
6203 }
a0ed51b3
LW
6204 }
6205 else {
a4c04bdc 6206 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6207 STRLEN numlen = 2;
53305cf1 6208 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6209 p += numlen;
6210 }
a687059c 6211 break;
a0d0e21e
LW
6212 case 'c':
6213 p++;
bbce6d69 6214 ender = UCHARAT(p++);
6215 ender = toCTRL(ender);
a687059c 6216 break;
a0d0e21e
LW
6217 case '0': case '1': case '2': case '3':case '4':
6218 case '5': case '6': case '7': case '8':case '9':
6219 if (*p == '0' ||
830247a4 6220 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6221 I32 flags = 0;
1df70142 6222 STRLEN numlen = 3;
53305cf1 6223 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6224 p += numlen;
6225 }
6226 else {
6227 --p;
6228 goto loopdone;
a687059c
LW
6229 }
6230 break;
a0d0e21e 6231 case '\0':
830247a4 6232 if (p >= RExC_end)
b45f050a 6233 FAIL("Trailing \\");
a687059c 6234 /* FALL THROUGH */
a0d0e21e 6235 default:
041457d9 6236 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6237 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6238 goto normal_default;
a0d0e21e
LW
6239 }
6240 break;
a687059c 6241 default:
a0ed51b3 6242 normal_default:
fd400ab9 6243 if (UTF8_IS_START(*p) && UTF) {
1df70142 6244 STRLEN numlen;
5e12f4fb 6245 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6246 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6247 p += numlen;
6248 }
6249 else
6250 ender = *p++;
a0d0e21e 6251 break;
a687059c 6252 }
e2509266 6253 if (RExC_flags & PMf_EXTENDED)
830247a4 6254 p = regwhite(p, RExC_end);
60a8b682
JH
6255 if (UTF && FOLD) {
6256 /* Prime the casefolded buffer. */
ac7e0132 6257 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6258 }
a0d0e21e
LW
6259 if (ISMULT2(p)) { /* Back off on ?+*. */
6260 if (len)
6261 p = oldp;
16ea2a2e 6262 else if (UTF) {
80aecb99 6263 if (FOLD) {
60a8b682 6264 /* Emit all the Unicode characters. */
1df70142 6265 STRLEN numlen;
80aecb99
JH
6266 for (foldbuf = tmpbuf;
6267 foldlen;
6268 foldlen -= numlen) {
6269 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6270 if (numlen > 0) {
71207a34 6271 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6272 s += unilen;
6273 len += unilen;
6274 /* In EBCDIC the numlen
6275 * and unilen can differ. */
9dc45d57 6276 foldbuf += numlen;
47654450
JH
6277 if (numlen >= foldlen)
6278 break;
9dc45d57
JH
6279 }
6280 else
6281 break; /* "Can't happen." */
80aecb99
JH
6282 }
6283 }
6284 else {
71207a34 6285 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6286 if (unilen > 0) {
0ebc6274
JH
6287 s += unilen;
6288 len += unilen;
9dc45d57 6289 }
80aecb99 6290 }
a0ed51b3 6291 }
a0d0e21e
LW
6292 else {
6293 len++;
eb160463 6294 REGC((char)ender, s++);
a0d0e21e
LW
6295 }
6296 break;
a687059c 6297 }
16ea2a2e 6298 if (UTF) {
80aecb99 6299 if (FOLD) {
60a8b682 6300 /* Emit all the Unicode characters. */
1df70142 6301 STRLEN numlen;
80aecb99
JH
6302 for (foldbuf = tmpbuf;
6303 foldlen;
6304 foldlen -= numlen) {
6305 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6306 if (numlen > 0) {
71207a34 6307 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6308 len += unilen;
6309 s += unilen;
6310 /* In EBCDIC the numlen
6311 * and unilen can differ. */
9dc45d57 6312 foldbuf += numlen;
47654450
JH
6313 if (numlen >= foldlen)
6314 break;
9dc45d57
JH
6315 }
6316 else
6317 break;
80aecb99
JH
6318 }
6319 }
6320 else {
71207a34 6321 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6322 if (unilen > 0) {
0ebc6274
JH
6323 s += unilen;
6324 len += unilen;
9dc45d57 6325 }
80aecb99
JH
6326 }
6327 len--;
a0ed51b3
LW
6328 }
6329 else
eb160463 6330 REGC((char)ender, s++);
a0d0e21e
LW
6331 }
6332 loopdone:
830247a4 6333 RExC_parse = p - 1;
fac92740 6334 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6335 nextchar(pRExC_state);
793db0cb
JH
6336 {
6337 /* len is STRLEN which is unsigned, need to copy to signed */
6338 IV iv = len;
6339 if (iv < 0)
6340 vFAIL("Internal disaster");
6341 }
a0d0e21e
LW
6342 if (len > 0)
6343 *flagp |= HASWIDTH;
090f7165 6344 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6345 *flagp |= SIMPLE;
3dab1dad 6346
cd439c50 6347 if (SIZE_ONLY)
830247a4 6348 RExC_size += STR_SZ(len);
3dab1dad
YO
6349 else {
6350 STR_LEN(ret) = len;
830247a4 6351 RExC_emit += STR_SZ(len);
07be1b83 6352 }
3dab1dad 6353 }
a0d0e21e
LW
6354 break;
6355 }
a687059c 6356
60a8b682
JH
6357 /* If the encoding pragma is in effect recode the text of
6358 * any EXACT-kind nodes. */
fc8cd66c 6359 if (ret && PL_encoding && PL_regkind[OP(ret)] == EXACT) {
3dab1dad
YO
6360 const STRLEN oldlen = STR_LEN(ret);
6361 SV * const sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
d0063567
DK
6362
6363 if (RExC_utf8)
6364 SvUTF8_on(sv);
6365 if (sv_utf8_downgrade(sv, TRUE)) {
1df70142
AL
6366 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
6367 const STRLEN newlen = SvCUR(sv);
d0063567
DK
6368
6369 if (SvUTF8(sv))
6370 RExC_utf8 = 1;
6371 if (!SIZE_ONLY) {
a3621e74
YO
6372 GET_RE_DEBUG_FLAGS_DECL;
6373 DEBUG_COMPILE_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
d0063567
DK
6374 (int)oldlen, STRING(ret),
6375 (int)newlen, s));
6376 Copy(s, STRING(ret), newlen, char);
6377 STR_LEN(ret) += newlen - oldlen;
6378 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
6379 } else
6380 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
6381 }
a72c7584
JH
6382 }
6383
a0d0e21e 6384 return(ret);
a687059c
LW
6385}
6386
873ef191 6387STATIC char *
5f66b61c 6388S_regwhite(char *p, const char *e)
5b5a24f7
CS
6389{
6390 while (p < e) {
6391 if (isSPACE(*p))
6392 ++p;
6393 else if (*p == '#') {
6394 do {
6395 p++;
6396 } while (p < e && *p != '\n');
6397 }
6398 else
6399 break;
6400 }
6401 return p;
6402}
6403
b8c5462f
JH
6404/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6405 Character classes ([:foo:]) can also be negated ([:^foo:]).
6406 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6407 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6408 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6409
6410#define POSIXCC_DONE(c) ((c) == ':')
6411#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6412#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6413
b8c5462f 6414STATIC I32
830247a4 6415S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6416{
97aff369 6417 dVAR;
936ed897 6418 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6419
830247a4 6420 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6421 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6422 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6423 const char c = UCHARAT(RExC_parse);
097eb12c 6424 char* const s = RExC_parse++;
b81d288d 6425
9a86a77b 6426 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6427 RExC_parse++;
6428 if (RExC_parse == RExC_end)
620e46c5 6429 /* Grandfather lone [:, [=, [. */
830247a4 6430 RExC_parse = s;
620e46c5 6431 else {
3dab1dad 6432 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6433 assert(*t == c);
6434
9a86a77b 6435 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6436 const char *posixcc = s + 1;
830247a4 6437 RExC_parse++; /* skip over the ending ] */
3dab1dad 6438
b8c5462f 6439 if (*s == ':') {
1df70142
AL
6440 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6441 const I32 skip = t - posixcc;
80916619
NC
6442
6443 /* Initially switch on the length of the name. */
6444 switch (skip) {
6445 case 4:
3dab1dad
YO
6446 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6447 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6448 break;
80916619
NC
6449 case 5:
6450 /* Names all of length 5. */
6451 /* alnum alpha ascii blank cntrl digit graph lower
6452 print punct space upper */
6453 /* Offset 4 gives the best switch position. */
6454 switch (posixcc[4]) {
6455 case 'a':
3dab1dad
YO
6456 if (memEQ(posixcc, "alph", 4)) /* alpha */
6457 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
6458 break;
6459 case 'e':
3dab1dad
YO
6460 if (memEQ(posixcc, "spac", 4)) /* space */
6461 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
6462 break;
6463 case 'h':
3dab1dad
YO
6464 if (memEQ(posixcc, "grap", 4)) /* graph */
6465 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
6466 break;
6467 case 'i':
3dab1dad
YO
6468 if (memEQ(posixcc, "asci", 4)) /* ascii */
6469 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
6470 break;
6471 case 'k':
3dab1dad
YO
6472 if (memEQ(posixcc, "blan", 4)) /* blank */
6473 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
6474 break;
6475 case 'l':
3dab1dad
YO
6476 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
6477 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
6478 break;
6479 case 'm':
3dab1dad
YO
6480 if (memEQ(posixcc, "alnu", 4)) /* alnum */
6481 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
6482 break;
6483 case 'r':
3dab1dad
YO
6484 if (memEQ(posixcc, "lowe", 4)) /* lower */
6485 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
6486 else if (memEQ(posixcc, "uppe", 4)) /* upper */
6487 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
6488 break;
6489 case 't':
3dab1dad
YO
6490 if (memEQ(posixcc, "digi", 4)) /* digit */
6491 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
6492 else if (memEQ(posixcc, "prin", 4)) /* print */
6493 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
6494 else if (memEQ(posixcc, "punc", 4)) /* punct */
6495 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 6496 break;
b8c5462f
JH
6497 }
6498 break;
80916619 6499 case 6:
3dab1dad
YO
6500 if (memEQ(posixcc, "xdigit", 6))
6501 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
6502 break;
6503 }
80916619
NC
6504
6505 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
6506 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
6507 t - s - 1, s + 1);
80916619
NC
6508 assert (posixcc[skip] == ':');
6509 assert (posixcc[skip+1] == ']');
b45f050a 6510 } else if (!SIZE_ONLY) {
b8c5462f 6511 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 6512
830247a4 6513 /* adjust RExC_parse so the warning shows after
b45f050a 6514 the class closes */
9a86a77b 6515 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 6516 RExC_parse++;
b45f050a
JF
6517 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6518 }
b8c5462f
JH
6519 } else {
6520 /* Maternal grandfather:
6521 * "[:" ending in ":" but not in ":]" */
830247a4 6522 RExC_parse = s;
767d463e 6523 }
620e46c5
JH
6524 }
6525 }
6526
b8c5462f
JH
6527 return namedclass;
6528}
6529
6530STATIC void
830247a4 6531S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 6532{
97aff369 6533 dVAR;
3dab1dad 6534 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
6535 const char *s = RExC_parse;
6536 const char c = *s++;
b8c5462f 6537
3dab1dad 6538 while (isALNUM(*s))
b8c5462f
JH
6539 s++;
6540 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
6541 if (ckWARN(WARN_REGEXP))
6542 vWARN3(s+2,
6543 "POSIX syntax [%c %c] belongs inside character classes",
6544 c, c);
b45f050a
JF
6545
6546 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 6547 if (POSIXCC_NOTYET(c)) {
830247a4 6548 /* adjust RExC_parse so the error shows after
b45f050a 6549 the class closes */
9a86a77b 6550 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 6551 NOOP;
b45f050a
JF
6552 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
6553 }
b8c5462f
JH
6554 }
6555 }
620e46c5
JH
6556}
6557
7f6f358c
YO
6558
6559/*
6560 parse a class specification and produce either an ANYOF node that
6561 matches the pattern. If the pattern matches a single char only and
6562 that char is < 256 then we produce an EXACT node instead.
6563*/
76e3520e 6564STATIC regnode *
3dab1dad 6565S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 6566{
97aff369 6567 dVAR;
9ef43ace 6568 register UV value = 0;
9a86a77b 6569 register UV nextvalue;
3568d838 6570 register IV prevvalue = OOB_UNICODE;
ffc61ed2 6571 register IV range = 0;
c277df42 6572 register regnode *ret;
ba210ebe 6573 STRLEN numlen;
ffc61ed2 6574 IV namedclass;
cbbf8932 6575 char *rangebegin = NULL;
936ed897 6576 bool need_class = 0;
c445ea15 6577 SV *listsv = NULL;
ffc61ed2 6578 UV n;
9e55ce06 6579 bool optimize_invert = TRUE;
cbbf8932 6580 AV* unicode_alternate = NULL;
1b2d223b
JH
6581#ifdef EBCDIC
6582 UV literal_endpoint = 0;
6583#endif
7f6f358c 6584 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 6585
3dab1dad 6586 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 6587 case we need to change the emitted regop to an EXACT. */
07be1b83 6588 const char * orig_parse = RExC_parse;
72f13be8 6589 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
6590#ifndef DEBUGGING
6591 PERL_UNUSED_ARG(depth);
6592#endif
72f13be8 6593
3dab1dad 6594 DEBUG_PARSE("clas");
7f6f358c
YO
6595
6596 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
6597 ret = reganode(pRExC_state, ANYOF, 0);
6598
6599 if (!SIZE_ONLY)
6600 ANYOF_FLAGS(ret) = 0;
6601
9a86a77b 6602 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
6603 RExC_naughty++;
6604 RExC_parse++;
6605 if (!SIZE_ONLY)
6606 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
6607 }
a0d0e21e 6608
73060fc4 6609 if (SIZE_ONLY) {
830247a4 6610 RExC_size += ANYOF_SKIP;
73060fc4
JH
6611 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
6612 }
936ed897 6613 else {
830247a4 6614 RExC_emit += ANYOF_SKIP;
936ed897
IZ
6615 if (FOLD)
6616 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
6617 if (LOC)
6618 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 6619 ANYOF_BITMAP_ZERO(ret);
396482e1 6620 listsv = newSVpvs("# comment\n");
a0d0e21e 6621 }
b8c5462f 6622
9a86a77b
JH
6623 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6624
b938889d 6625 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 6626 checkposixcc(pRExC_state);
b8c5462f 6627
f064b6ad
HS
6628 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
6629 if (UCHARAT(RExC_parse) == ']')
6630 goto charclassloop;
ffc61ed2 6631
fc8cd66c 6632parseit:
9a86a77b 6633 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
6634
6635 charclassloop:
6636
6637 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
6638
73b437c8 6639 if (!range)
830247a4 6640 rangebegin = RExC_parse;
ffc61ed2 6641 if (UTF) {
5e12f4fb 6642 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 6643 RExC_end - RExC_parse,
9f7f3913 6644 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6645 RExC_parse += numlen;
6646 }
6647 else
6648 value = UCHARAT(RExC_parse++);
7f6f358c 6649
9a86a77b
JH
6650 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
6651 if (value == '[' && POSIXCC(nextvalue))
830247a4 6652 namedclass = regpposixcc(pRExC_state, value);
620e46c5 6653 else if (value == '\\') {
ffc61ed2 6654 if (UTF) {
5e12f4fb 6655 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 6656 RExC_end - RExC_parse,
9f7f3913 6657 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
6658 RExC_parse += numlen;
6659 }
6660 else
6661 value = UCHARAT(RExC_parse++);
470c3474 6662 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 6663 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
6664 * be a problem later if we want switch on Unicode.
6665 * A similar issue a little bit later when switching on
6666 * namedclass. --jhi */
ffc61ed2 6667 switch ((I32)value) {
b8c5462f
JH
6668 case 'w': namedclass = ANYOF_ALNUM; break;
6669 case 'W': namedclass = ANYOF_NALNUM; break;
6670 case 's': namedclass = ANYOF_SPACE; break;
6671 case 'S': namedclass = ANYOF_NSPACE; break;
6672 case 'd': namedclass = ANYOF_DIGIT; break;
6673 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
6674 case 'N': /* Handle \N{NAME} in class */
6675 {
6676 /* We only pay attention to the first char of
6677 multichar strings being returned. I kinda wonder
6678 if this makes sense as it does change the behaviour
6679 from earlier versions, OTOH that behaviour was broken
6680 as well. */
6681 UV v; /* value is register so we cant & it /grrr */
6682 if (reg_namedseq(pRExC_state, &v)) {
6683 goto parseit;
6684 }
6685 value= v;
6686 }
6687 break;
ffc61ed2
JH
6688 case 'p':
6689 case 'P':
3dab1dad
YO
6690 {
6691 char *e;
af6f566e 6692 if (RExC_parse >= RExC_end)
2a4859cd 6693 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 6694 if (*RExC_parse == '{') {
1df70142 6695 const U8 c = (U8)value;
ffc61ed2
JH
6696 e = strchr(RExC_parse++, '}');
6697 if (!e)
0da60cf5 6698 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
6699 while (isSPACE(UCHARAT(RExC_parse)))
6700 RExC_parse++;
6701 if (e == RExC_parse)
0da60cf5 6702 vFAIL2("Empty \\%c{}", c);
ffc61ed2 6703 n = e - RExC_parse;
ab13f0c7
JH
6704 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
6705 n--;
ffc61ed2
JH
6706 }
6707 else {
6708 e = RExC_parse;
6709 n = 1;
6710 }
6711 if (!SIZE_ONLY) {
ab13f0c7
JH
6712 if (UCHARAT(RExC_parse) == '^') {
6713 RExC_parse++;
6714 n--;
6715 value = value == 'p' ? 'P' : 'p'; /* toggle */
6716 while (isSPACE(UCHARAT(RExC_parse))) {
6717 RExC_parse++;
6718 n--;
6719 }
6720 }
097eb12c
AL
6721 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
6722 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
6723 }
6724 RExC_parse = e + 1;
6725 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 6726 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 6727 }
f81125e2 6728 break;
b8c5462f
JH
6729 case 'n': value = '\n'; break;
6730 case 'r': value = '\r'; break;
6731 case 't': value = '\t'; break;
6732 case 'f': value = '\f'; break;
6733 case 'b': value = '\b'; break;
c7f1f016
NIS
6734 case 'e': value = ASCII_TO_NATIVE('\033');break;
6735 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 6736 case 'x':
ffc61ed2 6737 if (*RExC_parse == '{') {
a4c04bdc
NC
6738 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6739 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 6740 char * const e = strchr(RExC_parse++, '}');
b81d288d 6741 if (!e)
ffc61ed2 6742 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
6743
6744 numlen = e - RExC_parse;
6745 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6746 RExC_parse = e + 1;
6747 }
6748 else {
a4c04bdc 6749 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
6750 numlen = 2;
6751 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
6752 RExC_parse += numlen;
6753 }
b8c5462f
JH
6754 break;
6755 case 'c':
830247a4 6756 value = UCHARAT(RExC_parse++);
b8c5462f
JH
6757 value = toCTRL(value);
6758 break;
6759 case '0': case '1': case '2': case '3': case '4':
6760 case '5': case '6': case '7': case '8': case '9':
53305cf1
NC
6761 {
6762 I32 flags = 0;
6763 numlen = 3;
6764 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
830247a4 6765 RExC_parse += numlen;
b8c5462f 6766 break;
53305cf1 6767 }
1028017a 6768 default:
041457d9 6769 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
6770 vWARN2(RExC_parse,
6771 "Unrecognized escape \\%c in character class passed through",
6772 (int)value);
1028017a 6773 break;
b8c5462f 6774 }
ffc61ed2 6775 } /* end of \blah */
1b2d223b
JH
6776#ifdef EBCDIC
6777 else
6778 literal_endpoint++;
6779#endif
ffc61ed2
JH
6780
6781 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
6782
6783 if (!SIZE_ONLY && !need_class)
936ed897 6784 ANYOF_CLASS_ZERO(ret);
ffc61ed2 6785
936ed897 6786 need_class = 1;
ffc61ed2
JH
6787
6788 /* a bad range like a-\d, a-[:digit:] ? */
6789 if (range) {
73b437c8 6790 if (!SIZE_ONLY) {
afd78fd5 6791 if (ckWARN(WARN_REGEXP)) {
097eb12c 6792 const int w =
afd78fd5
JH
6793 RExC_parse >= rangebegin ?
6794 RExC_parse - rangebegin : 0;
830247a4 6795 vWARN4(RExC_parse,
b45f050a 6796 "False [] range \"%*.*s\"",
097eb12c 6797 w, w, rangebegin);
afd78fd5 6798 }
3568d838
JH
6799 if (prevvalue < 256) {
6800 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
6801 ANYOF_BITMAP_SET(ret, '-');
6802 }
6803 else {
6804 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
6805 Perl_sv_catpvf(aTHX_ listsv,
3568d838 6806 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 6807 }
b8c5462f 6808 }
ffc61ed2
JH
6809
6810 range = 0; /* this was not a true range */
73b437c8 6811 }
ffc61ed2 6812
73b437c8 6813 if (!SIZE_ONLY) {
c49a72a9
NC
6814 const char *what = NULL;
6815 char yesno = 0;
6816
3568d838
JH
6817 if (namedclass > OOB_NAMEDCLASS)
6818 optimize_invert = FALSE;
e2962f66
JH
6819 /* Possible truncation here but in some 64-bit environments
6820 * the compiler gets heartburn about switch on 64-bit values.
6821 * A similar issue a little earlier when switching on value.
98f323fa 6822 * --jhi */
e2962f66 6823 switch ((I32)namedclass) {
73b437c8
JH
6824 case ANYOF_ALNUM:
6825 if (LOC)
936ed897 6826 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
73b437c8
JH
6827 else {
6828 for (value = 0; value < 256; value++)
6829 if (isALNUM(value))
936ed897 6830 ANYOF_BITMAP_SET(ret, value);
73b437c8 6831 }
c49a72a9
NC
6832 yesno = '+';
6833 what = "Word";
73b437c8
JH
6834 break;
6835 case ANYOF_NALNUM:
6836 if (LOC)
936ed897 6837 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
73b437c8
JH
6838 else {
6839 for (value = 0; value < 256; value++)
6840 if (!isALNUM(value))
936ed897 6841 ANYOF_BITMAP_SET(ret, value);
73b437c8 6842 }
c49a72a9
NC
6843 yesno = '!';
6844 what = "Word";
73b437c8 6845 break;
ffc61ed2 6846 case ANYOF_ALNUMC:
73b437c8 6847 if (LOC)
ffc61ed2 6848 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
73b437c8
JH
6849 else {
6850 for (value = 0; value < 256; value++)
ffc61ed2 6851 if (isALNUMC(value))
936ed897 6852 ANYOF_BITMAP_SET(ret, value);
73b437c8 6853 }
c49a72a9
NC
6854 yesno = '+';
6855 what = "Alnum";
73b437c8
JH
6856 break;
6857 case ANYOF_NALNUMC:
6858 if (LOC)
936ed897 6859 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
73b437c8
JH
6860 else {
6861 for (value = 0; value < 256; value++)
6862 if (!isALNUMC(value))
936ed897 6863 ANYOF_BITMAP_SET(ret, value);
73b437c8 6864 }
c49a72a9
NC
6865 yesno = '!';
6866 what = "Alnum";
73b437c8
JH
6867 break;
6868 case ANYOF_ALPHA:
6869 if (LOC)
936ed897 6870 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
73b437c8
JH
6871 else {
6872 for (value = 0; value < 256; value++)
6873 if (isALPHA(value))
936ed897 6874 ANYOF_BITMAP_SET(ret, value);
73b437c8 6875 }
c49a72a9
NC
6876 yesno = '+';
6877 what = "Alpha";
73b437c8
JH
6878 break;
6879 case ANYOF_NALPHA:
6880 if (LOC)
936ed897 6881 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
73b437c8
JH
6882 else {
6883 for (value = 0; value < 256; value++)
6884 if (!isALPHA(value))
936ed897 6885 ANYOF_BITMAP_SET(ret, value);
73b437c8 6886 }
c49a72a9
NC
6887 yesno = '!';
6888 what = "Alpha";
73b437c8
JH
6889 break;
6890 case ANYOF_ASCII:
6891 if (LOC)
936ed897 6892 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 6893 else {
c7f1f016 6894#ifndef EBCDIC
1ba5c669
JH
6895 for (value = 0; value < 128; value++)
6896 ANYOF_BITMAP_SET(ret, value);
6897#else /* EBCDIC */
ffbc6a93 6898 for (value = 0; value < 256; value++) {
3a3c4447
JH
6899 if (isASCII(value))
6900 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6901 }
1ba5c669 6902#endif /* EBCDIC */
73b437c8 6903 }
c49a72a9
NC
6904 yesno = '+';
6905 what = "ASCII";
73b437c8
JH
6906 break;
6907 case ANYOF_NASCII:
6908 if (LOC)
936ed897 6909 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 6910 else {
c7f1f016 6911#ifndef EBCDIC
1ba5c669
JH
6912 for (value = 128; value < 256; value++)
6913 ANYOF_BITMAP_SET(ret, value);
6914#else /* EBCDIC */
ffbc6a93 6915 for (value = 0; value < 256; value++) {
3a3c4447
JH
6916 if (!isASCII(value))
6917 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 6918 }
1ba5c669 6919#endif /* EBCDIC */
73b437c8 6920 }
c49a72a9
NC
6921 yesno = '!';
6922 what = "ASCII";
73b437c8 6923 break;
aaa51d5e
JF
6924 case ANYOF_BLANK:
6925 if (LOC)
6926 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
6927 else {
6928 for (value = 0; value < 256; value++)
6929 if (isBLANK(value))
6930 ANYOF_BITMAP_SET(ret, value);
6931 }
c49a72a9
NC
6932 yesno = '+';
6933 what = "Blank";
aaa51d5e
JF
6934 break;
6935 case ANYOF_NBLANK:
6936 if (LOC)
6937 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
6938 else {
6939 for (value = 0; value < 256; value++)
6940 if (!isBLANK(value))
6941 ANYOF_BITMAP_SET(ret, value);
6942 }
c49a72a9
NC
6943 yesno = '!';
6944 what = "Blank";
aaa51d5e 6945 break;
73b437c8
JH
6946 case ANYOF_CNTRL:
6947 if (LOC)
936ed897 6948 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
73b437c8
JH
6949 else {
6950 for (value = 0; value < 256; value++)
6951 if (isCNTRL(value))
936ed897 6952 ANYOF_BITMAP_SET(ret, value);
73b437c8 6953 }
c49a72a9
NC
6954 yesno = '+';
6955 what = "Cntrl";
73b437c8
JH
6956 break;
6957 case ANYOF_NCNTRL:
6958 if (LOC)
936ed897 6959 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
73b437c8
JH
6960 else {
6961 for (value = 0; value < 256; value++)
6962 if (!isCNTRL(value))
936ed897 6963 ANYOF_BITMAP_SET(ret, value);
73b437c8 6964 }
c49a72a9
NC
6965 yesno = '!';
6966 what = "Cntrl";
ffc61ed2
JH
6967 break;
6968 case ANYOF_DIGIT:
6969 if (LOC)
6970 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
6971 else {
6972 /* consecutive digits assumed */
6973 for (value = '0'; value <= '9'; value++)
6974 ANYOF_BITMAP_SET(ret, value);
6975 }
c49a72a9
NC
6976 yesno = '+';
6977 what = "Digit";
ffc61ed2
JH
6978 break;
6979 case ANYOF_NDIGIT:
6980 if (LOC)
6981 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
6982 else {
6983 /* consecutive digits assumed */
6984 for (value = 0; value < '0'; value++)
6985 ANYOF_BITMAP_SET(ret, value);
6986 for (value = '9' + 1; value < 256; value++)
6987 ANYOF_BITMAP_SET(ret, value);
6988 }
c49a72a9
NC
6989 yesno = '!';
6990 what = "Digit";
73b437c8
JH
6991 break;
6992 case ANYOF_GRAPH:
6993 if (LOC)
936ed897 6994 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
73b437c8
JH
6995 else {
6996 for (value = 0; value < 256; value++)
6997 if (isGRAPH(value))
936ed897 6998 ANYOF_BITMAP_SET(ret, value);
73b437c8 6999 }
c49a72a9
NC
7000 yesno = '+';
7001 what = "Graph";
73b437c8
JH
7002 break;
7003 case ANYOF_NGRAPH:
7004 if (LOC)
936ed897 7005 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
73b437c8
JH
7006 else {
7007 for (value = 0; value < 256; value++)
7008 if (!isGRAPH(value))
936ed897 7009 ANYOF_BITMAP_SET(ret, value);
73b437c8 7010 }
c49a72a9
NC
7011 yesno = '!';
7012 what = "Graph";
73b437c8
JH
7013 break;
7014 case ANYOF_LOWER:
7015 if (LOC)
936ed897 7016 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
73b437c8
JH
7017 else {
7018 for (value = 0; value < 256; value++)
7019 if (isLOWER(value))
936ed897 7020 ANYOF_BITMAP_SET(ret, value);
73b437c8 7021 }
c49a72a9
NC
7022 yesno = '+';
7023 what = "Lower";
73b437c8
JH
7024 break;
7025 case ANYOF_NLOWER:
7026 if (LOC)
936ed897 7027 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
73b437c8
JH
7028 else {
7029 for (value = 0; value < 256; value++)
7030 if (!isLOWER(value))
936ed897 7031 ANYOF_BITMAP_SET(ret, value);
73b437c8 7032 }
c49a72a9
NC
7033 yesno = '!';
7034 what = "Lower";
73b437c8
JH
7035 break;
7036 case ANYOF_PRINT:
7037 if (LOC)
936ed897 7038 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
73b437c8
JH
7039 else {
7040 for (value = 0; value < 256; value++)
7041 if (isPRINT(value))
936ed897 7042 ANYOF_BITMAP_SET(ret, value);
73b437c8 7043 }
c49a72a9
NC
7044 yesno = '+';
7045 what = "Print";
73b437c8
JH
7046 break;
7047 case ANYOF_NPRINT:
7048 if (LOC)
936ed897 7049 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
73b437c8
JH
7050 else {
7051 for (value = 0; value < 256; value++)
7052 if (!isPRINT(value))
936ed897 7053 ANYOF_BITMAP_SET(ret, value);
73b437c8 7054 }
c49a72a9
NC
7055 yesno = '!';
7056 what = "Print";
73b437c8 7057 break;
aaa51d5e
JF
7058 case ANYOF_PSXSPC:
7059 if (LOC)
7060 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
7061 else {
7062 for (value = 0; value < 256; value++)
7063 if (isPSXSPC(value))
7064 ANYOF_BITMAP_SET(ret, value);
7065 }
c49a72a9
NC
7066 yesno = '+';
7067 what = "Space";
aaa51d5e
JF
7068 break;
7069 case ANYOF_NPSXSPC:
7070 if (LOC)
7071 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
7072 else {
7073 for (value = 0; value < 256; value++)
7074 if (!isPSXSPC(value))
7075 ANYOF_BITMAP_SET(ret, value);
7076 }
c49a72a9
NC
7077 yesno = '!';
7078 what = "Space";
aaa51d5e 7079 break;
73b437c8
JH
7080 case ANYOF_PUNCT:
7081 if (LOC)
936ed897 7082 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
73b437c8
JH
7083 else {
7084 for (value = 0; value < 256; value++)
7085 if (isPUNCT(value))
936ed897 7086 ANYOF_BITMAP_SET(ret, value);
73b437c8 7087 }
c49a72a9
NC
7088 yesno = '+';
7089 what = "Punct";
73b437c8
JH
7090 break;
7091 case ANYOF_NPUNCT:
7092 if (LOC)
936ed897 7093 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
73b437c8
JH
7094 else {
7095 for (value = 0; value < 256; value++)
7096 if (!isPUNCT(value))
936ed897 7097 ANYOF_BITMAP_SET(ret, value);
73b437c8 7098 }
c49a72a9
NC
7099 yesno = '!';
7100 what = "Punct";
ffc61ed2
JH
7101 break;
7102 case ANYOF_SPACE:
7103 if (LOC)
7104 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
7105 else {
7106 for (value = 0; value < 256; value++)
7107 if (isSPACE(value))
7108 ANYOF_BITMAP_SET(ret, value);
7109 }
c49a72a9
NC
7110 yesno = '+';
7111 what = "SpacePerl";
ffc61ed2
JH
7112 break;
7113 case ANYOF_NSPACE:
7114 if (LOC)
7115 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
7116 else {
7117 for (value = 0; value < 256; value++)
7118 if (!isSPACE(value))
7119 ANYOF_BITMAP_SET(ret, value);
7120 }
c49a72a9
NC
7121 yesno = '!';
7122 what = "SpacePerl";
73b437c8
JH
7123 break;
7124 case ANYOF_UPPER:
7125 if (LOC)
936ed897 7126 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
73b437c8
JH
7127 else {
7128 for (value = 0; value < 256; value++)
7129 if (isUPPER(value))
936ed897 7130 ANYOF_BITMAP_SET(ret, value);
73b437c8 7131 }
c49a72a9
NC
7132 yesno = '+';
7133 what = "Upper";
73b437c8
JH
7134 break;
7135 case ANYOF_NUPPER:
7136 if (LOC)
936ed897 7137 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
73b437c8
JH
7138 else {
7139 for (value = 0; value < 256; value++)
7140 if (!isUPPER(value))
936ed897 7141 ANYOF_BITMAP_SET(ret, value);
73b437c8 7142 }
c49a72a9
NC
7143 yesno = '!';
7144 what = "Upper";
73b437c8
JH
7145 break;
7146 case ANYOF_XDIGIT:
7147 if (LOC)
936ed897 7148 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
73b437c8
JH
7149 else {
7150 for (value = 0; value < 256; value++)
7151 if (isXDIGIT(value))
936ed897 7152 ANYOF_BITMAP_SET(ret, value);
73b437c8 7153 }
c49a72a9
NC
7154 yesno = '+';
7155 what = "XDigit";
73b437c8
JH
7156 break;
7157 case ANYOF_NXDIGIT:
7158 if (LOC)
936ed897 7159 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
73b437c8
JH
7160 else {
7161 for (value = 0; value < 256; value++)
7162 if (!isXDIGIT(value))
936ed897 7163 ANYOF_BITMAP_SET(ret, value);
73b437c8 7164 }
c49a72a9
NC
7165 yesno = '!';
7166 what = "XDigit";
73b437c8 7167 break;
f81125e2
JP
7168 case ANYOF_MAX:
7169 /* this is to handle \p and \P */
7170 break;
73b437c8 7171 default:
b45f050a 7172 vFAIL("Invalid [::] class");
73b437c8 7173 break;
b8c5462f 7174 }
c49a72a9
NC
7175 if (what) {
7176 /* Strings such as "+utf8::isWord\n" */
7177 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7178 }
b8c5462f 7179 if (LOC)
936ed897 7180 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7181 continue;
a0d0e21e 7182 }
ffc61ed2
JH
7183 } /* end of namedclass \blah */
7184
a0d0e21e 7185 if (range) {
eb160463 7186 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7187 const int w = RExC_parse - rangebegin;
7188 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7189 range = 0; /* not a valid range */
73b437c8 7190 }
a0d0e21e
LW
7191 }
7192 else {
3568d838 7193 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7194 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7195 RExC_parse[1] != ']') {
7196 RExC_parse++;
ffc61ed2
JH
7197
7198 /* a bad range like \w-, [:word:]- ? */
7199 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7200 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7201 const int w =
afd78fd5
JH
7202 RExC_parse >= rangebegin ?
7203 RExC_parse - rangebegin : 0;
830247a4 7204 vWARN4(RExC_parse,
b45f050a 7205 "False [] range \"%*.*s\"",
097eb12c 7206 w, w, rangebegin);
afd78fd5 7207 }
73b437c8 7208 if (!SIZE_ONLY)
936ed897 7209 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7210 } else
ffc61ed2
JH
7211 range = 1; /* yeah, it's a range! */
7212 continue; /* but do it the next time */
a0d0e21e 7213 }
a687059c 7214 }
ffc61ed2 7215
93733859 7216 /* now is the next time */
07be1b83 7217 /*stored += (value - prevvalue + 1);*/
ae5c130c 7218 if (!SIZE_ONLY) {
3568d838 7219 if (prevvalue < 256) {
1df70142 7220 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7221 IV i;
3568d838 7222#ifdef EBCDIC
1b2d223b
JH
7223 /* In EBCDIC [\x89-\x91] should include
7224 * the \x8e but [i-j] should not. */
7225 if (literal_endpoint == 2 &&
7226 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7227 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7228 {
3568d838
JH
7229 if (isLOWER(prevvalue)) {
7230 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7231 if (isLOWER(i))
7232 ANYOF_BITMAP_SET(ret, i);
7233 } else {
3568d838 7234 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7235 if (isUPPER(i))
7236 ANYOF_BITMAP_SET(ret, i);
7237 }
8ada0baa 7238 }
ffc61ed2 7239 else
8ada0baa 7240#endif
07be1b83
YO
7241 for (i = prevvalue; i <= ceilvalue; i++) {
7242 if (!ANYOF_BITMAP_TEST(ret,i)) {
7243 stored++;
7244 ANYOF_BITMAP_SET(ret, i);
7245 }
7246 }
3568d838 7247 }
a5961de5 7248 if (value > 255 || UTF) {
1df70142
AL
7249 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7250 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7251 stored+=2; /* can't optimize this class */
ffc61ed2 7252 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7253 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7254 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7255 prevnatvalue, natvalue);
7256 }
7257 else if (prevnatvalue == natvalue) {
7258 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7259 if (FOLD) {
89ebb4a3 7260 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7261 STRLEN foldlen;
1df70142 7262 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7263
e294cc5d
JH
7264#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7265 if (RExC_precomp[0] == ':' &&
7266 RExC_precomp[1] == '[' &&
7267 (f == 0xDF || f == 0x92)) {
7268 f = NATIVE_TO_UNI(f);
7269 }
7270#endif
c840d2a2
JH
7271 /* If folding and foldable and a single
7272 * character, insert also the folded version
7273 * to the charclass. */
9e55ce06 7274 if (f != value) {
e294cc5d
JH
7275#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7276 if ((RExC_precomp[0] == ':' &&
7277 RExC_precomp[1] == '[' &&
7278 (f == 0xA2 &&
7279 (value == 0xFB05 || value == 0xFB06))) ?
7280 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7281 foldlen == (STRLEN)UNISKIP(f) )
7282#else
eb160463 7283 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7284#endif
9e55ce06
JH
7285 Perl_sv_catpvf(aTHX_ listsv,
7286 "%04"UVxf"\n", f);
7287 else {
7288 /* Any multicharacter foldings
7289 * require the following transform:
7290 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7291 * where E folds into "pq" and F folds
7292 * into "rst", all other characters
7293 * fold to single characters. We save
7294 * away these multicharacter foldings,
7295 * to be later saved as part of the
7296 * additional "s" data. */
7297 SV *sv;
7298
7299 if (!unicode_alternate)
7300 unicode_alternate = newAV();
7301 sv = newSVpvn((char*)foldbuf, foldlen);
7302 SvUTF8_on(sv);
7303 av_push(unicode_alternate, sv);
7304 }
7305 }
254ba52a 7306
60a8b682
JH
7307 /* If folding and the value is one of the Greek
7308 * sigmas insert a few more sigmas to make the
7309 * folding rules of the sigmas to work right.
7310 * Note that not all the possible combinations
7311 * are handled here: some of them are handled
9e55ce06
JH
7312 * by the standard folding rules, and some of
7313 * them (literal or EXACTF cases) are handled
7314 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7315 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7316 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7317 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7318 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7319 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7320 }
7321 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7322 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7323 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7324 }
7325 }
ffc61ed2 7326 }
1b2d223b
JH
7327#ifdef EBCDIC
7328 literal_endpoint = 0;
7329#endif
8ada0baa 7330 }
ffc61ed2
JH
7331
7332 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7333 }
ffc61ed2 7334
936ed897 7335 if (need_class) {
4f66b38d 7336 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7337 if (SIZE_ONLY)
830247a4 7338 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7339 else
830247a4 7340 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7341 }
ffc61ed2 7342
7f6f358c
YO
7343
7344 if (SIZE_ONLY)
7345 return ret;
7346 /****** !SIZE_ONLY AFTER HERE *********/
7347
7348 if( stored == 1 && value < 256
7349 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7350 ) {
7351 /* optimize single char class to an EXACT node
7352 but *only* when its not a UTF/high char */
07be1b83
YO
7353 const char * cur_parse= RExC_parse;
7354 RExC_emit = (regnode *)orig_emit;
7355 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7356 ret = reg_node(pRExC_state,
7357 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7358 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7359 *STRING(ret)= (char)value;
7360 STR_LEN(ret)= 1;
7361 RExC_emit += STR_SZ(1);
7362 return ret;
7363 }
ae5c130c 7364 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7365 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7366 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7367 ) {
a0ed51b3 7368 for (value = 0; value < 256; ++value) {
936ed897 7369 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7370 UV fold = PL_fold[value];
ffc61ed2
JH
7371
7372 if (fold != value)
7373 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7374 }
7375 }
936ed897 7376 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7377 }
ffc61ed2 7378
ae5c130c 7379 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7380 if (optimize_invert &&
ffc61ed2
JH
7381 /* If the only flag is inversion. */
7382 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7383 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7384 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7385 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7386 }
7f6f358c 7387 {
097eb12c 7388 AV * const av = newAV();
ffc61ed2 7389 SV *rv;
9e55ce06 7390 /* The 0th element stores the character class description
6a0407ee 7391 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7392 * to initialize the appropriate swash (which gets stored in
7393 * the 1st element), and also useful for dumping the regnode.
7394 * The 2nd element stores the multicharacter foldings,
6a0407ee 7395 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7396 av_store(av, 0, listsv);
7397 av_store(av, 1, NULL);
9e55ce06 7398 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7399 rv = newRV_noinc((SV*)av);
19860706 7400 n = add_data(pRExC_state, 1, "s");
830247a4 7401 RExC_rx->data->data[n] = (void*)rv;
ffc61ed2 7402 ARG_SET(ret, n);
a0ed51b3 7403 }
a0ed51b3
LW
7404 return ret;
7405}
7406
76e3520e 7407STATIC char*
830247a4 7408S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7409{
097eb12c 7410 char* const retval = RExC_parse++;
a0d0e21e 7411
4633a7c4 7412 for (;;) {
830247a4
IZ
7413 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7414 RExC_parse[2] == '#') {
e994fd66
AE
7415 while (*RExC_parse != ')') {
7416 if (RExC_parse == RExC_end)
7417 FAIL("Sequence (?#... not terminated");
830247a4 7418 RExC_parse++;
e994fd66 7419 }
830247a4 7420 RExC_parse++;
4633a7c4
LW
7421 continue;
7422 }
e2509266 7423 if (RExC_flags & PMf_EXTENDED) {
830247a4
IZ
7424 if (isSPACE(*RExC_parse)) {
7425 RExC_parse++;
748a9306
LW
7426 continue;
7427 }
830247a4 7428 else if (*RExC_parse == '#') {
e994fd66
AE
7429 while (RExC_parse < RExC_end)
7430 if (*RExC_parse++ == '\n') break;
748a9306
LW
7431 continue;
7432 }
748a9306 7433 }
4633a7c4 7434 return retval;
a0d0e21e 7435 }
a687059c
LW
7436}
7437
7438/*
c277df42 7439- reg_node - emit a node
a0d0e21e 7440*/
76e3520e 7441STATIC regnode * /* Location. */
830247a4 7442S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7443{
97aff369 7444 dVAR;
c277df42 7445 register regnode *ptr;
504618e9 7446 regnode * const ret = RExC_emit;
07be1b83 7447 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7448
c277df42 7449 if (SIZE_ONLY) {
830247a4
IZ
7450 SIZE_ALIGN(RExC_size);
7451 RExC_size += 1;
a0d0e21e
LW
7452 return(ret);
7453 }
c277df42 7454 NODE_ALIGN_FILL(ret);
a0d0e21e 7455 ptr = ret;
c277df42 7456 FILL_ADVANCE_NODE(ptr, op);
fac92740 7457 if (RExC_offsets) { /* MJD */
07be1b83 7458 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7459 "reg_node", __LINE__,
7460 reg_name[op],
07be1b83
YO
7461 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7462 ? "Overwriting end of array!\n" : "OK",
7463 (UV)(RExC_emit - RExC_emit_start),
7464 (UV)(RExC_parse - RExC_start),
7465 (UV)RExC_offsets[0]));
ccb2c380 7466 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7467 }
07be1b83 7468
830247a4 7469 RExC_emit = ptr;
a687059c 7470
a0d0e21e 7471 return(ret);
a687059c
LW
7472}
7473
7474/*
a0d0e21e
LW
7475- reganode - emit a node with an argument
7476*/
76e3520e 7477STATIC regnode * /* Location. */
830247a4 7478S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7479{
97aff369 7480 dVAR;
c277df42 7481 register regnode *ptr;
504618e9 7482 regnode * const ret = RExC_emit;
07be1b83 7483 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7484
c277df42 7485 if (SIZE_ONLY) {
830247a4
IZ
7486 SIZE_ALIGN(RExC_size);
7487 RExC_size += 2;
6bda09f9
YO
7488 /*
7489 We can't do this:
7490
7491 assert(2==regarglen[op]+1);
7492
7493 Anything larger than this has to allocate the extra amount.
7494 If we changed this to be:
7495
7496 RExC_size += (1 + regarglen[op]);
7497
7498 then it wouldn't matter. Its not clear what side effect
7499 might come from that so its not done so far.
7500 -- dmq
7501 */
a0d0e21e
LW
7502 return(ret);
7503 }
fe14fcc3 7504
c277df42 7505 NODE_ALIGN_FILL(ret);
a0d0e21e 7506 ptr = ret;
c277df42 7507 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
fac92740 7508 if (RExC_offsets) { /* MJD */
07be1b83 7509 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7510 "reganode",
ccb2c380
MP
7511 __LINE__,
7512 reg_name[op],
07be1b83 7513 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7514 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7515 (UV)(RExC_emit - RExC_emit_start),
7516 (UV)(RExC_parse - RExC_start),
7517 (UV)RExC_offsets[0]));
ccb2c380 7518 Set_Cur_Node_Offset;
fac92740
MJD
7519 }
7520
830247a4 7521 RExC_emit = ptr;
fe14fcc3 7522
a0d0e21e 7523 return(ret);
fe14fcc3
LW
7524}
7525
7526/*
cd439c50 7527- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7528*/
71207a34
AL
7529STATIC STRLEN
7530S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7531{
97aff369 7532 dVAR;
71207a34 7533 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7534}
7535
7536/*
a0d0e21e
LW
7537- reginsert - insert an operator in front of already-emitted operand
7538*
7539* Means relocating the operand.
7540*/
76e3520e 7541STATIC void
6bda09f9 7542S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7543{
97aff369 7544 dVAR;
c277df42
IZ
7545 register regnode *src;
7546 register regnode *dst;
7547 register regnode *place;
504618e9 7548 const int offset = regarglen[(U8)op];
6bda09f9 7549 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7550 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7551/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7552 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7553 if (SIZE_ONLY) {
6bda09f9 7554 RExC_size += size;
a0d0e21e
LW
7555 return;
7556 }
a687059c 7557
830247a4 7558 src = RExC_emit;
6bda09f9 7559 RExC_emit += size;
830247a4 7560 dst = RExC_emit;
40d049e4 7561 if (RExC_open_parens) {
6bda09f9 7562 int paren;
40d049e4 7563 DEBUG_PARSE_FMT("inst"," - %d",RExC_npar);
6bda09f9 7564 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7565 if ( RExC_open_parens[paren] >= opnd ) {
7566 DEBUG_PARSE_FMT("open"," - %d",size);
7567 RExC_open_parens[paren] += size;
7568 } else {
7569 DEBUG_PARSE_FMT("open"," - %s","ok");
7570 }
7571 if ( RExC_close_parens[paren] >= opnd ) {
7572 DEBUG_PARSE_FMT("close"," - %d",size);
7573 RExC_close_parens[paren] += size;
7574 } else {
7575 DEBUG_PARSE_FMT("close"," - %s","ok");
7576 }
7577 }
6bda09f9 7578 }
40d049e4 7579
fac92740 7580 while (src > opnd) {
c277df42 7581 StructCopy(--src, --dst, regnode);
fac92740 7582 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7583 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7584 "reg_insert",
ccb2c380
MP
7585 __LINE__,
7586 reg_name[op],
07be1b83
YO
7587 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7588 ? "Overwriting end of array!\n" : "OK",
7589 (UV)(src - RExC_emit_start),
7590 (UV)(dst - RExC_emit_start),
7591 (UV)RExC_offsets[0]));
ccb2c380
MP
7592 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7593 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740
MJD
7594 }
7595 }
7596
a0d0e21e
LW
7597
7598 place = opnd; /* Op node, where operand used to be. */
fac92740 7599 if (RExC_offsets) { /* MJD */
07be1b83 7600 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7601 "reginsert",
ccb2c380
MP
7602 __LINE__,
7603 reg_name[op],
07be1b83 7604 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7605 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7606 (UV)(place - RExC_emit_start),
7607 (UV)(RExC_parse - RExC_start),
786e8c11 7608 (UV)RExC_offsets[0]));
ccb2c380 7609 Set_Node_Offset(place, RExC_parse);
45948336 7610 Set_Node_Length(place, 1);
fac92740 7611 }
c277df42
IZ
7612 src = NEXTOPER(place);
7613 FILL_ADVANCE_NODE(place, op);
7614 Zero(src, offset, regnode);
a687059c
LW
7615}
7616
7617/*
c277df42 7618- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7619- SEE ALSO: regtail_study
a0d0e21e 7620*/
097eb12c 7621/* TODO: All three parms should be const */
76e3520e 7622STATIC void
3dab1dad 7623S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7624{
97aff369 7625 dVAR;
c277df42 7626 register regnode *scan;
72f13be8 7627 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7628#ifndef DEBUGGING
7629 PERL_UNUSED_ARG(depth);
7630#endif
a0d0e21e 7631
c277df42 7632 if (SIZE_ONLY)
a0d0e21e
LW
7633 return;
7634
7635 /* Find last node. */
7636 scan = p;
7637 for (;;) {
504618e9 7638 regnode * const temp = regnext(scan);
3dab1dad
YO
7639 DEBUG_PARSE_r({
7640 SV * const mysv=sv_newmortal();
7641 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7642 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7643 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7644 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7645 (temp == NULL ? "->" : ""),
7646 (temp == NULL ? reg_name[OP(val)] : "")
7647 );
3dab1dad
YO
7648 });
7649 if (temp == NULL)
7650 break;
7651 scan = temp;
7652 }
7653
7654 if (reg_off_by_arg[OP(scan)]) {
7655 ARG_SET(scan, val - scan);
7656 }
7657 else {
7658 NEXT_OFF(scan) = val - scan;
7659 }
7660}
7661
07be1b83 7662#ifdef DEBUGGING
3dab1dad
YO
7663/*
7664- regtail_study - set the next-pointer at the end of a node chain of p to val.
7665- Look for optimizable sequences at the same time.
7666- currently only looks for EXACT chains.
07be1b83
YO
7667
7668This is expermental code. The idea is to use this routine to perform
7669in place optimizations on branches and groups as they are constructed,
7670with the long term intention of removing optimization from study_chunk so
7671that it is purely analytical.
7672
7673Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
7674to control which is which.
7675
3dab1dad
YO
7676*/
7677/* TODO: All four parms should be const */
07be1b83 7678
3dab1dad
YO
7679STATIC U8
7680S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
7681{
7682 dVAR;
7683 register regnode *scan;
07be1b83
YO
7684 U8 exact = PSEUDO;
7685#ifdef EXPERIMENTAL_INPLACESCAN
7686 I32 min = 0;
7687#endif
7688
3dab1dad
YO
7689 GET_RE_DEBUG_FLAGS_DECL;
7690
07be1b83 7691
3dab1dad
YO
7692 if (SIZE_ONLY)
7693 return exact;
7694
7695 /* Find last node. */
7696
7697 scan = p;
7698 for (;;) {
7699 regnode * const temp = regnext(scan);
07be1b83
YO
7700#ifdef EXPERIMENTAL_INPLACESCAN
7701 if (PL_regkind[OP(scan)] == EXACT)
7702 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
7703 return EXACT;
7704#endif
3dab1dad
YO
7705 if ( exact ) {
7706 switch (OP(scan)) {
7707 case EXACT:
7708 case EXACTF:
7709 case EXACTFL:
7710 if( exact == PSEUDO )
7711 exact= OP(scan);
07be1b83
YO
7712 else if ( exact != OP(scan) )
7713 exact= 0;
3dab1dad
YO
7714 case NOTHING:
7715 break;
7716 default:
7717 exact= 0;
7718 }
7719 }
7720 DEBUG_PARSE_r({
7721 SV * const mysv=sv_newmortal();
7722 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
7723 regprop(RExC_rx, mysv, scan);
eaf3ca90 7724 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 7725 SvPV_nolen_const(mysv),
eaf3ca90
YO
7726 REG_NODE_NUM(scan),
7727 reg_name[exact]);
3dab1dad 7728 });
a0d0e21e
LW
7729 if (temp == NULL)
7730 break;
7731 scan = temp;
7732 }
07be1b83
YO
7733 DEBUG_PARSE_r({
7734 SV * const mysv_val=sv_newmortal();
7735 DEBUG_PARSE_MSG("");
7736 regprop(RExC_rx, mysv_val, val);
7737 PerlIO_printf(Perl_debug_log, "~ attach to %s (%d) offset to %d\n",
7738 SvPV_nolen_const(mysv_val),
7739 REG_NODE_NUM(val),
7740 val - scan
7741 );
7742 });
c277df42
IZ
7743 if (reg_off_by_arg[OP(scan)]) {
7744 ARG_SET(scan, val - scan);
a0ed51b3
LW
7745 }
7746 else {
c277df42
IZ
7747 NEXT_OFF(scan) = val - scan;
7748 }
3dab1dad
YO
7749
7750 return exact;
a687059c 7751}
07be1b83 7752#endif
a687059c
LW
7753
7754/*
a687059c
LW
7755 - regcurly - a little FSA that accepts {\d+,?\d*}
7756 */
79072805 7757STATIC I32
5f66b61c 7758S_regcurly(register const char *s)
a687059c
LW
7759{
7760 if (*s++ != '{')
7761 return FALSE;
f0fcb552 7762 if (!isDIGIT(*s))
a687059c 7763 return FALSE;
f0fcb552 7764 while (isDIGIT(*s))
a687059c
LW
7765 s++;
7766 if (*s == ',')
7767 s++;
f0fcb552 7768 while (isDIGIT(*s))
a687059c
LW
7769 s++;
7770 if (*s != '}')
7771 return FALSE;
7772 return TRUE;
7773}
7774
a687059c
LW
7775
7776/*
fd181c75 7777 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
7778 */
7779void
097eb12c 7780Perl_regdump(pTHX_ const regexp *r)
a687059c 7781{
35ff7856 7782#ifdef DEBUGGING
97aff369 7783 dVAR;
c445ea15 7784 SV * const sv = sv_newmortal();
ab3bbdeb 7785 SV *dsv= sv_newmortal();
a687059c 7786
786e8c11 7787 (void)dumpuntil(r, r->program, r->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
7788
7789 /* Header fields of interest. */
ab3bbdeb
YO
7790 if (r->anchored_substr) {
7791 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
7792 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 7793 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7794 "anchored %s%s at %"IVdf" ",
7795 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 7796 (IV)r->anchored_offset);
ab3bbdeb
YO
7797 } else if (r->anchored_utf8) {
7798 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
7799 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 7800 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7801 "anchored utf8 %s%s at %"IVdf" ",
7802 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 7803 (IV)r->anchored_offset);
ab3bbdeb
YO
7804 }
7805 if (r->float_substr) {
7806 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
7807 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 7808 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7809 "floating %s%s at %"IVdf"..%"UVuf" ",
7810 s, RE_SV_TAIL(r->float_substr),
7b0972df 7811 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
7812 } else if (r->float_utf8) {
7813 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
7814 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 7815 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
7816 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
7817 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 7818 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 7819 }
33b8afdf 7820 if (r->check_substr || r->check_utf8)
b81d288d 7821 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
7822 (const char *)
7823 (r->check_substr == r->float_substr
7824 && r->check_utf8 == r->float_utf8
7825 ? "(checking floating" : "(checking anchored"));
c277df42
IZ
7826 if (r->reganch & ROPT_NOSCAN)
7827 PerlIO_printf(Perl_debug_log, " noscan");
7828 if (r->reganch & ROPT_CHECK_ALL)
7829 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 7830 if (r->check_substr || r->check_utf8)
c277df42
IZ
7831 PerlIO_printf(Perl_debug_log, ") ");
7832
46fc3d4c 7833 if (r->regstclass) {
32fc9b6a 7834 regprop(r, sv, r->regstclass);
1de06328 7835 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 7836 }
774d564b 7837 if (r->reganch & ROPT_ANCH) {
7838 PerlIO_printf(Perl_debug_log, "anchored");
7839 if (r->reganch & ROPT_ANCH_BOL)
7840 PerlIO_printf(Perl_debug_log, "(BOL)");
c277df42
IZ
7841 if (r->reganch & ROPT_ANCH_MBOL)
7842 PerlIO_printf(Perl_debug_log, "(MBOL)");
cad2e5aa
JH
7843 if (r->reganch & ROPT_ANCH_SBOL)
7844 PerlIO_printf(Perl_debug_log, "(SBOL)");
774d564b 7845 if (r->reganch & ROPT_ANCH_GPOS)
7846 PerlIO_printf(Perl_debug_log, "(GPOS)");
7847 PerlIO_putc(Perl_debug_log, ' ');
7848 }
c277df42
IZ
7849 if (r->reganch & ROPT_GPOS_SEEN)
7850 PerlIO_printf(Perl_debug_log, "GPOS ");
a0d0e21e 7851 if (r->reganch & ROPT_SKIP)
760ac839 7852 PerlIO_printf(Perl_debug_log, "plus ");
a0d0e21e 7853 if (r->reganch & ROPT_IMPLICIT)
760ac839 7854 PerlIO_printf(Perl_debug_log, "implicit ");
760ac839 7855 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
ce862d02
IZ
7856 if (r->reganch & ROPT_EVAL_SEEN)
7857 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 7858 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 7859#else
96a5add6 7860 PERL_UNUSED_CONTEXT;
65e66c80 7861 PERL_UNUSED_ARG(r);
17c3b450 7862#endif /* DEBUGGING */
a687059c
LW
7863}
7864
7865/*
a0d0e21e
LW
7866- regprop - printable representation of opcode
7867*/
46fc3d4c 7868void
32fc9b6a 7869Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 7870{
35ff7856 7871#ifdef DEBUGGING
97aff369 7872 dVAR;
9b155405 7873 register int k;
1de06328 7874 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 7875
54dc92de 7876 sv_setpvn(sv, "", 0);
03363afd 7877 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
7878 /* It would be nice to FAIL() here, but this may be called from
7879 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 7880 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 7881 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 7882
3dab1dad 7883 k = PL_regkind[OP(o)];
9b155405 7884
2a782b5b 7885 if (k == EXACT) {
396482e1 7886 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
7887 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
7888 * is a crude hack but it may be the best for now since
7889 * we have no flag "this EXACTish node was UTF-8"
7890 * --jhi */
7891 const char * const s =
ddc5bc0f 7892 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
7893 PL_colors[0], PL_colors[1],
7894 PERL_PV_ESCAPE_UNI_DETECT |
7895 PERL_PV_PRETTY_ELIPSES |
7896 PERL_PV_PRETTY_LTGT
7897 );
7898 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 7899 } else if (k == TRIE) {
3dab1dad 7900 /* print the details of the trie in dumpuntil instead, as
4f639d21 7901 * prog->data isn't available here */
1de06328
YO
7902 const char op = OP(o);
7903 const I32 n = ARG(o);
7904 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
7905 (reg_ac_data *)prog->data->data[n] :
7906 NULL;
7907 const reg_trie_data * const trie = !IS_TRIE_AC(op) ?
7908 (reg_trie_data*)prog->data->data[n] :
7909 ac->trie;
7910
7911 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
7912 DEBUG_TRIE_COMPILE_r(
7913 Perl_sv_catpvf(aTHX_ sv,
7914 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
7915 (UV)trie->startstate,
1e2e3d02 7916 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
7917 (UV)trie->wordcount,
7918 (UV)trie->minlen,
7919 (UV)trie->maxlen,
7920 (UV)TRIE_CHARCOUNT(trie),
7921 (UV)trie->uniquecharcount
7922 )
7923 );
7924 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
7925 int i;
7926 int rangestart = -1;
f46cb337 7927 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
7928 Perl_sv_catpvf(aTHX_ sv, "[");
7929 for (i = 0; i <= 256; i++) {
7930 if (i < 256 && BITMAP_TEST(bitmap,i)) {
7931 if (rangestart == -1)
7932 rangestart = i;
7933 } else if (rangestart != -1) {
7934 if (i <= rangestart + 3)
7935 for (; rangestart < i; rangestart++)
7936 put_byte(sv, rangestart);
7937 else {
7938 put_byte(sv, rangestart);
7939 sv_catpvs(sv, "-");
7940 put_byte(sv, i - 1);
7941 }
7942 rangestart = -1;
7943 }
7944 }
7945 Perl_sv_catpvf(aTHX_ sv, "]");
7946 }
7947
a3621e74 7948 } else if (k == CURLY) {
cb434fcc 7949 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
7950 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
7951 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 7952 }
2c2d71f5
JH
7953 else if (k == WHILEM && o->flags) /* Ordinal/of */
7954 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
6bda09f9 7955 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP)
894356b3 7956 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1a147d38 7957 else if (k == GOSUB)
6bda09f9 7958 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9b155405 7959 else if (k == LOGICAL)
04ebc1ab 7960 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
7961 else if (k == ANYOF) {
7962 int i, rangestart = -1;
2d03de9c 7963 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
7964
7965 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
7966 static const char * const anyofs[] = {
653099ff
GS
7967 "\\w",
7968 "\\W",
7969 "\\s",
7970 "\\S",
7971 "\\d",
7972 "\\D",
7973 "[:alnum:]",
7974 "[:^alnum:]",
7975 "[:alpha:]",
7976 "[:^alpha:]",
7977 "[:ascii:]",
7978 "[:^ascii:]",
7979 "[:ctrl:]",
7980 "[:^ctrl:]",
7981 "[:graph:]",
7982 "[:^graph:]",
7983 "[:lower:]",
7984 "[:^lower:]",
7985 "[:print:]",
7986 "[:^print:]",
7987 "[:punct:]",
7988 "[:^punct:]",
7989 "[:upper:]",
aaa51d5e 7990 "[:^upper:]",
653099ff 7991 "[:xdigit:]",
aaa51d5e
JF
7992 "[:^xdigit:]",
7993 "[:space:]",
7994 "[:^space:]",
7995 "[:blank:]",
7996 "[:^blank:]"
653099ff
GS
7997 };
7998
19860706 7999 if (flags & ANYOF_LOCALE)
396482e1 8000 sv_catpvs(sv, "{loc}");
19860706 8001 if (flags & ANYOF_FOLD)
396482e1 8002 sv_catpvs(sv, "{i}");
653099ff 8003 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8004 if (flags & ANYOF_INVERT)
396482e1 8005 sv_catpvs(sv, "^");
ffc61ed2
JH
8006 for (i = 0; i <= 256; i++) {
8007 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8008 if (rangestart == -1)
8009 rangestart = i;
8010 } else if (rangestart != -1) {
8011 if (i <= rangestart + 3)
8012 for (; rangestart < i; rangestart++)
653099ff 8013 put_byte(sv, rangestart);
ffc61ed2
JH
8014 else {
8015 put_byte(sv, rangestart);
396482e1 8016 sv_catpvs(sv, "-");
ffc61ed2 8017 put_byte(sv, i - 1);
653099ff 8018 }
ffc61ed2 8019 rangestart = -1;
653099ff 8020 }
847a199f 8021 }
ffc61ed2
JH
8022
8023 if (o->flags & ANYOF_CLASS)
bb7a0f54 8024 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8025 if (ANYOF_CLASS_TEST(o,i))
8026 sv_catpv(sv, anyofs[i]);
8027
8028 if (flags & ANYOF_UNICODE)
396482e1 8029 sv_catpvs(sv, "{unicode}");
1aa99e6b 8030 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8031 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8032
8033 {
8034 SV *lv;
32fc9b6a 8035 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8036
ffc61ed2
JH
8037 if (lv) {
8038 if (sw) {
89ebb4a3 8039 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8040
ffc61ed2 8041 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8042 uvchr_to_utf8(s, i);
ffc61ed2 8043
3568d838 8044 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8045 if (rangestart == -1)
8046 rangestart = i;
8047 } else if (rangestart != -1) {
ffc61ed2
JH
8048 if (i <= rangestart + 3)
8049 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8050 const U8 * const e = uvchr_to_utf8(s,rangestart);
8051 U8 *p;
8052 for(p = s; p < e; p++)
ffc61ed2
JH
8053 put_byte(sv, *p);
8054 }
8055 else {
2d03de9c
AL
8056 const U8 *e = uvchr_to_utf8(s,rangestart);
8057 U8 *p;
8058 for (p = s; p < e; p++)
ffc61ed2 8059 put_byte(sv, *p);
396482e1 8060 sv_catpvs(sv, "-");
2d03de9c
AL
8061 e = uvchr_to_utf8(s, i-1);
8062 for (p = s; p < e; p++)
1df70142 8063 put_byte(sv, *p);
ffc61ed2
JH
8064 }
8065 rangestart = -1;
8066 }
19860706 8067 }
ffc61ed2 8068
396482e1 8069 sv_catpvs(sv, "..."); /* et cetera */
19860706 8070 }
fde631ed 8071
ffc61ed2 8072 {
2e0de35c 8073 char *s = savesvpv(lv);
c445ea15 8074 char * const origs = s;
b81d288d 8075
3dab1dad
YO
8076 while (*s && *s != '\n')
8077 s++;
b81d288d 8078
ffc61ed2 8079 if (*s == '\n') {
2d03de9c 8080 const char * const t = ++s;
ffc61ed2
JH
8081
8082 while (*s) {
8083 if (*s == '\n')
8084 *s = ' ';
8085 s++;
8086 }
8087 if (s[-1] == ' ')
8088 s[-1] = 0;
8089
8090 sv_catpv(sv, t);
fde631ed 8091 }
b81d288d 8092
ffc61ed2 8093 Safefree(origs);
fde631ed
JH
8094 }
8095 }
653099ff 8096 }
ffc61ed2 8097
653099ff
GS
8098 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8099 }
9b155405 8100 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8101 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8102#else
96a5add6 8103 PERL_UNUSED_CONTEXT;
65e66c80
SP
8104 PERL_UNUSED_ARG(sv);
8105 PERL_UNUSED_ARG(o);
f9049ba1 8106 PERL_UNUSED_ARG(prog);
17c3b450 8107#endif /* DEBUGGING */
35ff7856 8108}
a687059c 8109
cad2e5aa
JH
8110SV *
8111Perl_re_intuit_string(pTHX_ regexp *prog)
8112{ /* Assume that RE_INTUIT is set */
97aff369 8113 dVAR;
a3621e74 8114 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8115 PERL_UNUSED_CONTEXT;
8116
a3621e74 8117 DEBUG_COMPILE_r(
cfd0369c 8118 {
2d03de9c 8119 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8120 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8121
8122 if (!PL_colorset) reginitcolors();
8123 PerlIO_printf(Perl_debug_log,
a0288114 8124 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8125 PL_colors[4],
8126 prog->check_substr ? "" : "utf8 ",
8127 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8128 s,
8129 PL_colors[1],
8130 (strlen(s) > 60 ? "..." : ""));
8131 } );
8132
33b8afdf 8133 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8134}
8135
84da74a7
YO
8136/*
8137 pregfree - free a regexp
8138
8139 See regdupe below if you change anything here.
8140*/
8141
2b69d0c2 8142void
864dbfa3 8143Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8144{
27da23d5 8145 dVAR;
0df25f3d 8146
fc32ee4a 8147 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8148
7821416a
IZ
8149 if (!r || (--r->refcnt > 0))
8150 return;
ab3bbdeb 8151 DEBUG_COMPILE_r({
0df25f3d
YO
8152 if (!PL_colorset)
8153 reginitcolors();
ab3bbdeb
YO
8154 if (RX_DEBUG(r)){
8155 SV *dsv= sv_newmortal();
8156 RE_PV_QUOTED_DECL(s, (r->reganch & ROPT_UTF8),
8157 dsv, r->precomp, r->prelen, 60);
8158 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8159 PL_colors[4],PL_colors[5],s);
8160 }
9e55ce06 8161 });
cad2e5aa 8162
43c5f42d
NC
8163 /* gcov results gave these as non-null 100% of the time, so there's no
8164 optimisation in checking them before calling Safefree */
8165 Safefree(r->precomp);
8166 Safefree(r->offsets); /* 20010421 MJD */
ed252734 8167 RX_MATCH_COPY_FREE(r);
f8c7b90f 8168#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8169 if (r->saved_copy)
8170 SvREFCNT_dec(r->saved_copy);
8171#endif
a193d654
GS
8172 if (r->substrs) {
8173 if (r->anchored_substr)
8174 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8175 if (r->anchored_utf8)
8176 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8177 if (r->float_substr)
8178 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8179 if (r->float_utf8)
8180 SvREFCNT_dec(r->float_utf8);
2779dcf1 8181 Safefree(r->substrs);
a193d654 8182 }
81714fb9
YO
8183 if (r->paren_names)
8184 SvREFCNT_dec(r->paren_names);
c277df42
IZ
8185 if (r->data) {
8186 int n = r->data->count;
f3548bdc
DM
8187 PAD* new_comppad = NULL;
8188 PAD* old_comppad;
4026c95a 8189 PADOFFSET refcnt;
dfad63ad 8190
c277df42 8191 while (--n >= 0) {
261faec3 8192 /* If you add a ->what type here, update the comment in regcomp.h */
c277df42
IZ
8193 switch (r->data->what[n]) {
8194 case 's':
81714fb9 8195 case 'S':
c277df42
IZ
8196 SvREFCNT_dec((SV*)r->data->data[n]);
8197 break;
653099ff
GS
8198 case 'f':
8199 Safefree(r->data->data[n]);
8200 break;
dfad63ad
HS
8201 case 'p':
8202 new_comppad = (AV*)r->data->data[n];
8203 break;
c277df42 8204 case 'o':
dfad63ad 8205 if (new_comppad == NULL)
cea2e8a9 8206 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8207 PAD_SAVE_LOCAL(old_comppad,
8208 /* Watch out for global destruction's random ordering. */
c445ea15 8209 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8210 );
b34c0dd4 8211 OP_REFCNT_LOCK;
4026c95a
SH
8212 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
8213 OP_REFCNT_UNLOCK;
8214 if (!refcnt)
9b978d73 8215 op_free((OP_4tree*)r->data->data[n]);
9b978d73 8216
f3548bdc 8217 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8218 SvREFCNT_dec((SV*)new_comppad);
8219 new_comppad = NULL;
c277df42
IZ
8220 break;
8221 case 'n':
9e55ce06 8222 break;
07be1b83 8223 case 'T':
be8e71aa
YO
8224 { /* Aho Corasick add-on structure for a trie node.
8225 Used in stclass optimization only */
07be1b83
YO
8226 U32 refcount;
8227 reg_ac_data *aho=(reg_ac_data*)r->data->data[n];
8228 OP_REFCNT_LOCK;
8229 refcount = --aho->refcount;
8230 OP_REFCNT_UNLOCK;
8231 if ( !refcount ) {
8232 Safefree(aho->states);
8233 Safefree(aho->fail);
8234 aho->trie=NULL; /* not necessary to free this as it is
8235 handled by the 't' case */
8236 Safefree(r->data->data[n]); /* do this last!!!! */
be8e71aa 8237 Safefree(r->regstclass);
07be1b83
YO
8238 }
8239 }
8240 break;
a3621e74 8241 case 't':
07be1b83 8242 {
be8e71aa 8243 /* trie structure. */
07be1b83
YO
8244 U32 refcount;
8245 reg_trie_data *trie=(reg_trie_data*)r->data->data[n];
8246 OP_REFCNT_LOCK;
8247 refcount = --trie->refcount;
8248 OP_REFCNT_UNLOCK;
8249 if ( !refcount ) {
8250 Safefree(trie->charmap);
8251 if (trie->widecharmap)
8252 SvREFCNT_dec((SV*)trie->widecharmap);
8253 Safefree(trie->states);
8254 Safefree(trie->trans);
8255 if (trie->bitmap)
8256 Safefree(trie->bitmap);
8257 if (trie->wordlen)
8258 Safefree(trie->wordlen);
786e8c11
YO
8259 if (trie->jump)
8260 Safefree(trie->jump);
8261 if (trie->nextword)
8262 Safefree(trie->nextword);
a3621e74 8263#ifdef DEBUGGING
be8e71aa
YO
8264 if (RX_DEBUG(r)) {
8265 if (trie->words)
8266 SvREFCNT_dec((SV*)trie->words);
8267 if (trie->revcharmap)
8268 SvREFCNT_dec((SV*)trie->revcharmap);
8269 }
a3621e74 8270#endif
07be1b83 8271 Safefree(r->data->data[n]); /* do this last!!!! */
a3621e74 8272 }
07be1b83
YO
8273 }
8274 break;
c277df42 8275 default:
830247a4 8276 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
c277df42
IZ
8277 }
8278 }
8279 Safefree(r->data->what);
8280 Safefree(r->data);
a0d0e21e
LW
8281 }
8282 Safefree(r->startp);
8283 Safefree(r->endp);
8284 Safefree(r);
a687059c 8285}
c277df42 8286
84da74a7
YO
8287#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8288#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8289#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8290#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8291
8292/*
8293 regdupe - duplicate a regexp.
8294
8295 This routine is called by sv.c's re_dup and is expected to clone a
8296 given regexp structure. It is a no-op when not under USE_ITHREADS.
8297 (Originally this *was* re_dup() for change history see sv.c)
8298
8299 See pregfree() above if you change anything here.
8300*/
a3c0e9ca 8301#if defined(USE_ITHREADS)
84da74a7
YO
8302regexp *
8303Perl_regdupe(pTHX_ const regexp *r, CLONE_PARAMS *param)
8304{
84da74a7
YO
8305 dVAR;
8306 REGEXP *ret;
8307 int i, len, npar;
8308 struct reg_substr_datum *s;
8309
8310 if (!r)
8311 return (REGEXP *)NULL;
8312
8313 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8314 return ret;
8315
8316 len = r->offsets[0];
8317 npar = r->nparens+1;
8318
8319 Newxc(ret, sizeof(regexp) + (len+1)*sizeof(regnode), char, regexp);
8320 Copy(r->program, ret->program, len+1, regnode);
8321
8322 Newx(ret->startp, npar, I32);
8323 Copy(r->startp, ret->startp, npar, I32);
8324 Newx(ret->endp, npar, I32);
8325 Copy(r->startp, ret->startp, npar, I32);
8326
8327 Newx(ret->substrs, 1, struct reg_substr_data);
8328 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8329 s->min_offset = r->substrs->data[i].min_offset;
8330 s->max_offset = r->substrs->data[i].max_offset;
8331 s->end_shift = r->substrs->data[i].end_shift;
8332 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8333 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8334 }
8335
8336 ret->regstclass = NULL;
8337 if (r->data) {
8338 struct reg_data *d;
8339 const int count = r->data->count;
8340 int i;
8341
8342 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8343 char, struct reg_data);
8344 Newx(d->what, count, U8);
8345
8346 d->count = count;
8347 for (i = 0; i < count; i++) {
8348 d->what[i] = r->data->what[i];
8349 switch (d->what[i]) {
8350 /* legal options are one of: sfpont
8351 see also regcomp.h and pregfree() */
8352 case 's':
81714fb9 8353 case 'S':
84da74a7
YO
8354 d->data[i] = sv_dup_inc((SV *)r->data->data[i], param);
8355 break;
8356 case 'p':
8357 d->data[i] = av_dup_inc((AV *)r->data->data[i], param);
8358 break;
8359 case 'f':
8360 /* This is cheating. */
8361 Newx(d->data[i], 1, struct regnode_charclass_class);
8362 StructCopy(r->data->data[i], d->data[i],
8363 struct regnode_charclass_class);
8364 ret->regstclass = (regnode*)d->data[i];
8365 break;
8366 case 'o':
8367 /* Compiled op trees are readonly, and can thus be
8368 shared without duplication. */
8369 OP_REFCNT_LOCK;
8370 d->data[i] = (void*)OpREFCNT_inc((OP*)r->data->data[i]);
8371 OP_REFCNT_UNLOCK;
8372 break;
8373 case 'n':
8374 d->data[i] = r->data->data[i];
8375 break;
8376 case 't':
8377 d->data[i] = r->data->data[i];
8378 OP_REFCNT_LOCK;
8379 ((reg_trie_data*)d->data[i])->refcount++;
8380 OP_REFCNT_UNLOCK;
8381 break;
8382 case 'T':
8383 d->data[i] = r->data->data[i];
8384 OP_REFCNT_LOCK;
8385 ((reg_ac_data*)d->data[i])->refcount++;
8386 OP_REFCNT_UNLOCK;
8387 /* Trie stclasses are readonly and can thus be shared
8388 * without duplication. We free the stclass in pregfree
8389 * when the corresponding reg_ac_data struct is freed.
8390 */
8391 ret->regstclass= r->regstclass;
8392 break;
8393 default:
8394 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", r->data->what[i]);
8395 }
8396 }
8397
8398 ret->data = d;
8399 }
8400 else
8401 ret->data = NULL;
8402
8403 Newx(ret->offsets, 2*len+1, U32);
8404 Copy(r->offsets, ret->offsets, 2*len+1, U32);
8405
8406 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8407 ret->refcnt = r->refcnt;
8408 ret->minlen = r->minlen;
8409 ret->prelen = r->prelen;
8410 ret->nparens = r->nparens;
8411 ret->lastparen = r->lastparen;
8412 ret->lastcloseparen = r->lastcloseparen;
8413 ret->reganch = r->reganch;
8414
8415 ret->sublen = r->sublen;
8416
f9f4320a 8417 ret->engine = r->engine;
81714fb9
YO
8418
8419 ret->paren_names = hv_dup_inc(r->paren_names, param);
f9f4320a 8420
84da74a7
YO
8421 if (RX_MATCH_COPIED(ret))
8422 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8423 else
8424 ret->subbeg = NULL;
8425#ifdef PERL_OLD_COPY_ON_WRITE
8426 ret->saved_copy = NULL;
8427#endif
8428
8429 ptr_table_store(PL_ptr_table, r, ret);
8430 return ret;
84da74a7 8431}
a3c0e9ca 8432#endif
84da74a7 8433
76234dfb 8434#ifndef PERL_IN_XSUB_RE
c277df42
IZ
8435/*
8436 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8437 */
8438regnode *
864dbfa3 8439Perl_regnext(pTHX_ register regnode *p)
c277df42 8440{
97aff369 8441 dVAR;
c277df42
IZ
8442 register I32 offset;
8443
3280af22 8444 if (p == &PL_regdummy)
c277df42
IZ
8445 return(NULL);
8446
8447 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8448 if (offset == 0)
8449 return(NULL);
8450
c277df42 8451 return(p+offset);
c277df42 8452}
76234dfb 8453#endif
c277df42 8454
01f988be 8455STATIC void
cea2e8a9 8456S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
8457{
8458 va_list args;
8459 STRLEN l1 = strlen(pat1);
8460 STRLEN l2 = strlen(pat2);
8461 char buf[512];
06bf62c7 8462 SV *msv;
73d840c0 8463 const char *message;
c277df42
IZ
8464
8465 if (l1 > 510)
8466 l1 = 510;
8467 if (l1 + l2 > 510)
8468 l2 = 510 - l1;
8469 Copy(pat1, buf, l1 , char);
8470 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
8471 buf[l1 + l2] = '\n';
8472 buf[l1 + l2 + 1] = '\0';
8736538c
AS
8473#ifdef I_STDARG
8474 /* ANSI variant takes additional second argument */
c277df42 8475 va_start(args, pat2);
8736538c
AS
8476#else
8477 va_start(args);
8478#endif
5a844595 8479 msv = vmess(buf, &args);
c277df42 8480 va_end(args);
cfd0369c 8481 message = SvPV_const(msv,l1);
c277df42
IZ
8482 if (l1 > 512)
8483 l1 = 512;
8484 Copy(message, buf, l1 , char);
197cf9b9 8485 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 8486 Perl_croak(aTHX_ "%s", buf);
c277df42 8487}
a0ed51b3
LW
8488
8489/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
8490
76234dfb 8491#ifndef PERL_IN_XSUB_RE
a0ed51b3 8492void
864dbfa3 8493Perl_save_re_context(pTHX)
b81d288d 8494{
97aff369 8495 dVAR;
1ade1aa1
NC
8496
8497 struct re_save_state *state;
8498
8499 SAVEVPTR(PL_curcop);
8500 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
8501
8502 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
8503 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
8504 SSPUSHINT(SAVEt_RE_STATE);
8505
46ab3289 8506 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 8507
a0ed51b3 8508 PL_reg_start_tmp = 0;
a0ed51b3 8509 PL_reg_start_tmpl = 0;
c445ea15 8510 PL_reg_oldsaved = NULL;
a5db57d6 8511 PL_reg_oldsavedlen = 0;
a5db57d6 8512 PL_reg_maxiter = 0;
a5db57d6 8513 PL_reg_leftiter = 0;
c445ea15 8514 PL_reg_poscache = NULL;
a5db57d6 8515 PL_reg_poscache_size = 0;
1ade1aa1
NC
8516#ifdef PERL_OLD_COPY_ON_WRITE
8517 PL_nrs = NULL;
8518#endif
ada6e8a9 8519
c445ea15
AL
8520 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
8521 if (PL_curpm) {
8522 const REGEXP * const rx = PM_GETRE(PL_curpm);
8523 if (rx) {
1df70142 8524 U32 i;
ada6e8a9 8525 for (i = 1; i <= rx->nparens; i++) {
1df70142 8526 char digits[TYPE_CHARS(long)];
d9fad198 8527 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
8528 GV *const *const gvp
8529 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
8530
b37c2d43
AL
8531 if (gvp) {
8532 GV * const gv = *gvp;
8533 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
8534 save_scalar(gv);
49f27e4b 8535 }
ada6e8a9
AMS
8536 }
8537 }
8538 }
a0ed51b3 8539}
76234dfb 8540#endif
51371543 8541
51371543 8542static void
acfe0abc 8543clear_re(pTHX_ void *r)
51371543 8544{
97aff369 8545 dVAR;
51371543
GS
8546 ReREFCNT_dec((regexp *)r);
8547}
ffbc6a93 8548
a28509cc
AL
8549#ifdef DEBUGGING
8550
8551STATIC void
8552S_put_byte(pTHX_ SV *sv, int c)
8553{
8554 if (isCNTRL(c) || c == 255 || !isPRINT(c))
8555 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
8556 else if (c == '-' || c == ']' || c == '\\' || c == '^')
8557 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
8558 else
8559 Perl_sv_catpvf(aTHX_ sv, "%c", c);
8560}
8561
786e8c11 8562
3dab1dad
YO
8563#define CLEAR_OPTSTART \
8564 if (optstart) STMT_START { \
07be1b83 8565 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%d nodes)\n", node - optstart)); \
3dab1dad
YO
8566 optstart=NULL; \
8567 } STMT_END
8568
786e8c11 8569#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 8570
b5a2f8d8
NC
8571STATIC const regnode *
8572S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
8573 const regnode *last, const regnode *plast,
8574 SV* sv, I32 indent, U32 depth)
a28509cc 8575{
97aff369 8576 dVAR;
786e8c11 8577 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 8578 register const regnode *next;
3dab1dad
YO
8579 const regnode *optstart= NULL;
8580 GET_RE_DEBUG_FLAGS_DECL;
a28509cc 8581
786e8c11
YO
8582#ifdef DEBUG_DUMPUNTIL
8583 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
8584 last ? last-start : 0,plast ? plast-start : 0);
8585#endif
8586
8587 if (plast && plast < last)
8588 last= plast;
8589
8590 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc
AL
8591 /* While that wasn't END last time... */
8592
8593 NODE_ALIGN(node);
8594 op = OP(node);
8595 if (op == CLOSE)
786e8c11 8596 indent--;
b5a2f8d8 8597 next = regnext((regnode *)node);
07be1b83 8598
a28509cc 8599 /* Where, what. */
8e11feef 8600 if (OP(node) == OPTIMIZED) {
e68ec53f 8601 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 8602 optstart = node;
3dab1dad 8603 else
8e11feef 8604 goto after_print;
3dab1dad
YO
8605 } else
8606 CLEAR_OPTSTART;
07be1b83 8607
32fc9b6a 8608 regprop(r, sv, node);
a28509cc 8609 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 8610 (int)(2*indent + 1), "", SvPVX_const(sv));
3dab1dad
YO
8611
8612 if (OP(node) != OPTIMIZED) {
8e11feef
RGS
8613 if (next == NULL) /* Next ptr. */
8614 PerlIO_printf(Perl_debug_log, "(0)");
786e8c11
YO
8615 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
8616 PerlIO_printf(Perl_debug_log, "(FAIL)");
8e11feef
RGS
8617 else
8618 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
786e8c11 8619
1de06328 8620 /*if (PL_regkind[(U8)op] != TRIE)*/
786e8c11 8621 (void)PerlIO_putc(Perl_debug_log, '\n');
3dab1dad
YO
8622 }
8623
a28509cc
AL
8624 after_print:
8625 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
8626 assert(next);
8627 {
8628 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
8629 ? regnext((regnode *)next)
8630 : next);
be8e71aa
YO
8631 if (last && nnode > last)
8632 nnode = last;
786e8c11 8633 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 8634 }
a28509cc
AL
8635 }
8636 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 8637 assert(next);
786e8c11 8638 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
8639 }
8640 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 8641 const regnode *this_trie = node;
1de06328 8642 const char op = OP(node);
a28509cc 8643 const I32 n = ARG(node);
1de06328
YO
8644 const reg_ac_data * const ac = op>=AHOCORASICK ?
8645 (reg_ac_data *)r->data->data[n] :
8646 NULL;
8647 const reg_trie_data * const trie = op<AHOCORASICK ?
8648 (reg_trie_data*)r->data->data[n] :
8649 ac->trie;
786e8c11 8650 const regnode *nextbranch= NULL;
a28509cc 8651 I32 word_idx;
1de06328 8652 sv_setpvn(sv, "", 0);
786e8c11 8653 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
097eb12c 8654 SV ** const elem_ptr = av_fetch(trie->words,word_idx,0);
786e8c11
YO
8655
8656 PerlIO_printf(Perl_debug_log, "%*s%s ",
8657 (int)(2*(indent+3)), "",
8658 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
8659 PL_colors[0], PL_colors[1],
8660 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
8661 PERL_PV_PRETTY_ELIPSES |
7f69552c 8662 PERL_PV_PRETTY_LTGT
786e8c11
YO
8663 )
8664 : "???"
8665 );
8666 if (trie->jump) {
40d049e4 8667 U16 dist= trie->jump[word_idx+1];
7f69552c 8668 PerlIO_printf(Perl_debug_log, "(%u)\n",
40d049e4 8669 (dist ? this_trie + dist : next) - start);
786e8c11
YO
8670 if (dist) {
8671 if (!nextbranch)
7f69552c
YO
8672 nextbranch = this_trie + trie->jump[0];
8673 DUMPUNTIL(this_trie + dist, nextbranch);
8674 }
786e8c11
YO
8675 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
8676 nextbranch= regnext((regnode *)nextbranch);
8677 } else {
8678 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 8679 }
786e8c11
YO
8680 }
8681 if (last && next > last)
8682 node= last;
8683 else
8684 node= next;
a28509cc 8685 }
786e8c11
YO
8686 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
8687 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
8688 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
8689 }
8690 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 8691 assert(next);
786e8c11 8692 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
8693 }
8694 else if ( op == PLUS || op == STAR) {
786e8c11 8695 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
8696 }
8697 else if (op == ANYOF) {
8698 /* arglen 1 + class block */
8699 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
8700 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
8701 node = NEXTOPER(node);
8702 }
8703 else if (PL_regkind[(U8)op] == EXACT) {
8704 /* Literal string, where present. */
8705 node += NODE_SZ_STR(node) - 1;
8706 node = NEXTOPER(node);
8707 }
8708 else {
8709 node = NEXTOPER(node);
8710 node += regarglen[(U8)op];
8711 }
8712 if (op == CURLYX || op == OPEN)
786e8c11 8713 indent++;
a28509cc 8714 else if (op == WHILEM)
786e8c11 8715 indent--;
a28509cc 8716 }
3dab1dad 8717 CLEAR_OPTSTART;
786e8c11
YO
8718#ifdef DEBUG_DUMPUNTIL
8719 PerlIO_printf(Perl_debug_log, "--- %d\n",indent);
8720#endif
1de06328 8721 return node;
a28509cc
AL
8722}
8723
8724#endif /* DEBUGGING */
8725
241d1a3b
NC
8726/*
8727 * Local variables:
8728 * c-indentation-style: bsd
8729 * c-basic-offset: 4
8730 * indent-tabs-mode: t
8731 * End:
8732 *
37442d52
RGS
8733 * ex: set ts=8 sts=4 sw=4 noet:
8734 */