This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: remove some gotos that cause compiler consternation.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
154#ifdef RE_TRACK_PATTERN_OFFSETS
155#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
156#endif
830247a4 157#define RExC_emit (pRExC_state->emit)
fac92740 158#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
159#define RExC_naughty (pRExC_state->naughty)
160#define RExC_sawback (pRExC_state->sawback)
161#define RExC_seen (pRExC_state->seen)
162#define RExC_size (pRExC_state->size)
163#define RExC_npar (pRExC_state->npar)
e2e6a0f1 164#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
165#define RExC_extralen (pRExC_state->extralen)
166#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
167#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 168#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 169#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
170#define RExC_open_parens (pRExC_state->open_parens)
171#define RExC_close_parens (pRExC_state->close_parens)
172#define RExC_opend (pRExC_state->opend)
81714fb9 173#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
174#define RExC_recurse (pRExC_state->recurse)
175#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 176
cde0cee5 177
a687059c
LW
178#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
179#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
180 ((*s) == '{' && regcurly(s)))
a687059c 181
35c8bce7
LW
182#ifdef SPSTART
183#undef SPSTART /* dratted cpp namespace... */
184#endif
a687059c
LW
185/*
186 * Flags to be passed up and down.
187 */
a687059c 188#define WORST 0 /* Worst case. */
821b33a5 189#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
190#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
191#define SPSTART 0x4 /* Starts with * or +. */
192#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 193
3dab1dad
YO
194#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
195
07be1b83
YO
196/* whether trie related optimizations are enabled */
197#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
198#define TRIE_STUDY_OPT
786e8c11 199#define FULL_TRIE_STUDY
07be1b83
YO
200#define TRIE_STCLASS
201#endif
1de06328
YO
202
203
40d049e4
YO
204
205#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
206#define PBITVAL(paren) (1 << ((paren) & 7))
207#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
208#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
209#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
210
211
1de06328
YO
212/* About scan_data_t.
213
214 During optimisation we recurse through the regexp program performing
215 various inplace (keyhole style) optimisations. In addition study_chunk
216 and scan_commit populate this data structure with information about
217 what strings MUST appear in the pattern. We look for the longest
218 string that must appear for at a fixed location, and we look for the
219 longest string that may appear at a floating location. So for instance
220 in the pattern:
221
222 /FOO[xX]A.*B[xX]BAR/
223
224 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
225 strings (because they follow a .* construct). study_chunk will identify
226 both FOO and BAR as being the longest fixed and floating strings respectively.
227
228 The strings can be composites, for instance
229
230 /(f)(o)(o)/
231
232 will result in a composite fixed substring 'foo'.
233
234 For each string some basic information is maintained:
235
236 - offset or min_offset
237 This is the position the string must appear at, or not before.
238 It also implicitly (when combined with minlenp) tells us how many
239 character must match before the string we are searching.
240 Likewise when combined with minlenp and the length of the string
241 tells us how many characters must appear after the string we have
242 found.
243
244 - max_offset
245 Only used for floating strings. This is the rightmost point that
246 the string can appear at. Ifset to I32 max it indicates that the
247 string can occur infinitely far to the right.
248
249 - minlenp
250 A pointer to the minimum length of the pattern that the string
251 was found inside. This is important as in the case of positive
252 lookahead or positive lookbehind we can have multiple patterns
253 involved. Consider
254
255 /(?=FOO).*F/
256
257 The minimum length of the pattern overall is 3, the minimum length
258 of the lookahead part is 3, but the minimum length of the part that
259 will actually match is 1. So 'FOO's minimum length is 3, but the
260 minimum length for the F is 1. This is important as the minimum length
261 is used to determine offsets in front of and behind the string being
262 looked for. Since strings can be composites this is the length of the
263 pattern at the time it was commited with a scan_commit. Note that
264 the length is calculated by study_chunk, so that the minimum lengths
265 are not known until the full pattern has been compiled, thus the
266 pointer to the value.
267
268 - lookbehind
269
270 In the case of lookbehind the string being searched for can be
271 offset past the start point of the final matching string.
272 If this value was just blithely removed from the min_offset it would
273 invalidate some of the calculations for how many chars must match
274 before or after (as they are derived from min_offset and minlen and
275 the length of the string being searched for).
276 When the final pattern is compiled and the data is moved from the
277 scan_data_t structure into the regexp structure the information
278 about lookbehind is factored in, with the information that would
279 have been lost precalculated in the end_shift field for the
280 associated string.
281
282 The fields pos_min and pos_delta are used to store the minimum offset
283 and the delta to the maximum offset at the current point in the pattern.
284
285*/
2c2d71f5
JH
286
287typedef struct scan_data_t {
1de06328
YO
288 /*I32 len_min; unused */
289 /*I32 len_delta; unused */
2c2d71f5
JH
290 I32 pos_min;
291 I32 pos_delta;
292 SV *last_found;
1de06328 293 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
294 I32 last_start_min;
295 I32 last_start_max;
1de06328
YO
296 SV **longest; /* Either &l_fixed, or &l_float. */
297 SV *longest_fixed; /* longest fixed string found in pattern */
298 I32 offset_fixed; /* offset where it starts */
299 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
301 SV *longest_float; /* longest floating string found in pattern */
302 I32 offset_float_min; /* earliest point in string it can appear */
303 I32 offset_float_max; /* latest point in string it can appear */
304 I32 *minlen_float; /* pointer to the minlen relevent to the string */
305 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
306 I32 flags;
307 I32 whilem_c;
cb434fcc 308 I32 *last_closep;
653099ff 309 struct regnode_charclass_class *start_class;
2c2d71f5
JH
310} scan_data_t;
311
a687059c 312/*
e50aee73 313 * Forward declarations for pregcomp()'s friends.
a687059c 314 */
a0d0e21e 315
27da23d5 316static const scan_data_t zero_scan_data =
1de06328 317 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
318
319#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
320#define SF_BEFORE_SEOL 0x0001
321#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
322#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
323#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
324
09b7f37c
CB
325#ifdef NO_UNARY_PLUS
326# define SF_FIX_SHIFT_EOL (0+2)
327# define SF_FL_SHIFT_EOL (0+4)
328#else
329# define SF_FIX_SHIFT_EOL (+2)
330# define SF_FL_SHIFT_EOL (+4)
331#endif
c277df42
IZ
332
333#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
334#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
335
336#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
337#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
338#define SF_IS_INF 0x0040
339#define SF_HAS_PAR 0x0080
340#define SF_IN_PAR 0x0100
341#define SF_HAS_EVAL 0x0200
342#define SCF_DO_SUBSTR 0x0400
653099ff
GS
343#define SCF_DO_STCLASS_AND 0x0800
344#define SCF_DO_STCLASS_OR 0x1000
345#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 346#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 347
786e8c11 348#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 349#define SCF_SEEN_ACCEPT 0x8000
07be1b83 350
eb160463 351#define UTF (RExC_utf8 != 0)
bbe252da
YO
352#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
353#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 354
ffc61ed2 355#define OOB_UNICODE 12345678
93733859 356#define OOB_NAMEDCLASS -1
b8c5462f 357
a0ed51b3
LW
358#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
359#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
360
8615cb43 361
b45f050a
JF
362/* length of regex to show in messages that don't mark a position within */
363#define RegexLengthToShowInErrorMessages 127
364
365/*
366 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
367 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
368 * op/pragma/warn/regcomp.
369 */
7253e4e3
RK
370#define MARKER1 "<-- HERE" /* marker as it appears in the description */
371#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 372
7253e4e3 373#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
374
375/*
376 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
377 * arg. Show regex, up to a maximum length. If it's too long, chop and add
378 * "...".
379 */
58e23c8d 380#define _FAIL(code) STMT_START { \
bfed75c6 381 const char *ellipses = ""; \
ccb2c380
MP
382 IV len = RExC_end - RExC_precomp; \
383 \
384 if (!SIZE_ONLY) \
385 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
386 if (len > RegexLengthToShowInErrorMessages) { \
387 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
388 len = RegexLengthToShowInErrorMessages - 10; \
389 ellipses = "..."; \
390 } \
58e23c8d 391 code; \
ccb2c380 392} STMT_END
8615cb43 393
58e23c8d
YO
394#define FAIL(msg) _FAIL( \
395 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
396 msg, (int)len, RExC_precomp, ellipses))
397
398#define FAIL2(msg,arg) _FAIL( \
399 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
400 arg, (int)len, RExC_precomp, ellipses))
401
b45f050a 402/*
b45f050a
JF
403 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
404 */
ccb2c380 405#define Simple_vFAIL(m) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
408 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
413 */
ccb2c380
MP
414#define vFAIL(m) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL(m); \
418} STMT_END
b45f050a
JF
419
420/*
421 * Like Simple_vFAIL(), but accepts two arguments.
422 */
ccb2c380 423#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 424 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
425 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
426 (int)offset, RExC_precomp, RExC_precomp + offset); \
427} STMT_END
b45f050a
JF
428
429/*
430 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
431 */
ccb2c380
MP
432#define vFAIL2(m,a1) STMT_START { \
433 if (!SIZE_ONLY) \
434 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
435 Simple_vFAIL2(m, a1); \
436} STMT_END
b45f050a
JF
437
438
439/*
440 * Like Simple_vFAIL(), but accepts three arguments.
441 */
ccb2c380 442#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a
JF
447
448/*
449 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
450 */
ccb2c380
MP
451#define vFAIL3(m,a1,a2) STMT_START { \
452 if (!SIZE_ONLY) \
453 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
454 Simple_vFAIL3(m, a1, a2); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Like Simple_vFAIL(), but accepts four arguments.
459 */
ccb2c380 460#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 461 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
b45f050a 465
ccb2c380 466#define vWARN(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARNdep(loc,m) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
475 "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479
480#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
497
498#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 499 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
500 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
501 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
9d1d55b5 503
8615cb43 504
cd439c50 505/* Allow for side effects in s */
ccb2c380
MP
506#define REGC(c,s) STMT_START { \
507 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
508} STMT_END
cd439c50 509
fac92740
MJD
510/* Macros for recording node offsets. 20001227 mjd@plover.com
511 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
512 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
513 * Element 0 holds the number n.
07be1b83 514 * Position is 1 indexed.
fac92740 515 */
7122b237
YO
516#ifndef RE_TRACK_PATTERN_OFFSETS
517#define Set_Node_Offset_To_R(node,byte)
518#define Set_Node_Offset(node,byte)
519#define Set_Cur_Node_Offset
520#define Set_Node_Length_To_R(node,len)
521#define Set_Node_Length(node,len)
522#define Set_Node_Cur_Length(node)
523#define Node_Offset(n)
524#define Node_Length(n)
525#define Set_Node_Offset_Length(node,offset,len)
526#define ProgLen(ri) ri->u.proglen
527#define SetProgLen(ri,x) ri->u.proglen = x
528#else
529#define ProgLen(ri) ri->u.offsets[0]
530#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
531#define Set_Node_Offset_To_R(node,byte) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 534 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 535 if((node) < 0) { \
551405c4 536 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
537 } else { \
538 RExC_offsets[2*(node)-1] = (byte); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Offset(node,byte) \
544 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
545#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
546
547#define Set_Node_Length_To_R(node,len) STMT_START { \
548 if (! SIZE_ONLY) { \
549 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 550 __LINE__, (int)(node), (int)(len))); \
ccb2c380 551 if((node) < 0) { \
551405c4 552 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
553 } else { \
554 RExC_offsets[2*(node)] = (len); \
555 } \
556 } \
557} STMT_END
558
559#define Set_Node_Length(node,len) \
560 Set_Node_Length_To_R((node)-RExC_emit_start, len)
561#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
562#define Set_Node_Cur_Length(node) \
563 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
564
565/* Get offsets and lengths */
566#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
567#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
568
07be1b83
YO
569#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
570 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
571 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
572} STMT_END
7122b237 573#endif
07be1b83
YO
574
575#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
576#define EXPERIMENTAL_INPLACESCAN
7122b237 577#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 578
304ee84b
YO
579#define DEBUG_STUDYDATA(str,data,depth) \
580DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 581 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
582 "%*s" str "Pos:%"IVdf"/%"IVdf \
583 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
584 (int)(depth)*2, "", \
585 (IV)((data)->pos_min), \
586 (IV)((data)->pos_delta), \
304ee84b 587 (UV)((data)->flags), \
1de06328 588 (IV)((data)->whilem_c), \
304ee84b
YO
589 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
590 is_inf ? "INF " : "" \
1de06328
YO
591 ); \
592 if ((data)->last_found) \
593 PerlIO_printf(Perl_debug_log, \
594 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
595 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
596 SvPVX_const((data)->last_found), \
597 (IV)((data)->last_end), \
598 (IV)((data)->last_start_min), \
599 (IV)((data)->last_start_max), \
600 ((data)->longest && \
601 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
602 SvPVX_const((data)->longest_fixed), \
603 (IV)((data)->offset_fixed), \
604 ((data)->longest && \
605 (data)->longest==&((data)->longest_float)) ? "*" : "", \
606 SvPVX_const((data)->longest_float), \
607 (IV)((data)->offset_float_min), \
608 (IV)((data)->offset_float_max) \
609 ); \
610 PerlIO_printf(Perl_debug_log,"\n"); \
611});
612
acfe0abc 613static void clear_re(pTHX_ void *r);
4327152a 614
653099ff 615/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 616 Update the longest found anchored substring and the longest found
653099ff
GS
617 floating substrings if needed. */
618
4327152a 619STATIC void
304ee84b 620S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 621{
e1ec3a88
AL
622 const STRLEN l = CHR_SVLEN(data->last_found);
623 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 624 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 625
c277df42 626 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 627 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
628 if (*data->longest == data->longest_fixed) {
629 data->offset_fixed = l ? data->last_start_min : data->pos_min;
630 if (data->flags & SF_BEFORE_EOL)
b81d288d 631 data->flags
c277df42
IZ
632 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
633 else
634 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
635 data->minlen_fixed=minlenp;
636 data->lookbehind_fixed=0;
a0ed51b3 637 }
304ee84b 638 else { /* *data->longest == data->longest_float */
c277df42 639 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
640 data->offset_float_max = (l
641 ? data->last_start_max
c277df42 642 : data->pos_min + data->pos_delta);
304ee84b 643 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 644 data->offset_float_max = I32_MAX;
c277df42 645 if (data->flags & SF_BEFORE_EOL)
b81d288d 646 data->flags
c277df42
IZ
647 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
648 else
649 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
650 data->minlen_float=minlenp;
651 data->lookbehind_float=0;
c277df42
IZ
652 }
653 }
654 SvCUR_set(data->last_found, 0);
0eda9292 655 {
a28509cc 656 SV * const sv = data->last_found;
097eb12c
AL
657 if (SvUTF8(sv) && SvMAGICAL(sv)) {
658 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
659 if (mg)
660 mg->mg_len = 0;
661 }
0eda9292 662 }
c277df42
IZ
663 data->last_end = -1;
664 data->flags &= ~SF_BEFORE_EOL;
304ee84b 665 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
666}
667
653099ff
GS
668/* Can match anything (initialization) */
669STATIC void
097eb12c 670S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 671{
653099ff 672 ANYOF_CLASS_ZERO(cl);
f8bef550 673 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 674 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
675 if (LOC)
676 cl->flags |= ANYOF_LOCALE;
677}
678
679/* Can match anything (initialization) */
680STATIC int
5f66b61c 681S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
682{
683 int value;
684
aaa51d5e 685 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
686 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
687 return 1;
1aa99e6b
IH
688 if (!(cl->flags & ANYOF_UNICODE_ALL))
689 return 0;
10edeb5d 690 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 691 return 0;
653099ff
GS
692 return 1;
693}
694
695/* Can match anything (initialization) */
696STATIC void
097eb12c 697S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 698{
8ecf7187 699 Zero(cl, 1, struct regnode_charclass_class);
653099ff 700 cl->type = ANYOF;
830247a4 701 cl_anything(pRExC_state, cl);
653099ff
GS
702}
703
704STATIC void
097eb12c 705S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 706{
8ecf7187 707 Zero(cl, 1, struct regnode_charclass_class);
653099ff 708 cl->type = ANYOF;
830247a4 709 cl_anything(pRExC_state, cl);
653099ff
GS
710 if (LOC)
711 cl->flags |= ANYOF_LOCALE;
712}
713
714/* 'And' a given class with another one. Can create false positives */
715/* We assume that cl is not inverted */
716STATIC void
5f66b61c 717S_cl_and(struct regnode_charclass_class *cl,
a28509cc 718 const struct regnode_charclass_class *and_with)
653099ff 719{
40d049e4
YO
720
721 assert(and_with->type == ANYOF);
653099ff
GS
722 if (!(and_with->flags & ANYOF_CLASS)
723 && !(cl->flags & ANYOF_CLASS)
724 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
725 && !(and_with->flags & ANYOF_FOLD)
726 && !(cl->flags & ANYOF_FOLD)) {
727 int i;
728
729 if (and_with->flags & ANYOF_INVERT)
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] &= ~and_with->bitmap[i];
732 else
733 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
734 cl->bitmap[i] &= and_with->bitmap[i];
735 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
736 if (!(and_with->flags & ANYOF_EOS))
737 cl->flags &= ~ANYOF_EOS;
1aa99e6b 738
14ebb1a2
JH
739 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
740 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
741 cl->flags &= ~ANYOF_UNICODE_ALL;
742 cl->flags |= ANYOF_UNICODE;
743 ARG_SET(cl, ARG(and_with));
744 }
14ebb1a2
JH
745 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
746 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 747 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
748 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
749 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 750 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
751}
752
753/* 'OR' a given class with another one. Can create false positives */
754/* We assume that cl is not inverted */
755STATIC void
097eb12c 756S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 757{
653099ff
GS
758 if (or_with->flags & ANYOF_INVERT) {
759 /* We do not use
760 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
761 * <= (B1 | !B2) | (CL1 | !CL2)
762 * which is wasteful if CL2 is small, but we ignore CL2:
763 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
764 * XXXX Can we handle case-fold? Unclear:
765 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
766 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
767 */
768 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(or_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD) ) {
771 int i;
772
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] |= ~or_with->bitmap[i];
775 } /* XXXX: logic is complicated otherwise */
776 else {
830247a4 777 cl_anything(pRExC_state, cl);
653099ff
GS
778 }
779 } else {
780 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
781 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 782 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
783 || (cl->flags & ANYOF_FOLD)) ) {
784 int i;
785
786 /* OR char bitmap and class bitmap separately */
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] |= or_with->bitmap[i];
789 if (or_with->flags & ANYOF_CLASS) {
790 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
791 cl->classflags[i] |= or_with->classflags[i];
792 cl->flags |= ANYOF_CLASS;
793 }
794 }
795 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 796 cl_anything(pRExC_state, cl);
653099ff
GS
797 }
798 }
799 if (or_with->flags & ANYOF_EOS)
800 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
801
802 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
803 ARG(cl) != ARG(or_with)) {
804 cl->flags |= ANYOF_UNICODE_ALL;
805 cl->flags &= ~ANYOF_UNICODE;
806 }
807 if (or_with->flags & ANYOF_UNICODE_ALL) {
808 cl->flags |= ANYOF_UNICODE_ALL;
809 cl->flags &= ~ANYOF_UNICODE;
810 }
653099ff
GS
811}
812
a3621e74
YO
813#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
814#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
815#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
816#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
817
3dab1dad
YO
818
819#ifdef DEBUGGING
07be1b83 820/*
2b8b4781
NC
821 dump_trie(trie,widecharmap,revcharmap)
822 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
823 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
824
825 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
826 The _interim_ variants are used for debugging the interim
827 tables that are used to generate the final compressed
828 representation which is what dump_trie expects.
829
3dab1dad
YO
830 Part of the reason for their existance is to provide a form
831 of documentation as to how the different representations function.
07be1b83
YO
832
833*/
3dab1dad
YO
834
835/*
3dab1dad
YO
836 Dumps the final compressed table form of the trie to Perl_debug_log.
837 Used for debugging make_trie().
838*/
839
840STATIC void
2b8b4781
NC
841S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
842 AV *revcharmap, U32 depth)
3dab1dad
YO
843{
844 U32 state;
ab3bbdeb 845 SV *sv=sv_newmortal();
55eed653 846 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
847 GET_RE_DEBUG_FLAGS_DECL;
848
ab3bbdeb 849
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
851 (int)depth * 2 + 2,"",
852 "Match","Base","Ofs" );
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 855 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 856 if ( tmp ) {
ab3bbdeb
YO
857 PerlIO_printf( Perl_debug_log, "%*s",
858 colwidth,
ddc5bc0f 859 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
860 PL_colors[0], PL_colors[1],
861 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
862 PERL_PV_ESCAPE_FIRSTCHAR
863 )
864 );
3dab1dad
YO
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 871 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
872 PerlIO_printf( Perl_debug_log, "\n");
873
1e2e3d02 874 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*"UVXf,
903 colwidth,
3dab1dad
YO
904 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 } else {
ab3bbdeb 906 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
907 }
908 }
909
910 PerlIO_printf( Perl_debug_log, "]");
911
912 }
913 PerlIO_printf( Perl_debug_log, "\n" );
914 }
915}
916/*
3dab1dad
YO
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
55eed653 923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
924 HV *widecharmap, AV *revcharmap, U32 next_alloc,
925 U32 depth)
3dab1dad
YO
926{
927 U32 state;
ab3bbdeb 928 SV *sv=sv_newmortal();
55eed653 929 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
930 GET_RE_DEBUG_FLAGS_DECL;
931 /* print out the table precompression. */
ab3bbdeb
YO
932 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
933 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
934 "------:-----+-----------------\n" );
3dab1dad
YO
935
936 for( state=1 ; state < next_alloc ; state ++ ) {
937 U16 charid;
938
ab3bbdeb 939 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
940 (int)depth * 2 + 2,"", (UV)state );
941 if ( ! trie->states[ state ].wordnum ) {
942 PerlIO_printf( Perl_debug_log, "%5s| ","");
943 } else {
944 PerlIO_printf( Perl_debug_log, "W%4x| ",
945 trie->states[ state ].wordnum
946 );
947 }
948 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 949 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
950 if ( tmp ) {
951 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
952 colwidth,
ddc5bc0f 953 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
954 PL_colors[0], PL_colors[1],
955 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
956 PERL_PV_ESCAPE_FIRSTCHAR
957 ) ,
1e2e3d02
YO
958 TRIE_LIST_ITEM(state,charid).forid,
959 (UV)TRIE_LIST_ITEM(state,charid).newstate
960 );
961 if (!(charid % 10))
664e119d
RGS
962 PerlIO_printf(Perl_debug_log, "\n%*s| ",
963 (int)((depth * 2) + 14), "");
1e2e3d02 964 }
ab3bbdeb
YO
965 }
966 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
967 }
968}
969
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in table form.
972 This is the normal DFA style state transition table, with a few
973 twists to facilitate compression later.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
982 U16 charid;
ab3bbdeb 983 SV *sv=sv_newmortal();
55eed653 984 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
985 GET_RE_DEBUG_FLAGS_DECL;
986
987 /*
988 print out the table precompression so that we can do a visual check
989 that they are identical.
990 */
991
992 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
993
994 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 995 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 996 if ( tmp ) {
ab3bbdeb
YO
997 PerlIO_printf( Perl_debug_log, "%*s",
998 colwidth,
ddc5bc0f 999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1000 PL_colors[0], PL_colors[1],
1001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1002 PERL_PV_ESCAPE_FIRSTCHAR
1003 )
1004 );
3dab1dad
YO
1005 }
1006 }
1007
1008 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1009
1010 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1011 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1012 }
1013
1014 PerlIO_printf( Perl_debug_log, "\n" );
1015
1016 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1017
1018 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1019 (int)depth * 2 + 2,"",
1020 (UV)TRIE_NODENUM( state ) );
1021
1022 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1023 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1024 if (v)
1025 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1026 else
1027 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1028 }
1029 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1030 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1031 } else {
1032 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1033 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1034 }
1035 }
07be1b83 1036}
3dab1dad
YO
1037
1038#endif
1039
786e8c11
YO
1040/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1041 startbranch: the first branch in the whole branch sequence
1042 first : start branch of sequence of branch-exact nodes.
1043 May be the same as startbranch
1044 last : Thing following the last branch.
1045 May be the same as tail.
1046 tail : item following the branch sequence
1047 count : words in the sequence
1048 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1049 depth : indent depth
3dab1dad 1050
786e8c11 1051Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1052
786e8c11
YO
1053A trie is an N'ary tree where the branches are determined by digital
1054decomposition of the key. IE, at the root node you look up the 1st character and
1055follow that branch repeat until you find the end of the branches. Nodes can be
1056marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1057
786e8c11 1058 /he|she|his|hers/
72f13be8 1059
786e8c11
YO
1060would convert into the following structure. Numbers represent states, letters
1061following numbers represent valid transitions on the letter from that state, if
1062the number is in square brackets it represents an accepting state, otherwise it
1063will be in parenthesis.
07be1b83 1064
786e8c11
YO
1065 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1066 | |
1067 | (2)
1068 | |
1069 (1) +-i->(6)-+-s->[7]
1070 |
1071 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1072
786e8c11
YO
1073 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1074
1075This shows that when matching against the string 'hers' we will begin at state 1
1076read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1077then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1078is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1079single traverse. We store a mapping from accepting to state to which word was
1080matched, and then when we have multiple possibilities we try to complete the
1081rest of the regex in the order in which they occured in the alternation.
1082
1083The only prior NFA like behaviour that would be changed by the TRIE support is
1084the silent ignoring of duplicate alternations which are of the form:
1085
1086 / (DUPE|DUPE) X? (?{ ... }) Y /x
1087
1088Thus EVAL blocks follwing a trie may be called a different number of times with
1089and without the optimisation. With the optimisations dupes will be silently
1090ignored. This inconsistant behaviour of EVAL type nodes is well established as
1091the following demonstrates:
1092
1093 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1094
1095which prints out 'word' three times, but
1096
1097 'words'=~/(word|word|word)(?{ print $1 })S/
1098
1099which doesnt print it out at all. This is due to other optimisations kicking in.
1100
1101Example of what happens on a structural level:
1102
1103The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1104
1105 1: CURLYM[1] {1,32767}(18)
1106 5: BRANCH(8)
1107 6: EXACT <ac>(16)
1108 8: BRANCH(11)
1109 9: EXACT <ad>(16)
1110 11: BRANCH(14)
1111 12: EXACT <ab>(16)
1112 16: SUCCEED(0)
1113 17: NOTHING(18)
1114 18: END(0)
1115
1116This would be optimizable with startbranch=5, first=5, last=16, tail=16
1117and should turn into:
1118
1119 1: CURLYM[1] {1,32767}(18)
1120 5: TRIE(16)
1121 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1122 <ac>
1123 <ad>
1124 <ab>
1125 16: SUCCEED(0)
1126 17: NOTHING(18)
1127 18: END(0)
1128
1129Cases where tail != last would be like /(?foo|bar)baz/:
1130
1131 1: BRANCH(4)
1132 2: EXACT <foo>(8)
1133 4: BRANCH(7)
1134 5: EXACT <bar>(8)
1135 7: TAIL(8)
1136 8: EXACT <baz>(10)
1137 10: END(0)
1138
1139which would be optimizable with startbranch=1, first=1, last=7, tail=8
1140and would end up looking like:
1141
1142 1: TRIE(8)
1143 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1144 <foo>
1145 <bar>
1146 7: TAIL(8)
1147 8: EXACT <baz>(10)
1148 10: END(0)
1149
1150 d = uvuni_to_utf8_flags(d, uv, 0);
1151
1152is the recommended Unicode-aware way of saying
1153
1154 *(d++) = uv;
1155*/
1156
1e2e3d02 1157#define TRIE_STORE_REVCHAR \
786e8c11 1158 STMT_START { \
1e2e3d02 1159 SV *tmp = newSVpvs(""); \
786e8c11 1160 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1161 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1162 av_push( revcharmap, tmp ); \
786e8c11
YO
1163 } STMT_END
1164
1165#define TRIE_READ_CHAR STMT_START { \
1166 wordlen++; \
1167 if ( UTF ) { \
1168 if ( folder ) { \
1169 if ( foldlen > 0 ) { \
1170 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1171 foldlen -= len; \
1172 scan += len; \
1173 len = 0; \
1174 } else { \
1175 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1176 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1177 foldlen -= UNISKIP( uvc ); \
1178 scan = foldbuf + UNISKIP( uvc ); \
1179 } \
1180 } else { \
1181 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1182 } \
1183 } else { \
1184 uvc = (U32)*uc; \
1185 len = 1; \
1186 } \
1187} STMT_END
1188
1189
1190
1191#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1192 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1193 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1194 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1195 } \
1196 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1197 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1198 TRIE_LIST_CUR( state )++; \
1199} STMT_END
07be1b83 1200
786e8c11
YO
1201#define TRIE_LIST_NEW(state) STMT_START { \
1202 Newxz( trie->states[ state ].trans.list, \
1203 4, reg_trie_trans_le ); \
1204 TRIE_LIST_CUR( state ) = 1; \
1205 TRIE_LIST_LEN( state ) = 4; \
1206} STMT_END
07be1b83 1207
786e8c11
YO
1208#define TRIE_HANDLE_WORD(state) STMT_START { \
1209 U16 dupe= trie->states[ state ].wordnum; \
1210 regnode * const noper_next = regnext( noper ); \
1211 \
1212 if (trie->wordlen) \
1213 trie->wordlen[ curword ] = wordlen; \
1214 DEBUG_r({ \
1215 /* store the word for dumping */ \
1216 SV* tmp; \
1217 if (OP(noper) != NOTHING) \
1218 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1219 else \
1220 tmp = newSVpvn( "", 0 ); \
1221 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1222 av_push( trie_words, tmp ); \
786e8c11
YO
1223 }); \
1224 \
1225 curword++; \
1226 \
1227 if ( noper_next < tail ) { \
1228 if (!trie->jump) \
c944940b 1229 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1230 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1231 if (!jumper) \
1232 jumper = noper_next; \
1233 if (!nextbranch) \
1234 nextbranch= regnext(cur); \
1235 } \
1236 \
1237 if ( dupe ) { \
1238 /* So it's a dupe. This means we need to maintain a */\
1239 /* linked-list from the first to the next. */\
1240 /* we only allocate the nextword buffer when there */\
1241 /* a dupe, so first time we have to do the allocation */\
1242 if (!trie->nextword) \
c944940b 1243 trie->nextword = (U16 *) \
446bd890 1244 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1245 while ( trie->nextword[dupe] ) \
1246 dupe= trie->nextword[dupe]; \
1247 trie->nextword[dupe]= curword; \
1248 } else { \
1249 /* we haven't inserted this word yet. */ \
1250 trie->states[ state ].wordnum = curword; \
1251 } \
1252} STMT_END
07be1b83 1253
3dab1dad 1254
786e8c11
YO
1255#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1256 ( ( base + charid >= ucharcount \
1257 && base + charid < ubound \
1258 && state == trie->trans[ base - ucharcount + charid ].check \
1259 && trie->trans[ base - ucharcount + charid ].next ) \
1260 ? trie->trans[ base - ucharcount + charid ].next \
1261 : ( state==1 ? special : 0 ) \
1262 )
3dab1dad 1263
786e8c11
YO
1264#define MADE_TRIE 1
1265#define MADE_JUMP_TRIE 2
1266#define MADE_EXACT_TRIE 4
3dab1dad 1267
a3621e74 1268STATIC I32
786e8c11 1269S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1270{
27da23d5 1271 dVAR;
a3621e74
YO
1272 /* first pass, loop through and scan words */
1273 reg_trie_data *trie;
55eed653 1274 HV *widecharmap = NULL;
2b8b4781 1275 AV *revcharmap = newAV();
a3621e74 1276 regnode *cur;
9f7f3913 1277 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1278 STRLEN len = 0;
1279 UV uvc = 0;
1280 U16 curword = 0;
1281 U32 next_alloc = 0;
786e8c11
YO
1282 regnode *jumper = NULL;
1283 regnode *nextbranch = NULL;
7f69552c 1284 regnode *convert = NULL;
a3621e74 1285 /* we just use folder as a flag in utf8 */
e1ec3a88 1286 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1287 ? PL_fold
1288 : ( flags == EXACTFL
1289 ? PL_fold_locale
1290 : NULL
1291 )
1292 );
1293
2b8b4781
NC
1294#ifdef DEBUGGING
1295 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1296 AV *trie_words = NULL;
1297 /* along with revcharmap, this only used during construction but both are
1298 * useful during debugging so we store them in the struct when debugging.
8e11feef 1299 */
2b8b4781
NC
1300#else
1301 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1302 STRLEN trie_charcount=0;
3dab1dad 1303#endif
2b8b4781 1304 SV *re_trie_maxbuff;
a3621e74 1305 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1306#ifndef DEBUGGING
1307 PERL_UNUSED_ARG(depth);
1308#endif
a3621e74 1309
c944940b 1310 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1311 trie->refcount = 1;
3dab1dad 1312 trie->startstate = 1;
786e8c11 1313 trie->wordcount = word_count;
f8fc2ecf 1314 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1315 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1316 if (!(UTF && folder))
c944940b 1317 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1318 DEBUG_r({
2b8b4781 1319 trie_words = newAV();
a3621e74 1320 });
a3621e74 1321
0111c4fd 1322 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1323 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1324 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1325 }
3dab1dad
YO
1326 DEBUG_OPTIMISE_r({
1327 PerlIO_printf( Perl_debug_log,
786e8c11 1328 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1329 (int)depth * 2 + 2, "",
1330 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1331 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1332 (int)depth);
3dab1dad 1333 });
7f69552c
YO
1334
1335 /* Find the node we are going to overwrite */
1336 if ( first == startbranch && OP( last ) != BRANCH ) {
1337 /* whole branch chain */
1338 convert = first;
1339 } else {
1340 /* branch sub-chain */
1341 convert = NEXTOPER( first );
1342 }
1343
a3621e74
YO
1344 /* -- First loop and Setup --
1345
1346 We first traverse the branches and scan each word to determine if it
1347 contains widechars, and how many unique chars there are, this is
1348 important as we have to build a table with at least as many columns as we
1349 have unique chars.
1350
1351 We use an array of integers to represent the character codes 0..255
1352 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1353 native representation of the character value as the key and IV's for the
1354 coded index.
1355
1356 *TODO* If we keep track of how many times each character is used we can
1357 remap the columns so that the table compression later on is more
1358 efficient in terms of memory by ensuring most common value is in the
1359 middle and the least common are on the outside. IMO this would be better
1360 than a most to least common mapping as theres a decent chance the most
1361 common letter will share a node with the least common, meaning the node
1362 will not be compressable. With a middle is most common approach the worst
1363 case is when we have the least common nodes twice.
1364
1365 */
1366
a3621e74 1367 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1368 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1369 const U8 *uc = (U8*)STRING( noper );
a28509cc 1370 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1371 STRLEN foldlen = 0;
1372 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1373 const U8 *scan = (U8*)NULL;
07be1b83 1374 U32 wordlen = 0; /* required init */
3dab1dad 1375 STRLEN chars=0;
a3621e74 1376
3dab1dad
YO
1377 if (OP(noper) == NOTHING) {
1378 trie->minlen= 0;
1379 continue;
1380 }
1381 if (trie->bitmap) {
1382 TRIE_BITMAP_SET(trie,*uc);
1383 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1384 }
a3621e74 1385 for ( ; uc < e ; uc += len ) {
3dab1dad 1386 TRIE_CHARCOUNT(trie)++;
a3621e74 1387 TRIE_READ_CHAR;
3dab1dad 1388 chars++;
a3621e74
YO
1389 if ( uvc < 256 ) {
1390 if ( !trie->charmap[ uvc ] ) {
1391 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1392 if ( folder )
1393 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1394 TRIE_STORE_REVCHAR;
a3621e74
YO
1395 }
1396 } else {
1397 SV** svpp;
55eed653
NC
1398 if ( !widecharmap )
1399 widecharmap = newHV();
a3621e74 1400
55eed653 1401 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1402
1403 if ( !svpp )
e4584336 1404 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1405
1406 if ( !SvTRUE( *svpp ) ) {
1407 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1408 TRIE_STORE_REVCHAR;
a3621e74
YO
1409 }
1410 }
1411 }
3dab1dad
YO
1412 if( cur == first ) {
1413 trie->minlen=chars;
1414 trie->maxlen=chars;
1415 } else if (chars < trie->minlen) {
1416 trie->minlen=chars;
1417 } else if (chars > trie->maxlen) {
1418 trie->maxlen=chars;
1419 }
1420
a3621e74
YO
1421 } /* end first pass */
1422 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1423 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1424 (int)depth * 2 + 2,"",
55eed653 1425 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1426 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1427 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1428 );
c944940b 1429 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1430
1431 /*
1432 We now know what we are dealing with in terms of unique chars and
1433 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1434 representation using a flat table will take. If it's over a reasonable
1435 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1436 conservative but potentially much slower representation using an array
1437 of lists.
1438
1439 At the end we convert both representations into the same compressed
1440 form that will be used in regexec.c for matching with. The latter
1441 is a form that cannot be used to construct with but has memory
1442 properties similar to the list form and access properties similar
1443 to the table form making it both suitable for fast searches and
1444 small enough that its feasable to store for the duration of a program.
1445
1446 See the comment in the code where the compressed table is produced
1447 inplace from the flat tabe representation for an explanation of how
1448 the compression works.
1449
1450 */
1451
1452
3dab1dad 1453 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1454 /*
1455 Second Pass -- Array Of Lists Representation
1456
1457 Each state will be represented by a list of charid:state records
1458 (reg_trie_trans_le) the first such element holds the CUR and LEN
1459 points of the allocated array. (See defines above).
1460
1461 We build the initial structure using the lists, and then convert
1462 it into the compressed table form which allows faster lookups
1463 (but cant be modified once converted).
a3621e74
YO
1464 */
1465
a3621e74
YO
1466 STRLEN transcount = 1;
1467
1e2e3d02
YO
1468 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1469 "%*sCompiling trie using list compiler\n",
1470 (int)depth * 2 + 2, ""));
446bd890 1471
c944940b
JH
1472 trie->states = (reg_trie_state *)
1473 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1474 sizeof(reg_trie_state) );
a3621e74
YO
1475 TRIE_LIST_NEW(1);
1476 next_alloc = 2;
1477
1478 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1479
c445ea15
AL
1480 regnode * const noper = NEXTOPER( cur );
1481 U8 *uc = (U8*)STRING( noper );
1482 const U8 * const e = uc + STR_LEN( noper );
1483 U32 state = 1; /* required init */
1484 U16 charid = 0; /* sanity init */
1485 U8 *scan = (U8*)NULL; /* sanity init */
1486 STRLEN foldlen = 0; /* required init */
07be1b83 1487 U32 wordlen = 0; /* required init */
c445ea15
AL
1488 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1489
3dab1dad 1490 if (OP(noper) != NOTHING) {
786e8c11 1491 for ( ; uc < e ; uc += len ) {
c445ea15 1492
786e8c11 1493 TRIE_READ_CHAR;
c445ea15 1494
786e8c11
YO
1495 if ( uvc < 256 ) {
1496 charid = trie->charmap[ uvc ];
c445ea15 1497 } else {
55eed653 1498 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1499 if ( !svpp ) {
1500 charid = 0;
1501 } else {
1502 charid=(U16)SvIV( *svpp );
1503 }
c445ea15 1504 }
786e8c11
YO
1505 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1506 if ( charid ) {
a3621e74 1507
786e8c11
YO
1508 U16 check;
1509 U32 newstate = 0;
a3621e74 1510
786e8c11
YO
1511 charid--;
1512 if ( !trie->states[ state ].trans.list ) {
1513 TRIE_LIST_NEW( state );
c445ea15 1514 }
786e8c11
YO
1515 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1516 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1517 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1518 break;
1519 }
1520 }
1521 if ( ! newstate ) {
1522 newstate = next_alloc++;
1523 TRIE_LIST_PUSH( state, charid, newstate );
1524 transcount++;
1525 }
1526 state = newstate;
1527 } else {
1528 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1529 }
a28509cc 1530 }
c445ea15 1531 }
3dab1dad 1532 TRIE_HANDLE_WORD(state);
a3621e74
YO
1533
1534 } /* end second pass */
1535
1e2e3d02
YO
1536 /* next alloc is the NEXT state to be allocated */
1537 trie->statecount = next_alloc;
c944940b
JH
1538 trie->states = (reg_trie_state *)
1539 PerlMemShared_realloc( trie->states,
1540 next_alloc
1541 * sizeof(reg_trie_state) );
a3621e74 1542
3dab1dad 1543 /* and now dump it out before we compress it */
2b8b4781
NC
1544 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1545 revcharmap, next_alloc,
1546 depth+1)
1e2e3d02 1547 );
a3621e74 1548
c944940b
JH
1549 trie->trans = (reg_trie_trans *)
1550 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1551 {
1552 U32 state;
a3621e74
YO
1553 U32 tp = 0;
1554 U32 zp = 0;
1555
1556
1557 for( state=1 ; state < next_alloc ; state ++ ) {
1558 U32 base=0;
1559
1560 /*
1561 DEBUG_TRIE_COMPILE_MORE_r(
1562 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1563 );
1564 */
1565
1566 if (trie->states[state].trans.list) {
1567 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1568 U16 maxid=minid;
a28509cc 1569 U16 idx;
a3621e74
YO
1570
1571 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1572 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1573 if ( forid < minid ) {
1574 minid=forid;
1575 } else if ( forid > maxid ) {
1576 maxid=forid;
1577 }
a3621e74
YO
1578 }
1579 if ( transcount < tp + maxid - minid + 1) {
1580 transcount *= 2;
c944940b
JH
1581 trie->trans = (reg_trie_trans *)
1582 PerlMemShared_realloc( trie->trans,
446bd890
NC
1583 transcount
1584 * sizeof(reg_trie_trans) );
a3621e74
YO
1585 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1586 }
1587 base = trie->uniquecharcount + tp - minid;
1588 if ( maxid == minid ) {
1589 U32 set = 0;
1590 for ( ; zp < tp ; zp++ ) {
1591 if ( ! trie->trans[ zp ].next ) {
1592 base = trie->uniquecharcount + zp - minid;
1593 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1594 trie->trans[ zp ].check = state;
1595 set = 1;
1596 break;
1597 }
1598 }
1599 if ( !set ) {
1600 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1601 trie->trans[ tp ].check = state;
1602 tp++;
1603 zp = tp;
1604 }
1605 } else {
1606 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1607 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1608 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1609 trie->trans[ tid ].check = state;
1610 }
1611 tp += ( maxid - minid + 1 );
1612 }
1613 Safefree(trie->states[ state ].trans.list);
1614 }
1615 /*
1616 DEBUG_TRIE_COMPILE_MORE_r(
1617 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1618 );
1619 */
1620 trie->states[ state ].trans.base=base;
1621 }
cc601c31 1622 trie->lasttrans = tp + 1;
a3621e74
YO
1623 }
1624 } else {
1625 /*
1626 Second Pass -- Flat Table Representation.
1627
1628 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1629 We know that we will need Charcount+1 trans at most to store the data
1630 (one row per char at worst case) So we preallocate both structures
1631 assuming worst case.
1632
1633 We then construct the trie using only the .next slots of the entry
1634 structs.
1635
1636 We use the .check field of the first entry of the node temporarily to
1637 make compression both faster and easier by keeping track of how many non
1638 zero fields are in the node.
1639
1640 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1641 transition.
1642
1643 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1644 number representing the first entry of the node, and state as a
1645 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1646 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1647 are 2 entrys per node. eg:
1648
1649 A B A B
1650 1. 2 4 1. 3 7
1651 2. 0 3 3. 0 5
1652 3. 0 0 5. 0 0
1653 4. 0 0 7. 0 0
1654
1655 The table is internally in the right hand, idx form. However as we also
1656 have to deal with the states array which is indexed by nodenum we have to
1657 use TRIE_NODENUM() to convert.
1658
1659 */
1e2e3d02
YO
1660 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1661 "%*sCompiling trie using table compiler\n",
1662 (int)depth * 2 + 2, ""));
3dab1dad 1663
c944940b
JH
1664 trie->trans = (reg_trie_trans *)
1665 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1666 * trie->uniquecharcount + 1,
1667 sizeof(reg_trie_trans) );
1668 trie->states = (reg_trie_state *)
1669 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1670 sizeof(reg_trie_state) );
a3621e74
YO
1671 next_alloc = trie->uniquecharcount + 1;
1672
3dab1dad 1673
a3621e74
YO
1674 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1675
c445ea15 1676 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1677 const U8 *uc = (U8*)STRING( noper );
1678 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1679
1680 U32 state = 1; /* required init */
1681
1682 U16 charid = 0; /* sanity init */
1683 U32 accept_state = 0; /* sanity init */
1684 U8 *scan = (U8*)NULL; /* sanity init */
1685
1686 STRLEN foldlen = 0; /* required init */
07be1b83 1687 U32 wordlen = 0; /* required init */
a3621e74
YO
1688 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1689
3dab1dad 1690 if ( OP(noper) != NOTHING ) {
786e8c11 1691 for ( ; uc < e ; uc += len ) {
a3621e74 1692
786e8c11 1693 TRIE_READ_CHAR;
a3621e74 1694
786e8c11
YO
1695 if ( uvc < 256 ) {
1696 charid = trie->charmap[ uvc ];
1697 } else {
55eed653 1698 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1699 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1700 }
786e8c11
YO
1701 if ( charid ) {
1702 charid--;
1703 if ( !trie->trans[ state + charid ].next ) {
1704 trie->trans[ state + charid ].next = next_alloc;
1705 trie->trans[ state ].check++;
1706 next_alloc += trie->uniquecharcount;
1707 }
1708 state = trie->trans[ state + charid ].next;
1709 } else {
1710 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1711 }
1712 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1713 }
a3621e74 1714 }
3dab1dad
YO
1715 accept_state = TRIE_NODENUM( state );
1716 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1717
1718 } /* end second pass */
1719
3dab1dad 1720 /* and now dump it out before we compress it */
2b8b4781
NC
1721 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1722 revcharmap,
1723 next_alloc, depth+1));
a3621e74 1724
a3621e74
YO
1725 {
1726 /*
1727 * Inplace compress the table.*
1728
1729 For sparse data sets the table constructed by the trie algorithm will
1730 be mostly 0/FAIL transitions or to put it another way mostly empty.
1731 (Note that leaf nodes will not contain any transitions.)
1732
1733 This algorithm compresses the tables by eliminating most such
1734 transitions, at the cost of a modest bit of extra work during lookup:
1735
1736 - Each states[] entry contains a .base field which indicates the
1737 index in the state[] array wheres its transition data is stored.
1738
1739 - If .base is 0 there are no valid transitions from that node.
1740
1741 - If .base is nonzero then charid is added to it to find an entry in
1742 the trans array.
1743
1744 -If trans[states[state].base+charid].check!=state then the
1745 transition is taken to be a 0/Fail transition. Thus if there are fail
1746 transitions at the front of the node then the .base offset will point
1747 somewhere inside the previous nodes data (or maybe even into a node
1748 even earlier), but the .check field determines if the transition is
1749 valid.
1750
786e8c11 1751 XXX - wrong maybe?
a3621e74
YO
1752 The following process inplace converts the table to the compressed
1753 table: We first do not compress the root node 1,and mark its all its
1754 .check pointers as 1 and set its .base pointer as 1 as well. This
1755 allows to do a DFA construction from the compressed table later, and
1756 ensures that any .base pointers we calculate later are greater than
1757 0.
1758
1759 - We set 'pos' to indicate the first entry of the second node.
1760
1761 - We then iterate over the columns of the node, finding the first and
1762 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1763 and set the .check pointers accordingly, and advance pos
1764 appropriately and repreat for the next node. Note that when we copy
1765 the next pointers we have to convert them from the original
1766 NODEIDX form to NODENUM form as the former is not valid post
1767 compression.
1768
1769 - If a node has no transitions used we mark its base as 0 and do not
1770 advance the pos pointer.
1771
1772 - If a node only has one transition we use a second pointer into the
1773 structure to fill in allocated fail transitions from other states.
1774 This pointer is independent of the main pointer and scans forward
1775 looking for null transitions that are allocated to a state. When it
1776 finds one it writes the single transition into the "hole". If the
786e8c11 1777 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1778
1779 - Once compressed we can Renew/realloc the structures to release the
1780 excess space.
1781
1782 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1783 specifically Fig 3.47 and the associated pseudocode.
1784
1785 demq
1786 */
a3b680e6 1787 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1788 U32 state, charid;
a3621e74 1789 U32 pos = 0, zp=0;
1e2e3d02 1790 trie->statecount = laststate;
a3621e74
YO
1791
1792 for ( state = 1 ; state < laststate ; state++ ) {
1793 U8 flag = 0;
a28509cc
AL
1794 const U32 stateidx = TRIE_NODEIDX( state );
1795 const U32 o_used = trie->trans[ stateidx ].check;
1796 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1797 trie->trans[ stateidx ].check = 0;
1798
1799 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1800 if ( flag || trie->trans[ stateidx + charid ].next ) {
1801 if ( trie->trans[ stateidx + charid ].next ) {
1802 if (o_used == 1) {
1803 for ( ; zp < pos ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 break;
1806 }
1807 }
1808 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1809 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1810 trie->trans[ zp ].check = state;
1811 if ( ++zp > pos ) pos = zp;
1812 break;
1813 }
1814 used--;
1815 }
1816 if ( !flag ) {
1817 flag = 1;
1818 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1819 }
1820 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1821 trie->trans[ pos ].check = state;
1822 pos++;
1823 }
1824 }
1825 }
cc601c31 1826 trie->lasttrans = pos + 1;
c944940b
JH
1827 trie->states = (reg_trie_state *)
1828 PerlMemShared_realloc( trie->states, laststate
1829 * sizeof(reg_trie_state) );
a3621e74 1830 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1831 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1832 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1833 (int)depth * 2 + 2,"",
1834 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1835 (IV)next_alloc,
1836 (IV)pos,
a3621e74
YO
1837 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1838 );
1839
1840 } /* end table compress */
1841 }
1e2e3d02
YO
1842 DEBUG_TRIE_COMPILE_MORE_r(
1843 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1844 (int)depth * 2 + 2, "",
1845 (UV)trie->statecount,
1846 (UV)trie->lasttrans)
1847 );
cc601c31 1848 /* resize the trans array to remove unused space */
c944940b
JH
1849 trie->trans = (reg_trie_trans *)
1850 PerlMemShared_realloc( trie->trans, trie->lasttrans
1851 * sizeof(reg_trie_trans) );
a3621e74 1852
3dab1dad 1853 /* and now dump out the compressed format */
2b8b4781 1854 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1855
3dab1dad 1856 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1857 U8 nodetype =(U8)(flags & 0xFF);
1858 char *str=NULL;
786e8c11 1859
07be1b83 1860#ifdef DEBUGGING
e62cc96a 1861 regnode *optimize = NULL;
7122b237
YO
1862#ifdef RE_TRACK_PATTERN_OFFSETS
1863
b57a0404
JH
1864 U32 mjd_offset = 0;
1865 U32 mjd_nodelen = 0;
7122b237
YO
1866#endif /* RE_TRACK_PATTERN_OFFSETS */
1867#endif /* DEBUGGING */
a3621e74 1868 /*
3dab1dad
YO
1869 This means we convert either the first branch or the first Exact,
1870 depending on whether the thing following (in 'last') is a branch
1871 or not and whther first is the startbranch (ie is it a sub part of
1872 the alternation or is it the whole thing.)
1873 Assuming its a sub part we conver the EXACT otherwise we convert
1874 the whole branch sequence, including the first.
a3621e74 1875 */
3dab1dad 1876 /* Find the node we are going to overwrite */
7f69552c 1877 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1878 /* branch sub-chain */
3dab1dad 1879 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1880#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1881 DEBUG_r({
1882 mjd_offset= Node_Offset((convert));
1883 mjd_nodelen= Node_Length((convert));
1884 });
7122b237 1885#endif
7f69552c 1886 /* whole branch chain */
7122b237
YO
1887 }
1888#ifdef RE_TRACK_PATTERN_OFFSETS
1889 else {
7f69552c
YO
1890 DEBUG_r({
1891 const regnode *nop = NEXTOPER( convert );
1892 mjd_offset= Node_Offset((nop));
1893 mjd_nodelen= Node_Length((nop));
1894 });
07be1b83
YO
1895 }
1896 DEBUG_OPTIMISE_r(
1897 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1898 (int)depth * 2 + 2, "",
786e8c11 1899 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1900 );
7122b237 1901#endif
3dab1dad
YO
1902 /* But first we check to see if there is a common prefix we can
1903 split out as an EXACT and put in front of the TRIE node. */
1904 trie->startstate= 1;
55eed653 1905 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1906 U32 state;
1e2e3d02 1907 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1908 U32 ofs = 0;
8e11feef
RGS
1909 I32 idx = -1;
1910 U32 count = 0;
1911 const U32 base = trie->states[ state ].trans.base;
a3621e74 1912
3dab1dad 1913 if ( trie->states[state].wordnum )
8e11feef 1914 count = 1;
a3621e74 1915
8e11feef 1916 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1917 if ( ( base + ofs >= trie->uniquecharcount ) &&
1918 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1919 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1920 {
3dab1dad 1921 if ( ++count > 1 ) {
2b8b4781 1922 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1923 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1924 if ( state == 1 ) break;
3dab1dad
YO
1925 if ( count == 2 ) {
1926 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1927 DEBUG_OPTIMISE_r(
8e11feef
RGS
1928 PerlIO_printf(Perl_debug_log,
1929 "%*sNew Start State=%"UVuf" Class: [",
1930 (int)depth * 2 + 2, "",
786e8c11 1931 (UV)state));
be8e71aa 1932 if (idx >= 0) {
2b8b4781 1933 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1934 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1935
3dab1dad 1936 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1937 if ( folder )
1938 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1939 DEBUG_OPTIMISE_r(
07be1b83 1940 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1941 );
8e11feef
RGS
1942 }
1943 }
1944 TRIE_BITMAP_SET(trie,*ch);
1945 if ( folder )
1946 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1947 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1948 }
1949 idx = ofs;
1950 }
3dab1dad
YO
1951 }
1952 if ( count == 1 ) {
2b8b4781 1953 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1954 char *ch = SvPV_nolen( *tmp );
1955 DEBUG_OPTIMISE_r({
1956 SV *sv=sv_newmortal();
8e11feef
RGS
1957 PerlIO_printf( Perl_debug_log,
1958 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1959 (int)depth * 2 + 2, "",
de734bd5
A
1960 (UV)state, (UV)idx,
1961 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1962 PL_colors[0], PL_colors[1],
1963 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1964 PERL_PV_ESCAPE_FIRSTCHAR
1965 )
1966 );
1967 });
3dab1dad
YO
1968 if ( state==1 ) {
1969 OP( convert ) = nodetype;
1970 str=STRING(convert);
1971 STR_LEN(convert)=0;
1972 }
de734bd5
A
1973 while (*ch) {
1974 *str++ = *ch++;
1975 STR_LEN(convert)++;
1976 }
1977
8e11feef 1978 } else {
f9049ba1 1979#ifdef DEBUGGING
8e11feef
RGS
1980 if (state>1)
1981 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1982#endif
8e11feef
RGS
1983 break;
1984 }
1985 }
3dab1dad 1986 if (str) {
8e11feef 1987 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1988 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1989 trie->startstate = state;
07be1b83
YO
1990 trie->minlen -= (state - 1);
1991 trie->maxlen -= (state - 1);
1992 DEBUG_r({
1993 regnode *fix = convert;
de734bd5 1994 U32 word = trie->wordcount;
07be1b83
YO
1995 mjd_nodelen++;
1996 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1997 while( ++fix < n ) {
1998 Set_Node_Offset_Length(fix, 0, 0);
1999 }
de734bd5 2000 while (word--) {
2b8b4781 2001 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
2002 if (tmp) {
2003 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2004 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2005 else
2006 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2007 }
2008 }
07be1b83 2009 });
8e11feef
RGS
2010 if (trie->maxlen) {
2011 convert = n;
2012 } else {
3dab1dad 2013 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2014 DEBUG_r(optimize= n);
3dab1dad
YO
2015 }
2016 }
2017 }
a5ca303d
YO
2018 if (!jumper)
2019 jumper = last;
3dab1dad 2020 if ( trie->maxlen ) {
8e11feef
RGS
2021 NEXT_OFF( convert ) = (U16)(tail - convert);
2022 ARG_SET( convert, data_slot );
786e8c11
YO
2023 /* Store the offset to the first unabsorbed branch in
2024 jump[0], which is otherwise unused by the jump logic.
2025 We use this when dumping a trie and during optimisation. */
2026 if (trie->jump)
7f69552c 2027 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2028
786e8c11
YO
2029 /* XXXX */
2030 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2031 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2032 {
2033 OP( convert ) = TRIEC;
2034 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2035 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2036 trie->bitmap= NULL;
2037 } else
2038 OP( convert ) = TRIE;
a3621e74 2039
3dab1dad
YO
2040 /* store the type in the flags */
2041 convert->flags = nodetype;
a5ca303d
YO
2042 DEBUG_r({
2043 optimize = convert
2044 + NODE_STEP_REGNODE
2045 + regarglen[ OP( convert ) ];
2046 });
2047 /* XXX We really should free up the resource in trie now,
2048 as we won't use them - (which resources?) dmq */
3dab1dad 2049 }
a3621e74 2050 /* needed for dumping*/
e62cc96a 2051 DEBUG_r(if (optimize) {
07be1b83 2052 regnode *opt = convert;
e62cc96a 2053 while ( ++opt < optimize) {
07be1b83
YO
2054 Set_Node_Offset_Length(opt,0,0);
2055 }
786e8c11
YO
2056 /*
2057 Try to clean up some of the debris left after the
2058 optimisation.
a3621e74 2059 */
786e8c11 2060 while( optimize < jumper ) {
07be1b83 2061 mjd_nodelen += Node_Length((optimize));
a3621e74 2062 OP( optimize ) = OPTIMIZED;
07be1b83 2063 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2064 optimize++;
2065 }
07be1b83 2066 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2067 });
2068 } /* end node insert */
55eed653 2069 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2070#ifdef DEBUGGING
2071 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2072 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2073#else
2074 SvREFCNT_dec(revcharmap);
07be1b83 2075#endif
786e8c11
YO
2076 return trie->jump
2077 ? MADE_JUMP_TRIE
2078 : trie->startstate>1
2079 ? MADE_EXACT_TRIE
2080 : MADE_TRIE;
2081}
2082
2083STATIC void
2084S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2085{
2086/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2087
2088 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2089 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2090 ISBN 0-201-10088-6
2091
2092 We find the fail state for each state in the trie, this state is the longest proper
2093 suffix of the current states 'word' that is also a proper prefix of another word in our
2094 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2095 the DFA not to have to restart after its tried and failed a word at a given point, it
2096 simply continues as though it had been matching the other word in the first place.
2097 Consider
2098 'abcdgu'=~/abcdefg|cdgu/
2099 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2100 fail, which would bring use to the state representing 'd' in the second word where we would
2101 try 'g' and succeed, prodceding to match 'cdgu'.
2102 */
2103 /* add a fail transition */
3251b653
NC
2104 const U32 trie_offset = ARG(source);
2105 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2106 U32 *q;
2107 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2108 const U32 numstates = trie->statecount;
786e8c11
YO
2109 const U32 ubound = trie->lasttrans + ucharcount;
2110 U32 q_read = 0;
2111 U32 q_write = 0;
2112 U32 charid;
2113 U32 base = trie->states[ 1 ].trans.base;
2114 U32 *fail;
2115 reg_ac_data *aho;
2116 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2117 GET_RE_DEBUG_FLAGS_DECL;
2118#ifndef DEBUGGING
2119 PERL_UNUSED_ARG(depth);
2120#endif
2121
2122
2123 ARG_SET( stclass, data_slot );
c944940b 2124 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2125 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2126 aho->trie=trie_offset;
446bd890
NC
2127 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2128 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2129 Newxz( q, numstates, U32);
c944940b 2130 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2131 aho->refcount = 1;
2132 fail = aho->fail;
2133 /* initialize fail[0..1] to be 1 so that we always have
2134 a valid final fail state */
2135 fail[ 0 ] = fail[ 1 ] = 1;
2136
2137 for ( charid = 0; charid < ucharcount ; charid++ ) {
2138 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2139 if ( newstate ) {
2140 q[ q_write ] = newstate;
2141 /* set to point at the root */
2142 fail[ q[ q_write++ ] ]=1;
2143 }
2144 }
2145 while ( q_read < q_write) {
2146 const U32 cur = q[ q_read++ % numstates ];
2147 base = trie->states[ cur ].trans.base;
2148
2149 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2150 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2151 if (ch_state) {
2152 U32 fail_state = cur;
2153 U32 fail_base;
2154 do {
2155 fail_state = fail[ fail_state ];
2156 fail_base = aho->states[ fail_state ].trans.base;
2157 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2158
2159 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2160 fail[ ch_state ] = fail_state;
2161 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2162 {
2163 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2164 }
2165 q[ q_write++ % numstates] = ch_state;
2166 }
2167 }
2168 }
2169 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2170 when we fail in state 1, this allows us to use the
2171 charclass scan to find a valid start char. This is based on the principle
2172 that theres a good chance the string being searched contains lots of stuff
2173 that cant be a start char.
2174 */
2175 fail[ 0 ] = fail[ 1 ] = 0;
2176 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2177 PerlIO_printf(Perl_debug_log,
2178 "%*sStclass Failtable (%"UVuf" states): 0",
2179 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2180 );
786e8c11
YO
2181 for( q_read=1; q_read<numstates; q_read++ ) {
2182 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2183 }
2184 PerlIO_printf(Perl_debug_log, "\n");
2185 });
2186 Safefree(q);
2187 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2188}
2189
786e8c11 2190
a3621e74 2191/*
5d1c421c
JH
2192 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2193 * These need to be revisited when a newer toolchain becomes available.
2194 */
2195#if defined(__sparc64__) && defined(__GNUC__)
2196# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2197# undef SPARC64_GCC_WORKAROUND
2198# define SPARC64_GCC_WORKAROUND 1
2199# endif
2200#endif
2201
07be1b83 2202#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2203 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2204 SV * const mysv=sv_newmortal(); \
2205 regnode *Next = regnext(scan); \
2206 regprop(RExC_rx, mysv, scan); \
7f69552c 2207 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2208 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2209 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2210 }});
07be1b83 2211
1de06328
YO
2212
2213
2214
2215
07be1b83
YO
2216#define JOIN_EXACT(scan,min,flags) \
2217 if (PL_regkind[OP(scan)] == EXACT) \
2218 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2219
be8e71aa 2220STATIC U32
07be1b83
YO
2221S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2222 /* Merge several consecutive EXACTish nodes into one. */
2223 regnode *n = regnext(scan);
2224 U32 stringok = 1;
2225 regnode *next = scan + NODE_SZ_STR(scan);
2226 U32 merged = 0;
2227 U32 stopnow = 0;
2228#ifdef DEBUGGING
2229 regnode *stop = scan;
72f13be8 2230 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2231#else
d47053eb
RGS
2232 PERL_UNUSED_ARG(depth);
2233#endif
2234#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2235 PERL_UNUSED_ARG(flags);
2236 PERL_UNUSED_ARG(val);
07be1b83 2237#endif
07be1b83
YO
2238 DEBUG_PEEP("join",scan,depth);
2239
2240 /* Skip NOTHING, merge EXACT*. */
2241 while (n &&
2242 ( PL_regkind[OP(n)] == NOTHING ||
2243 (stringok && (OP(n) == OP(scan))))
2244 && NEXT_OFF(n)
2245 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2246
2247 if (OP(n) == TAIL || n > next)
2248 stringok = 0;
2249 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2250 DEBUG_PEEP("skip:",n,depth);
2251 NEXT_OFF(scan) += NEXT_OFF(n);
2252 next = n + NODE_STEP_REGNODE;
2253#ifdef DEBUGGING
2254 if (stringok)
2255 stop = n;
2256#endif
2257 n = regnext(n);
2258 }
2259 else if (stringok) {
786e8c11 2260 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2261 regnode * const nnext = regnext(n);
2262
2263 DEBUG_PEEP("merg",n,depth);
2264
2265 merged++;
2266 if (oldl + STR_LEN(n) > U8_MAX)
2267 break;
2268 NEXT_OFF(scan) += NEXT_OFF(n);
2269 STR_LEN(scan) += STR_LEN(n);
2270 next = n + NODE_SZ_STR(n);
2271 /* Now we can overwrite *n : */
2272 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2273#ifdef DEBUGGING
2274 stop = next - 1;
2275#endif
2276 n = nnext;
2277 if (stopnow) break;
2278 }
2279
d47053eb
RGS
2280#ifdef EXPERIMENTAL_INPLACESCAN
2281 if (flags && !NEXT_OFF(n)) {
2282 DEBUG_PEEP("atch", val, depth);
2283 if (reg_off_by_arg[OP(n)]) {
2284 ARG_SET(n, val - n);
2285 }
2286 else {
2287 NEXT_OFF(n) = val - n;
2288 }
2289 stopnow = 1;
2290 }
07be1b83
YO
2291#endif
2292 }
2293
2294 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2295 /*
2296 Two problematic code points in Unicode casefolding of EXACT nodes:
2297
2298 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2299 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2300
2301 which casefold to
2302
2303 Unicode UTF-8
2304
2305 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2306 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2307
2308 This means that in case-insensitive matching (or "loose matching",
2309 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2310 length of the above casefolded versions) can match a target string
2311 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2312 This would rather mess up the minimum length computation.
2313
2314 What we'll do is to look for the tail four bytes, and then peek
2315 at the preceding two bytes to see whether we need to decrease
2316 the minimum length by four (six minus two).
2317
2318 Thanks to the design of UTF-8, there cannot be false matches:
2319 A sequence of valid UTF-8 bytes cannot be a subsequence of
2320 another valid sequence of UTF-8 bytes.
2321
2322 */
2323 char * const s0 = STRING(scan), *s, *t;
2324 char * const s1 = s0 + STR_LEN(scan) - 1;
2325 char * const s2 = s1 - 4;
e294cc5d
JH
2326#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2327 const char t0[] = "\xaf\x49\xaf\x42";
2328#else
07be1b83 2329 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2330#endif
07be1b83
YO
2331 const char * const t1 = t0 + 3;
2332
2333 for (s = s0 + 2;
2334 s < s2 && (t = ninstr(s, s1, t0, t1));
2335 s = t + 4) {
e294cc5d
JH
2336#ifdef EBCDIC
2337 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2338 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2339#else
07be1b83
YO
2340 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2341 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2342#endif
07be1b83
YO
2343 *min -= 4;
2344 }
2345 }
2346
2347#ifdef DEBUGGING
2348 /* Allow dumping */
2349 n = scan + NODE_SZ_STR(scan);
2350 while (n <= stop) {
2351 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2352 OP(n) = OPTIMIZED;
2353 NEXT_OFF(n) = 0;
2354 }
2355 n++;
2356 }
2357#endif
2358 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2359 return stopnow;
2360}
2361
653099ff
GS
2362/* REx optimizer. Converts nodes into quickier variants "in place".
2363 Finds fixed substrings. */
2364
a0288114 2365/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2366 to the position after last scanned or to NULL. */
2367
40d049e4
YO
2368#define INIT_AND_WITHP \
2369 assert(!and_withp); \
2370 Newx(and_withp,1,struct regnode_charclass_class); \
2371 SAVEFREEPV(and_withp)
07be1b83 2372
b515a41d
YO
2373/* this is a chain of data about sub patterns we are processing that
2374 need to be handled seperately/specially in study_chunk. Its so
2375 we can simulate recursion without losing state. */
2376struct scan_frame;
2377typedef struct scan_frame {
2378 regnode *last; /* last node to process in this frame */
2379 regnode *next; /* next node to process when last is reached */
2380 struct scan_frame *prev; /*previous frame*/
2381 I32 stop; /* what stopparen do we use */
2382} scan_frame;
2383
304ee84b
YO
2384
2385#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2386
76e3520e 2387STATIC I32
40d049e4 2388S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2389 I32 *minlenp, I32 *deltap,
40d049e4
YO
2390 regnode *last,
2391 scan_data_t *data,
2392 I32 stopparen,
2393 U8* recursed,
2394 struct regnode_charclass_class *and_withp,
2395 U32 flags, U32 depth)
c277df42
IZ
2396 /* scanp: Start here (read-write). */
2397 /* deltap: Write maxlen-minlen here. */
2398 /* last: Stop before this one. */
40d049e4
YO
2399 /* data: string data about the pattern */
2400 /* stopparen: treat close N as END */
2401 /* recursed: which subroutines have we recursed into */
2402 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2403{
97aff369 2404 dVAR;
c277df42
IZ
2405 I32 min = 0, pars = 0, code;
2406 regnode *scan = *scanp, *next;
2407 I32 delta = 0;
2408 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2409 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2410 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2411 scan_data_t data_fake;
a3621e74 2412 SV *re_trie_maxbuff = NULL;
786e8c11 2413 regnode *first_non_open = scan;
e2e6a0f1 2414 I32 stopmin = I32_MAX;
8aa23a47
YO
2415 scan_frame *frame = NULL;
2416
a3621e74 2417 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2418
13a24bad 2419#ifdef DEBUGGING
40d049e4 2420 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2421#endif
40d049e4 2422
786e8c11 2423 if ( depth == 0 ) {
40d049e4 2424 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2425 first_non_open=regnext(first_non_open);
2426 }
2427
b81d288d 2428
8aa23a47
YO
2429 fake_study_recurse:
2430 while ( scan && OP(scan) != END && scan < last ){
2431 /* Peephole optimizer: */
304ee84b 2432 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2433 DEBUG_PEEP("Peep",scan,depth);
2434 JOIN_EXACT(scan,&min,0);
2435
2436 /* Follow the next-chain of the current node and optimize
2437 away all the NOTHINGs from it. */
2438 if (OP(scan) != CURLYX) {
2439 const int max = (reg_off_by_arg[OP(scan)]
2440 ? I32_MAX
2441 /* I32 may be smaller than U16 on CRAYs! */
2442 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2443 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2444 int noff;
2445 regnode *n = scan;
2446
2447 /* Skip NOTHING and LONGJMP. */
2448 while ((n = regnext(n))
2449 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2450 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2451 && off + noff < max)
2452 off += noff;
2453 if (reg_off_by_arg[OP(scan)])
2454 ARG(scan) = off;
2455 else
2456 NEXT_OFF(scan) = off;
2457 }
a3621e74 2458
c277df42 2459
8aa23a47
YO
2460
2461 /* The principal pseudo-switch. Cannot be a switch, since we
2462 look into several different things. */
2463 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2464 || OP(scan) == IFTHEN) {
2465 next = regnext(scan);
2466 code = OP(scan);
2467 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2468
2469 if (OP(next) == code || code == IFTHEN) {
2470 /* NOTE - There is similar code to this block below for handling
2471 TRIE nodes on a re-study. If you change stuff here check there
2472 too. */
2473 I32 max1 = 0, min1 = I32_MAX, num = 0;
2474 struct regnode_charclass_class accum;
2475 regnode * const startbranch=scan;
2476
2477 if (flags & SCF_DO_SUBSTR)
304ee84b 2478 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2479 if (flags & SCF_DO_STCLASS)
2480 cl_init_zero(pRExC_state, &accum);
2481
2482 while (OP(scan) == code) {
2483 I32 deltanext, minnext, f = 0, fake;
2484 struct regnode_charclass_class this_class;
2485
2486 num++;
2487 data_fake.flags = 0;
2488 if (data) {
2489 data_fake.whilem_c = data->whilem_c;
2490 data_fake.last_closep = data->last_closep;
2491 }
2492 else
2493 data_fake.last_closep = &fake;
58e23c8d
YO
2494
2495 data_fake.pos_delta = delta;
8aa23a47
YO
2496 next = regnext(scan);
2497 scan = NEXTOPER(scan);
2498 if (code != BRANCH)
c277df42 2499 scan = NEXTOPER(scan);
8aa23a47
YO
2500 if (flags & SCF_DO_STCLASS) {
2501 cl_init(pRExC_state, &this_class);
2502 data_fake.start_class = &this_class;
2503 f = SCF_DO_STCLASS_AND;
58e23c8d 2504 }
8aa23a47
YO
2505 if (flags & SCF_WHILEM_VISITED_POS)
2506 f |= SCF_WHILEM_VISITED_POS;
2507
2508 /* we suppose the run is continuous, last=next...*/
2509 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2510 next, &data_fake,
2511 stopparen, recursed, NULL, f,depth+1);
2512 if (min1 > minnext)
2513 min1 = minnext;
2514 if (max1 < minnext + deltanext)
2515 max1 = minnext + deltanext;
2516 if (deltanext == I32_MAX)
2517 is_inf = is_inf_internal = 1;
2518 scan = next;
2519 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2520 pars++;
2521 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2522 if ( stopmin > minnext)
2523 stopmin = min + min1;
2524 flags &= ~SCF_DO_SUBSTR;
2525 if (data)
2526 data->flags |= SCF_SEEN_ACCEPT;
2527 }
2528 if (data) {
2529 if (data_fake.flags & SF_HAS_EVAL)
2530 data->flags |= SF_HAS_EVAL;
2531 data->whilem_c = data_fake.whilem_c;
3dab1dad 2532 }
8aa23a47
YO
2533 if (flags & SCF_DO_STCLASS)
2534 cl_or(pRExC_state, &accum, &this_class);
2535 }
2536 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2537 min1 = 0;
2538 if (flags & SCF_DO_SUBSTR) {
2539 data->pos_min += min1;
2540 data->pos_delta += max1 - min1;
2541 if (max1 != min1 || is_inf)
2542 data->longest = &(data->longest_float);
2543 }
2544 min += min1;
2545 delta += max1 - min1;
2546 if (flags & SCF_DO_STCLASS_OR) {
2547 cl_or(pRExC_state, data->start_class, &accum);
2548 if (min1) {
2549 cl_and(data->start_class, and_withp);
2550 flags &= ~SCF_DO_STCLASS;
653099ff 2551 }
8aa23a47
YO
2552 }
2553 else if (flags & SCF_DO_STCLASS_AND) {
2554 if (min1) {
2555 cl_and(data->start_class, &accum);
2556 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2557 }
8aa23a47
YO
2558 else {
2559 /* Switch to OR mode: cache the old value of
2560 * data->start_class */
2561 INIT_AND_WITHP;
2562 StructCopy(data->start_class, and_withp,
2563 struct regnode_charclass_class);
2564 flags &= ~SCF_DO_STCLASS_AND;
2565 StructCopy(&accum, data->start_class,
2566 struct regnode_charclass_class);
2567 flags |= SCF_DO_STCLASS_OR;
2568 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2569 }
8aa23a47 2570 }
a3621e74 2571
8aa23a47
YO
2572 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2573 /* demq.
a3621e74 2574
8aa23a47
YO
2575 Assuming this was/is a branch we are dealing with: 'scan' now
2576 points at the item that follows the branch sequence, whatever
2577 it is. We now start at the beginning of the sequence and look
2578 for subsequences of
a3621e74 2579
8aa23a47
YO
2580 BRANCH->EXACT=>x1
2581 BRANCH->EXACT=>x2
2582 tail
a3621e74 2583
8aa23a47 2584 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2585
8aa23a47
YO
2586 If we can find such a subseqence we need to turn the first
2587 element into a trie and then add the subsequent branch exact
2588 strings to the trie.
a3621e74 2589
8aa23a47 2590 We have two cases
a3621e74 2591
8aa23a47 2592 1. patterns where the whole set of branch can be converted.
a3621e74 2593
8aa23a47 2594 2. patterns where only a subset can be converted.
a3621e74 2595
8aa23a47
YO
2596 In case 1 we can replace the whole set with a single regop
2597 for the trie. In case 2 we need to keep the start and end
2598 branchs so
a3621e74 2599
8aa23a47
YO
2600 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2601 becomes BRANCH TRIE; BRANCH X;
786e8c11 2602
8aa23a47
YO
2603 There is an additional case, that being where there is a
2604 common prefix, which gets split out into an EXACT like node
2605 preceding the TRIE node.
a3621e74 2606
8aa23a47
YO
2607 If x(1..n)==tail then we can do a simple trie, if not we make
2608 a "jump" trie, such that when we match the appropriate word
2609 we "jump" to the appopriate tail node. Essentailly we turn
2610 a nested if into a case structure of sorts.
b515a41d 2611
8aa23a47
YO
2612 */
2613
2614 int made=0;
2615 if (!re_trie_maxbuff) {
2616 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2617 if (!SvIOK(re_trie_maxbuff))
2618 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2619 }
2620 if ( SvIV(re_trie_maxbuff)>=0 ) {
2621 regnode *cur;
2622 regnode *first = (regnode *)NULL;
2623 regnode *last = (regnode *)NULL;
2624 regnode *tail = scan;
2625 U8 optype = 0;
2626 U32 count=0;
a3621e74
YO
2627
2628#ifdef DEBUGGING
8aa23a47 2629 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2630#endif
8aa23a47
YO
2631 /* var tail is used because there may be a TAIL
2632 regop in the way. Ie, the exacts will point to the
2633 thing following the TAIL, but the last branch will
2634 point at the TAIL. So we advance tail. If we
2635 have nested (?:) we may have to move through several
2636 tails.
2637 */
2638
2639 while ( OP( tail ) == TAIL ) {
2640 /* this is the TAIL generated by (?:) */
2641 tail = regnext( tail );
2642 }
a3621e74 2643
8aa23a47
YO
2644
2645 DEBUG_OPTIMISE_r({
2646 regprop(RExC_rx, mysv, tail );
2647 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2648 (int)depth * 2 + 2, "",
2649 "Looking for TRIE'able sequences. Tail node is: ",
2650 SvPV_nolen_const( mysv )
2651 );
2652 });
2653
2654 /*
2655
2656 step through the branches, cur represents each
2657 branch, noper is the first thing to be matched
2658 as part of that branch and noper_next is the
2659 regnext() of that node. if noper is an EXACT
2660 and noper_next is the same as scan (our current
2661 position in the regex) then the EXACT branch is
2662 a possible optimization target. Once we have
2663 two or more consequetive such branches we can
2664 create a trie of the EXACT's contents and stich
2665 it in place. If the sequence represents all of
2666 the branches we eliminate the whole thing and
2667 replace it with a single TRIE. If it is a
2668 subsequence then we need to stitch it in. This
2669 means the first branch has to remain, and needs
2670 to be repointed at the item on the branch chain
2671 following the last branch optimized. This could
2672 be either a BRANCH, in which case the
2673 subsequence is internal, or it could be the
2674 item following the branch sequence in which
2675 case the subsequence is at the end.
2676
2677 */
2678
2679 /* dont use tail as the end marker for this traverse */
2680 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2681 regnode * const noper = NEXTOPER( cur );
b515a41d 2682#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2683 regnode * const noper_next = regnext( noper );
b515a41d
YO
2684#endif
2685
8aa23a47
YO
2686 DEBUG_OPTIMISE_r({
2687 regprop(RExC_rx, mysv, cur);
2688 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2689 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2690
2691 regprop(RExC_rx, mysv, noper);
2692 PerlIO_printf( Perl_debug_log, " -> %s",
2693 SvPV_nolen_const(mysv));
2694
2695 if ( noper_next ) {
2696 regprop(RExC_rx, mysv, noper_next );
2697 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2698 SvPV_nolen_const(mysv));
2699 }
2700 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2701 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2702 });
2703 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2704 : PL_regkind[ OP( noper ) ] == EXACT )
2705 || OP(noper) == NOTHING )
786e8c11 2706#ifdef NOJUMPTRIE
8aa23a47 2707 && noper_next == tail
786e8c11 2708#endif
8aa23a47
YO
2709 && count < U16_MAX)
2710 {
2711 count++;
2712 if ( !first || optype == NOTHING ) {
2713 if (!first) first = cur;
2714 optype = OP( noper );
2715 } else {
2716 last = cur;
2717 }
2718 } else {
2719 if ( last ) {
2720 make_trie( pRExC_state,
2721 startbranch, first, cur, tail, count,
2722 optype, depth+1 );
2723 }
2724 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2725#ifdef NOJUMPTRIE
8aa23a47 2726 && noper_next == tail
786e8c11 2727#endif
8aa23a47
YO
2728 ){
2729 count = 1;
2730 first = cur;
2731 optype = OP( noper );
2732 } else {
2733 count = 0;
2734 first = NULL;
2735 optype = 0;
2736 }
2737 last = NULL;
2738 }
2739 }
2740 DEBUG_OPTIMISE_r({
2741 regprop(RExC_rx, mysv, cur);
2742 PerlIO_printf( Perl_debug_log,
2743 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2744 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2745
2746 });
2747 if ( last ) {
2748 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2749#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2750 if ( ((made == MADE_EXACT_TRIE &&
2751 startbranch == first)
2752 || ( first_non_open == first )) &&
2753 depth==0 ) {
2754 flags |= SCF_TRIE_RESTUDY;
2755 if ( startbranch == first
2756 && scan == tail )
2757 {
2758 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2759 }
2760 }
3dab1dad 2761#endif
8aa23a47
YO
2762 }
2763 }
2764
2765 } /* do trie */
2766
653099ff 2767 }
8aa23a47
YO
2768 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2769 scan = NEXTOPER(NEXTOPER(scan));
2770 } else /* single branch is optimized. */
2771 scan = NEXTOPER(scan);
2772 continue;
2773 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2774 scan_frame *newframe = NULL;
2775 I32 paren;
2776 regnode *start;
2777 regnode *end;
2778
2779 if (OP(scan) != SUSPEND) {
2780 /* set the pointer */
2781 if (OP(scan) == GOSUB) {
2782 paren = ARG(scan);
2783 RExC_recurse[ARG2L(scan)] = scan;
2784 start = RExC_open_parens[paren-1];
2785 end = RExC_close_parens[paren-1];
2786 } else {
2787 paren = 0;
f8fc2ecf 2788 start = RExC_rxi->program + 1;
8aa23a47
YO
2789 end = RExC_opend;
2790 }
2791 if (!recursed) {
2792 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2793 SAVEFREEPV(recursed);
2794 }
2795 if (!PAREN_TEST(recursed,paren+1)) {
2796 PAREN_SET(recursed,paren+1);
2797 Newx(newframe,1,scan_frame);
2798 } else {
2799 if (flags & SCF_DO_SUBSTR) {
304ee84b 2800 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2801 data->longest = &(data->longest_float);
2802 }
2803 is_inf = is_inf_internal = 1;
2804 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2805 cl_anything(pRExC_state, data->start_class);
2806 flags &= ~SCF_DO_STCLASS;
2807 }
2808 } else {
2809 Newx(newframe,1,scan_frame);
2810 paren = stopparen;
2811 start = scan+2;
2812 end = regnext(scan);
2813 }
2814 if (newframe) {
2815 assert(start);
2816 assert(end);
2817 SAVEFREEPV(newframe);
2818 newframe->next = regnext(scan);
2819 newframe->last = last;
2820 newframe->stop = stopparen;
2821 newframe->prev = frame;
2822
2823 frame = newframe;
2824 scan = start;
2825 stopparen = paren;
2826 last = end;
2827
2828 continue;
2829 }
2830 }
2831 else if (OP(scan) == EXACT) {
2832 I32 l = STR_LEN(scan);
2833 UV uc;
2834 if (UTF) {
2835 const U8 * const s = (U8*)STRING(scan);
2836 l = utf8_length(s, s + l);
2837 uc = utf8_to_uvchr(s, NULL);
2838 } else {
2839 uc = *((U8*)STRING(scan));
2840 }
2841 min += l;
2842 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2843 /* The code below prefers earlier match for fixed
2844 offset, later match for variable offset. */
2845 if (data->last_end == -1) { /* Update the start info. */
2846 data->last_start_min = data->pos_min;
2847 data->last_start_max = is_inf
2848 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2849 }
8aa23a47
YO
2850 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2851 if (UTF)
2852 SvUTF8_on(data->last_found);
2853 {
2854 SV * const sv = data->last_found;
2855 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2856 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2857 if (mg && mg->mg_len >= 0)
2858 mg->mg_len += utf8_length((U8*)STRING(scan),
2859 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2860 }
8aa23a47
YO
2861 data->last_end = data->pos_min + l;
2862 data->pos_min += l; /* As in the first entry. */
2863 data->flags &= ~SF_BEFORE_EOL;
2864 }
2865 if (flags & SCF_DO_STCLASS_AND) {
2866 /* Check whether it is compatible with what we know already! */
2867 int compat = 1;
2868
2869 if (uc >= 0x100 ||
2870 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2871 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2872 && (!(data->start_class->flags & ANYOF_FOLD)
2873 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2874 )
2875 compat = 0;
2876 ANYOF_CLASS_ZERO(data->start_class);
2877 ANYOF_BITMAP_ZERO(data->start_class);
2878 if (compat)
2879 ANYOF_BITMAP_SET(data->start_class, uc);
2880 data->start_class->flags &= ~ANYOF_EOS;
2881 if (uc < 0x100)
2882 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2883 }
2884 else if (flags & SCF_DO_STCLASS_OR) {
2885 /* false positive possible if the class is case-folded */
2886 if (uc < 0x100)
2887 ANYOF_BITMAP_SET(data->start_class, uc);
2888 else
2889 data->start_class->flags |= ANYOF_UNICODE_ALL;
2890 data->start_class->flags &= ~ANYOF_EOS;
2891 cl_and(data->start_class, and_withp);
2892 }
2893 flags &= ~SCF_DO_STCLASS;
2894 }
2895 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2896 I32 l = STR_LEN(scan);
2897 UV uc = *((U8*)STRING(scan));
2898
2899 /* Search for fixed substrings supports EXACT only. */
2900 if (flags & SCF_DO_SUBSTR) {
2901 assert(data);
304ee84b 2902 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2903 }
2904 if (UTF) {
2905 const U8 * const s = (U8 *)STRING(scan);
2906 l = utf8_length(s, s + l);
2907 uc = utf8_to_uvchr(s, NULL);
2908 }
2909 min += l;
2910 if (flags & SCF_DO_SUBSTR)
2911 data->pos_min += l;
2912 if (flags & SCF_DO_STCLASS_AND) {
2913 /* Check whether it is compatible with what we know already! */
2914 int compat = 1;
2915
2916 if (uc >= 0x100 ||
2917 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2918 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2919 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2920 compat = 0;
2921 ANYOF_CLASS_ZERO(data->start_class);
2922 ANYOF_BITMAP_ZERO(data->start_class);
2923 if (compat) {
2924 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2925 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2926 data->start_class->flags |= ANYOF_FOLD;
2927 if (OP(scan) == EXACTFL)
2928 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2929 }
8aa23a47
YO
2930 }
2931 else if (flags & SCF_DO_STCLASS_OR) {
2932 if (data->start_class->flags & ANYOF_FOLD) {
2933 /* false positive possible if the class is case-folded.
2934 Assume that the locale settings are the same... */
1aa99e6b
IH
2935 if (uc < 0x100)
2936 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2937 data->start_class->flags &= ~ANYOF_EOS;
2938 }
8aa23a47 2939 cl_and(data->start_class, and_withp);
653099ff 2940 }
8aa23a47
YO
2941 flags &= ~SCF_DO_STCLASS;
2942 }
2943 else if (strchr((const char*)PL_varies,OP(scan))) {
2944 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2945 I32 f = flags, pos_before = 0;
2946 regnode * const oscan = scan;
2947 struct regnode_charclass_class this_class;
2948 struct regnode_charclass_class *oclass = NULL;
2949 I32 next_is_eval = 0;
2950
2951 switch (PL_regkind[OP(scan)]) {
2952 case WHILEM: /* End of (?:...)* . */
2953 scan = NEXTOPER(scan);
2954 goto finish;
2955 case PLUS:
2956 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2957 next = NEXTOPER(scan);
2958 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2959 mincount = 1;
2960 maxcount = REG_INFTY;
2961 next = regnext(scan);
2962 scan = NEXTOPER(scan);
2963 goto do_curly;
2964 }
2965 }
2966 if (flags & SCF_DO_SUBSTR)
2967 data->pos_min++;
2968 min++;
2969 /* Fall through. */
2970 case STAR:
2971 if (flags & SCF_DO_STCLASS) {
2972 mincount = 0;
2973 maxcount = REG_INFTY;
2974 next = regnext(scan);
2975 scan = NEXTOPER(scan);
2976 goto do_curly;
2977 }
2978 is_inf = is_inf_internal = 1;
2979 scan = regnext(scan);
c277df42 2980 if (flags & SCF_DO_SUBSTR) {
304ee84b 2981 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2982 data->longest = &(data->longest_float);
c277df42 2983 }
8aa23a47
YO
2984 goto optimize_curly_tail;
2985 case CURLY:
2986 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2987 && (scan->flags == stopparen))
2988 {
2989 mincount = 1;
2990 maxcount = 1;
2991 } else {
2992 mincount = ARG1(scan);
2993 maxcount = ARG2(scan);
653099ff 2994 }
8aa23a47
YO
2995 next = regnext(scan);
2996 if (OP(scan) == CURLYX) {
2997 I32 lp = (data ? *(data->last_closep) : 0);
2998 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2999 }
8aa23a47
YO
3000 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3001 next_is_eval = (OP(scan) == EVAL);
3002 do_curly:
3003 if (flags & SCF_DO_SUBSTR) {
304ee84b 3004 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3005 pos_before = data->pos_min;
b45f050a 3006 }
8aa23a47
YO
3007 if (data) {
3008 fl = data->flags;
3009 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3010 if (is_inf)
3011 data->flags |= SF_IS_INF;
3012 }
3013 if (flags & SCF_DO_STCLASS) {
3014 cl_init(pRExC_state, &this_class);
3015 oclass = data->start_class;
3016 data->start_class = &this_class;
3017 f |= SCF_DO_STCLASS_AND;
3018 f &= ~SCF_DO_STCLASS_OR;
3019 }
3020 /* These are the cases when once a subexpression
3021 fails at a particular position, it cannot succeed
3022 even after backtracking at the enclosing scope.
3023
3024 XXXX what if minimal match and we are at the
3025 initial run of {n,m}? */
3026 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3027 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3028
8aa23a47
YO
3029 /* This will finish on WHILEM, setting scan, or on NULL: */
3030 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3031 last, data, stopparen, recursed, NULL,
3032 (mincount == 0
3033 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3034
8aa23a47
YO
3035 if (flags & SCF_DO_STCLASS)
3036 data->start_class = oclass;
3037 if (mincount == 0 || minnext == 0) {
3038 if (flags & SCF_DO_STCLASS_OR) {
3039 cl_or(pRExC_state, data->start_class, &this_class);
3040 }
3041 else if (flags & SCF_DO_STCLASS_AND) {
3042 /* Switch to OR mode: cache the old value of
3043 * data->start_class */
3044 INIT_AND_WITHP;
3045 StructCopy(data->start_class, and_withp,
3046 struct regnode_charclass_class);
3047 flags &= ~SCF_DO_STCLASS_AND;
3048 StructCopy(&this_class, data->start_class,
3049 struct regnode_charclass_class);
3050 flags |= SCF_DO_STCLASS_OR;
3051 data->start_class->flags |= ANYOF_EOS;
3052 }
3053 } else { /* Non-zero len */
3054 if (flags & SCF_DO_STCLASS_OR) {
3055 cl_or(pRExC_state, data->start_class, &this_class);
3056 cl_and(data->start_class, and_withp);
3057 }
3058 else if (flags & SCF_DO_STCLASS_AND)
3059 cl_and(data->start_class, &this_class);
3060 flags &= ~SCF_DO_STCLASS;
3061 }
3062 if (!scan) /* It was not CURLYX, but CURLY. */
3063 scan = next;
3064 if ( /* ? quantifier ok, except for (?{ ... }) */
3065 (next_is_eval || !(mincount == 0 && maxcount == 1))
3066 && (minnext == 0) && (deltanext == 0)
3067 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3068 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3069 && ckWARN(WARN_REGEXP))
3070 {
3071 vWARN(RExC_parse,
3072 "Quantifier unexpected on zero-length expression");
3073 }
3074
3075 min += minnext * mincount;
3076 is_inf_internal |= ((maxcount == REG_INFTY
3077 && (minnext + deltanext) > 0)
3078 || deltanext == I32_MAX);
3079 is_inf |= is_inf_internal;
3080 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3081
3082 /* Try powerful optimization CURLYX => CURLYN. */
3083 if ( OP(oscan) == CURLYX && data
3084 && data->flags & SF_IN_PAR
3085 && !(data->flags & SF_HAS_EVAL)
3086 && !deltanext && minnext == 1 ) {
3087 /* Try to optimize to CURLYN. */
3088 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3089 regnode * const nxt1 = nxt;
497b47a8 3090#ifdef DEBUGGING
8aa23a47 3091 regnode *nxt2;
497b47a8 3092#endif
c277df42 3093
8aa23a47
YO
3094 /* Skip open. */
3095 nxt = regnext(nxt);
3096 if (!strchr((const char*)PL_simple,OP(nxt))
3097 && !(PL_regkind[OP(nxt)] == EXACT
3098 && STR_LEN(nxt) == 1))
3099 goto nogo;
497b47a8 3100#ifdef DEBUGGING
8aa23a47 3101 nxt2 = nxt;
497b47a8 3102#endif
8aa23a47
YO
3103 nxt = regnext(nxt);
3104 if (OP(nxt) != CLOSE)
3105 goto nogo;
3106 if (RExC_open_parens) {
3107 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3108 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3109 }
3110 /* Now we know that nxt2 is the only contents: */
3111 oscan->flags = (U8)ARG(nxt);
3112 OP(oscan) = CURLYN;
3113 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3114
c277df42 3115#ifdef DEBUGGING
8aa23a47
YO
3116 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3117 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3118 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3119 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3120 OP(nxt + 1) = OPTIMIZED; /* was count. */
3121 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3122#endif
8aa23a47
YO
3123 }
3124 nogo:
3125
3126 /* Try optimization CURLYX => CURLYM. */
3127 if ( OP(oscan) == CURLYX && data
3128 && !(data->flags & SF_HAS_PAR)
3129 && !(data->flags & SF_HAS_EVAL)
3130 && !deltanext /* atom is fixed width */
3131 && minnext != 0 /* CURLYM can't handle zero width */
3132 ) {
3133 /* XXXX How to optimize if data == 0? */
3134 /* Optimize to a simpler form. */
3135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3136 regnode *nxt2;
3137
3138 OP(oscan) = CURLYM;
3139 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3140 && (OP(nxt2) != WHILEM))
3141 nxt = nxt2;
3142 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3143 /* Need to optimize away parenths. */
3144 if (data->flags & SF_IN_PAR) {
3145 /* Set the parenth number. */
3146 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3147
3148 if (OP(nxt) != CLOSE)
3149 FAIL("Panic opt close");
3150 oscan->flags = (U8)ARG(nxt);
3151 if (RExC_open_parens) {
3152 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3153 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3154 }
8aa23a47
YO
3155 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3156 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3157
c277df42 3158#ifdef DEBUGGING
8aa23a47
YO
3159 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3160 OP(nxt + 1) = OPTIMIZED; /* was count. */
3161 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3162 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3163#endif
c277df42 3164#if 0
8aa23a47
YO
3165 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3166 regnode *nnxt = regnext(nxt1);
3167
3168 if (nnxt == nxt) {
3169 if (reg_off_by_arg[OP(nxt1)])
3170 ARG_SET(nxt1, nxt2 - nxt1);
3171 else if (nxt2 - nxt1 < U16_MAX)
3172 NEXT_OFF(nxt1) = nxt2 - nxt1;
3173 else
3174 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3175 }
8aa23a47 3176 nxt1 = nnxt;
c277df42 3177 }
5d1c421c 3178#endif
8aa23a47
YO
3179 /* Optimize again: */
3180 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3181 NULL, stopparen, recursed, NULL, 0,depth+1);
3182 }
3183 else
3184 oscan->flags = 0;
3185 }
3186 else if ((OP(oscan) == CURLYX)
3187 && (flags & SCF_WHILEM_VISITED_POS)
3188 /* See the comment on a similar expression above.
3189 However, this time it not a subexpression
3190 we care about, but the expression itself. */
3191 && (maxcount == REG_INFTY)
3192 && data && ++data->whilem_c < 16) {
3193 /* This stays as CURLYX, we can put the count/of pair. */
3194 /* Find WHILEM (as in regexec.c) */
3195 regnode *nxt = oscan + NEXT_OFF(oscan);
3196
3197 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3198 nxt += ARG(nxt);
3199 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3200 | (RExC_whilem_seen << 4)); /* On WHILEM */
3201 }
3202 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3203 pars++;
3204 if (flags & SCF_DO_SUBSTR) {
3205 SV *last_str = NULL;
3206 int counted = mincount != 0;
a0ed51b3 3207
8aa23a47
YO
3208 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3209#if defined(SPARC64_GCC_WORKAROUND)
3210 I32 b = 0;
3211 STRLEN l = 0;
3212 const char *s = NULL;
3213 I32 old = 0;
b515a41d 3214
8aa23a47
YO
3215 if (pos_before >= data->last_start_min)
3216 b = pos_before;
3217 else
3218 b = data->last_start_min;
b515a41d 3219
8aa23a47
YO
3220 l = 0;
3221 s = SvPV_const(data->last_found, l);
3222 old = b - data->last_start_min;
3223
3224#else
3225 I32 b = pos_before >= data->last_start_min
3226 ? pos_before : data->last_start_min;
3227 STRLEN l;
3228 const char * const s = SvPV_const(data->last_found, l);
3229 I32 old = b - data->last_start_min;
3230#endif
3231
3232 if (UTF)
3233 old = utf8_hop((U8*)s, old) - (U8*)s;
3234
3235 l -= old;
3236 /* Get the added string: */
3237 last_str = newSVpvn(s + old, l);
3238 if (UTF)
3239 SvUTF8_on(last_str);
3240 if (deltanext == 0 && pos_before == b) {
3241 /* What was added is a constant string */
3242 if (mincount > 1) {
3243 SvGROW(last_str, (mincount * l) + 1);
3244 repeatcpy(SvPVX(last_str) + l,
3245 SvPVX_const(last_str), l, mincount - 1);
3246 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3247 /* Add additional parts. */
3248 SvCUR_set(data->last_found,
3249 SvCUR(data->last_found) - l);
3250 sv_catsv(data->last_found, last_str);
3251 {
3252 SV * sv = data->last_found;
3253 MAGIC *mg =
3254 SvUTF8(sv) && SvMAGICAL(sv) ?
3255 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3256 if (mg && mg->mg_len >= 0)
3257 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3258 }
8aa23a47 3259 data->last_end += l * (mincount - 1);
b515a41d 3260 }
8aa23a47
YO
3261 } else {
3262 /* start offset must point into the last copy */
3263 data->last_start_min += minnext * (mincount - 1);
3264 data->last_start_max += is_inf ? I32_MAX
3265 : (maxcount - 1) * (minnext + data->pos_delta);
3266 }
c277df42 3267 }
8aa23a47
YO
3268 /* It is counted once already... */
3269 data->pos_min += minnext * (mincount - counted);
3270 data->pos_delta += - counted * deltanext +
3271 (minnext + deltanext) * maxcount - minnext * mincount;
3272 if (mincount != maxcount) {
3273 /* Cannot extend fixed substrings found inside
3274 the group. */
304ee84b 3275 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3276 if (mincount && last_str) {
3277 SV * const sv = data->last_found;
3278 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3279 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3280
3281 if (mg)
3282 mg->mg_len = -1;
3283 sv_setsv(sv, last_str);
3284 data->last_end = data->pos_min;
3285 data->last_start_min =
3286 data->pos_min - CHR_SVLEN(last_str);
3287 data->last_start_max = is_inf
3288 ? I32_MAX
3289 : data->pos_min + data->pos_delta
3290 - CHR_SVLEN(last_str);
3291 }
3292 data->longest = &(data->longest_float);
3293 }
3294 SvREFCNT_dec(last_str);
c277df42 3295 }
8aa23a47
YO
3296 if (data && (fl & SF_HAS_EVAL))
3297 data->flags |= SF_HAS_EVAL;
3298 optimize_curly_tail:
3299 if (OP(oscan) != CURLYX) {
3300 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3301 && NEXT_OFF(next))
3302 NEXT_OFF(oscan) += NEXT_OFF(next);
3303 }
3304 continue;
3305 default: /* REF and CLUMP only? */
3306 if (flags & SCF_DO_SUBSTR) {
304ee84b 3307 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3308 data->longest = &(data->longest_float);
3309 }
3310 is_inf = is_inf_internal = 1;
3311 if (flags & SCF_DO_STCLASS_OR)
3312 cl_anything(pRExC_state, data->start_class);
3313 flags &= ~SCF_DO_STCLASS;
3314 break;
c277df42 3315 }
8aa23a47
YO
3316 }
3317 else if (strchr((const char*)PL_simple,OP(scan))) {
3318 int value = 0;
653099ff 3319
8aa23a47 3320 if (flags & SCF_DO_SUBSTR) {
304ee84b 3321 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3322 data->pos_min++;
3323 }
3324 min++;
3325 if (flags & SCF_DO_STCLASS) {
3326 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3327
8aa23a47
YO
3328 /* Some of the logic below assumes that switching
3329 locale on will only add false positives. */
3330 switch (PL_regkind[OP(scan)]) {
3331 case SANY:
3332 default:
3333 do_default:
3334 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3335 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3336 cl_anything(pRExC_state, data->start_class);
3337 break;
3338 case REG_ANY:
3339 if (OP(scan) == SANY)
3340 goto do_default;
3341 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3342 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3343 || (data->start_class->flags & ANYOF_CLASS));
3344 cl_anything(pRExC_state, data->start_class);
653099ff 3345 }
8aa23a47
YO
3346 if (flags & SCF_DO_STCLASS_AND || !value)
3347 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3348 break;
3349 case ANYOF:
3350 if (flags & SCF_DO_STCLASS_AND)
3351 cl_and(data->start_class,
3352 (struct regnode_charclass_class*)scan);
653099ff 3353 else
8aa23a47
YO
3354 cl_or(pRExC_state, data->start_class,
3355 (struct regnode_charclass_class*)scan);
3356 break;
3357 case ALNUM:
3358 if (flags & SCF_DO_STCLASS_AND) {
3359 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3361 for (value = 0; value < 256; value++)
3362 if (!isALNUM(value))
3363 ANYOF_BITMAP_CLEAR(data->start_class, value);
3364 }
653099ff 3365 }
8aa23a47
YO
3366 else {
3367 if (data->start_class->flags & ANYOF_LOCALE)
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3369 else {
3370 for (value = 0; value < 256; value++)
3371 if (isALNUM(value))
3372 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3373 }
8aa23a47
YO
3374 }
3375 break;
3376 case ALNUML:
3377 if (flags & SCF_DO_STCLASS_AND) {
3378 if (data->start_class->flags & ANYOF_LOCALE)
3379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3380 }
3381 else {
3382 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3383 data->start_class->flags |= ANYOF_LOCALE;
3384 }
3385 break;
3386 case NALNUM:
3387 if (flags & SCF_DO_STCLASS_AND) {
3388 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3390 for (value = 0; value < 256; value++)
3391 if (isALNUM(value))
3392 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3393 }
3394 }
8aa23a47
YO
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (!isALNUM(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
3402 }
653099ff 3403 }
8aa23a47
YO
3404 break;
3405 case NALNUML:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 if (data->start_class->flags & ANYOF_LOCALE)
3408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3409 }
8aa23a47
YO
3410 else {
3411 data->start_class->flags |= ANYOF_LOCALE;
3412 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3413 }
3414 break;
3415 case SPACE:
3416 if (flags & SCF_DO_STCLASS_AND) {
3417 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3418 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3419 for (value = 0; value < 256; value++)
3420 if (!isSPACE(value))
3421 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3422 }
3423 }
8aa23a47
YO
3424 else {
3425 if (data->start_class->flags & ANYOF_LOCALE)
3426 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3427 else {
3428 for (value = 0; value < 256; value++)
3429 if (isSPACE(value))
3430 ANYOF_BITMAP_SET(data->start_class, value);
3431 }
653099ff 3432 }
8aa23a47
YO
3433 break;
3434 case SPACEL:
3435 if (flags & SCF_DO_STCLASS_AND) {
3436 if (data->start_class->flags & ANYOF_LOCALE)
3437 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3438 }
3439 else {
3440 data->start_class->flags |= ANYOF_LOCALE;
3441 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3442 }
3443 break;
3444 case NSPACE:
3445 if (flags & SCF_DO_STCLASS_AND) {
3446 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3448 for (value = 0; value < 256; value++)
3449 if (isSPACE(value))
3450 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3451 }
8aa23a47
YO
3452 }
3453 else {
3454 if (data->start_class->flags & ANYOF_LOCALE)
3455 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3456 else {
3457 for (value = 0; value < 256; value++)
3458 if (!isSPACE(value))
3459 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3460 }
3461 }
8aa23a47
YO
3462 break;
3463 case NSPACEL:
3464 if (flags & SCF_DO_STCLASS_AND) {
3465 if (data->start_class->flags & ANYOF_LOCALE) {
3466 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3467 for (value = 0; value < 256; value++)
3468 if (!isSPACE(value))
3469 ANYOF_BITMAP_CLEAR(data->start_class, value);
3470 }
653099ff 3471 }
8aa23a47
YO
3472 else {
3473 data->start_class->flags |= ANYOF_LOCALE;
3474 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3475 }
3476 break;
3477 case DIGIT:
3478 if (flags & SCF_DO_STCLASS_AND) {
3479 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3480 for (value = 0; value < 256; value++)
3481 if (!isDIGIT(value))
3482 ANYOF_BITMAP_CLEAR(data->start_class, value);
3483 }
3484 else {
3485 if (data->start_class->flags & ANYOF_LOCALE)
3486 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3487 else {
3488 for (value = 0; value < 256; value++)
3489 if (isDIGIT(value))
3490 ANYOF_BITMAP_SET(data->start_class, value);
3491 }
3492 }
3493 break;
3494 case NDIGIT:
3495 if (flags & SCF_DO_STCLASS_AND) {
3496 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3497 for (value = 0; value < 256; value++)
3498 if (isDIGIT(value))
3499 ANYOF_BITMAP_CLEAR(data->start_class, value);
3500 }
3501 else {
3502 if (data->start_class->flags & ANYOF_LOCALE)
3503 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3504 else {
3505 for (value = 0; value < 256; value++)
3506 if (!isDIGIT(value))
3507 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3508 }
3509 }
8aa23a47
YO
3510 break;
3511 }
3512 if (flags & SCF_DO_STCLASS_OR)
3513 cl_and(data->start_class, and_withp);
3514 flags &= ~SCF_DO_STCLASS;
3515 }
3516 }
3517 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3518 data->flags |= (OP(scan) == MEOL
3519 ? SF_BEFORE_MEOL
3520 : SF_BEFORE_SEOL);
3521 }
3522 else if ( PL_regkind[OP(scan)] == BRANCHJ
3523 /* Lookbehind, or need to calculate parens/evals/stclass: */
3524 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3525 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3526 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3527 || OP(scan) == UNLESSM )
3528 {
3529 /* Negative Lookahead/lookbehind
3530 In this case we can't do fixed string optimisation.
3531 */
1de06328 3532
8aa23a47
YO
3533 I32 deltanext, minnext, fake = 0;
3534 regnode *nscan;
3535 struct regnode_charclass_class intrnl;
3536 int f = 0;
1de06328 3537
8aa23a47
YO
3538 data_fake.flags = 0;
3539 if (data) {
3540 data_fake.whilem_c = data->whilem_c;
3541 data_fake.last_closep = data->last_closep;
c277df42 3542 }
8aa23a47
YO
3543 else
3544 data_fake.last_closep = &fake;
58e23c8d 3545 data_fake.pos_delta = delta;
8aa23a47
YO
3546 if ( flags & SCF_DO_STCLASS && !scan->flags
3547 && OP(scan) == IFMATCH ) { /* Lookahead */
3548 cl_init(pRExC_state, &intrnl);
3549 data_fake.start_class = &intrnl;
3550 f |= SCF_DO_STCLASS_AND;
3551 }
3552 if (flags & SCF_WHILEM_VISITED_POS)
3553 f |= SCF_WHILEM_VISITED_POS;
3554 next = regnext(scan);
3555 nscan = NEXTOPER(NEXTOPER(scan));
3556 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3557 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3558 if (scan->flags) {
3559 if (deltanext) {
58e23c8d 3560 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3561 }
3562 else if (minnext > (I32)U8_MAX) {
58e23c8d 3563 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3564 }
3565 scan->flags = (U8)minnext;
3566 }
3567 if (data) {
3568 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3569 pars++;
3570 if (data_fake.flags & SF_HAS_EVAL)
3571 data->flags |= SF_HAS_EVAL;
3572 data->whilem_c = data_fake.whilem_c;
3573 }
3574 if (f & SCF_DO_STCLASS_AND) {
3575 const int was = (data->start_class->flags & ANYOF_EOS);
3576
3577 cl_and(data->start_class, &intrnl);
3578 if (was)
3579 data->start_class->flags |= ANYOF_EOS;
3580 }
cb434fcc 3581 }
8aa23a47
YO
3582#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3583 else {
3584 /* Positive Lookahead/lookbehind
3585 In this case we can do fixed string optimisation,
3586 but we must be careful about it. Note in the case of
3587 lookbehind the positions will be offset by the minimum
3588 length of the pattern, something we won't know about
3589 until after the recurse.
3590 */
3591 I32 deltanext, fake = 0;
3592 regnode *nscan;
3593 struct regnode_charclass_class intrnl;
3594 int f = 0;
3595 /* We use SAVEFREEPV so that when the full compile
3596 is finished perl will clean up the allocated
3597 minlens when its all done. This was we don't
3598 have to worry about freeing them when we know
3599 they wont be used, which would be a pain.
3600 */
3601 I32 *minnextp;
3602 Newx( minnextp, 1, I32 );
3603 SAVEFREEPV(minnextp);
3604
3605 if (data) {
3606 StructCopy(data, &data_fake, scan_data_t);
3607 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3608 f |= SCF_DO_SUBSTR;
3609 if (scan->flags)
304ee84b 3610 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3611 data_fake.last_found=newSVsv(data->last_found);
3612 }
3613 }
3614 else
3615 data_fake.last_closep = &fake;
3616 data_fake.flags = 0;
58e23c8d 3617 data_fake.pos_delta = delta;
8aa23a47
YO
3618 if (is_inf)
3619 data_fake.flags |= SF_IS_INF;
3620 if ( flags & SCF_DO_STCLASS && !scan->flags
3621 && OP(scan) == IFMATCH ) { /* Lookahead */
3622 cl_init(pRExC_state, &intrnl);
3623 data_fake.start_class = &intrnl;
3624 f |= SCF_DO_STCLASS_AND;
3625 }
3626 if (flags & SCF_WHILEM_VISITED_POS)
3627 f |= SCF_WHILEM_VISITED_POS;
3628 next = regnext(scan);
3629 nscan = NEXTOPER(NEXTOPER(scan));
3630
3631 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3632 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3633 if (scan->flags) {
3634 if (deltanext) {
58e23c8d 3635 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3636 }
3637 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3638 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3639 }
3640 scan->flags = (U8)*minnextp;
3641 }
3642
3643 *minnextp += min;
3644
3645 if (f & SCF_DO_STCLASS_AND) {
3646 const int was = (data->start_class->flags & ANYOF_EOS);
3647
3648 cl_and(data->start_class, &intrnl);
3649 if (was)
3650 data->start_class->flags |= ANYOF_EOS;
3651 }
3652 if (data) {
3653 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3654 pars++;
3655 if (data_fake.flags & SF_HAS_EVAL)
3656 data->flags |= SF_HAS_EVAL;
3657 data->whilem_c = data_fake.whilem_c;
3658 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3659 if (RExC_rx->minlen<*minnextp)
3660 RExC_rx->minlen=*minnextp;
304ee84b 3661 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3662 SvREFCNT_dec(data_fake.last_found);
3663
3664 if ( data_fake.minlen_fixed != minlenp )
3665 {
3666 data->offset_fixed= data_fake.offset_fixed;
3667 data->minlen_fixed= data_fake.minlen_fixed;
3668 data->lookbehind_fixed+= scan->flags;
3669 }
3670 if ( data_fake.minlen_float != minlenp )
3671 {
3672 data->minlen_float= data_fake.minlen_float;
3673 data->offset_float_min=data_fake.offset_float_min;
3674 data->offset_float_max=data_fake.offset_float_max;
3675 data->lookbehind_float+= scan->flags;
3676 }
3677 }
3678 }
3679
3680
40d049e4 3681 }
8aa23a47
YO
3682#endif
3683 }
3684 else if (OP(scan) == OPEN) {
3685 if (stopparen != (I32)ARG(scan))
3686 pars++;
3687 }
3688 else if (OP(scan) == CLOSE) {
3689 if (stopparen == (I32)ARG(scan)) {
3690 break;
3691 }
3692 if ((I32)ARG(scan) == is_par) {
3693 next = regnext(scan);
b515a41d 3694
8aa23a47
YO
3695 if ( next && (OP(next) != WHILEM) && next < last)
3696 is_par = 0; /* Disable optimization */
40d049e4 3697 }
8aa23a47
YO
3698 if (data)
3699 *(data->last_closep) = ARG(scan);
3700 }
3701 else if (OP(scan) == EVAL) {
c277df42
IZ
3702 if (data)
3703 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3704 }
3705 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3706 if (flags & SCF_DO_SUBSTR) {
304ee84b 3707 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3708 flags &= ~SCF_DO_SUBSTR;
40d049e4 3709 }
8aa23a47
YO
3710 if (data && OP(scan)==ACCEPT) {
3711 data->flags |= SCF_SEEN_ACCEPT;
3712 if (stopmin > min)
3713 stopmin = min;
e2e6a0f1 3714 }
8aa23a47
YO
3715 }
3716 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3717 {
0f5d15d6 3718 if (flags & SCF_DO_SUBSTR) {
304ee84b 3719 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3720 data->longest = &(data->longest_float);
3721 }
3722 is_inf = is_inf_internal = 1;
653099ff 3723 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3724 cl_anything(pRExC_state, data->start_class);
96776eda 3725 flags &= ~SCF_DO_STCLASS;
8aa23a47 3726 }
58e23c8d 3727 else if (OP(scan) == GPOS) {
bbe252da 3728 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3729 !(delta || is_inf || (data && data->pos_delta)))
3730 {
bbe252da
YO
3731 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3732 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3733 if (RExC_rx->gofs < (U32)min)
3734 RExC_rx->gofs = min;
3735 } else {
bbe252da 3736 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3737 RExC_rx->gofs = 0;
3738 }
3739 }
786e8c11 3740#ifdef TRIE_STUDY_OPT
40d049e4 3741#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3742 else if (PL_regkind[OP(scan)] == TRIE) {
3743 /* NOTE - There is similar code to this block above for handling
3744 BRANCH nodes on the initial study. If you change stuff here
3745 check there too. */
3746 regnode *trie_node= scan;
3747 regnode *tail= regnext(scan);
f8fc2ecf 3748 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3749 I32 max1 = 0, min1 = I32_MAX;
3750 struct regnode_charclass_class accum;
3751
3752 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3753 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3754 if (flags & SCF_DO_STCLASS)
3755 cl_init_zero(pRExC_state, &accum);
3756
3757 if (!trie->jump) {
3758 min1= trie->minlen;
3759 max1= trie->maxlen;
3760 } else {
3761 const regnode *nextbranch= NULL;
3762 U32 word;
3763
3764 for ( word=1 ; word <= trie->wordcount ; word++)
3765 {
3766 I32 deltanext=0, minnext=0, f = 0, fake;
3767 struct regnode_charclass_class this_class;
3768
3769 data_fake.flags = 0;
3770 if (data) {
3771 data_fake.whilem_c = data->whilem_c;
3772 data_fake.last_closep = data->last_closep;
3773 }
3774 else
3775 data_fake.last_closep = &fake;
58e23c8d 3776 data_fake.pos_delta = delta;
8aa23a47
YO
3777 if (flags & SCF_DO_STCLASS) {
3778 cl_init(pRExC_state, &this_class);
3779 data_fake.start_class = &this_class;
3780 f = SCF_DO_STCLASS_AND;
3781 }
3782 if (flags & SCF_WHILEM_VISITED_POS)
3783 f |= SCF_WHILEM_VISITED_POS;
3784
3785 if (trie->jump[word]) {
3786 if (!nextbranch)
3787 nextbranch = trie_node + trie->jump[0];
3788 scan= trie_node + trie->jump[word];
3789 /* We go from the jump point to the branch that follows
3790 it. Note this means we need the vestigal unused branches
3791 even though they arent otherwise used.
3792 */
3793 minnext = study_chunk(pRExC_state, &scan, minlenp,
3794 &deltanext, (regnode *)nextbranch, &data_fake,
3795 stopparen, recursed, NULL, f,depth+1);
3796 }
3797 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3798 nextbranch= regnext((regnode*)nextbranch);
3799
3800 if (min1 > (I32)(minnext + trie->minlen))
3801 min1 = minnext + trie->minlen;
3802 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3803 max1 = minnext + deltanext + trie->maxlen;
3804 if (deltanext == I32_MAX)
3805 is_inf = is_inf_internal = 1;
3806
3807 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3808 pars++;
3809 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3810 if ( stopmin > min + min1)
3811 stopmin = min + min1;
3812 flags &= ~SCF_DO_SUBSTR;
3813 if (data)
3814 data->flags |= SCF_SEEN_ACCEPT;
3815 }
3816 if (data) {
3817 if (data_fake.flags & SF_HAS_EVAL)
3818 data->flags |= SF_HAS_EVAL;
3819 data->whilem_c = data_fake.whilem_c;
3820 }
3821 if (flags & SCF_DO_STCLASS)
3822 cl_or(pRExC_state, &accum, &this_class);
3823 }
3824 }
3825 if (flags & SCF_DO_SUBSTR) {
3826 data->pos_min += min1;
3827 data->pos_delta += max1 - min1;
3828 if (max1 != min1 || is_inf)
3829 data->longest = &(data->longest_float);
3830 }
3831 min += min1;
3832 delta += max1 - min1;
3833 if (flags & SCF_DO_STCLASS_OR) {
3834 cl_or(pRExC_state, data->start_class, &accum);
3835 if (min1) {
3836 cl_and(data->start_class, and_withp);
3837 flags &= ~SCF_DO_STCLASS;
3838 }
3839 }
3840 else if (flags & SCF_DO_STCLASS_AND) {
3841 if (min1) {
3842 cl_and(data->start_class, &accum);
3843 flags &= ~SCF_DO_STCLASS;
3844 }
3845 else {
3846 /* Switch to OR mode: cache the old value of
3847 * data->start_class */
3848 INIT_AND_WITHP;
3849 StructCopy(data->start_class, and_withp,
3850 struct regnode_charclass_class);
3851 flags &= ~SCF_DO_STCLASS_AND;
3852 StructCopy(&accum, data->start_class,
3853 struct regnode_charclass_class);
3854 flags |= SCF_DO_STCLASS_OR;
3855 data->start_class->flags |= ANYOF_EOS;
3856 }
3857 }
3858 scan= tail;
3859 continue;
3860 }
786e8c11 3861#else
8aa23a47 3862 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3863 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3864 U8*bang=NULL;
3865
3866 min += trie->minlen;
3867 delta += (trie->maxlen - trie->minlen);
3868 flags &= ~SCF_DO_STCLASS; /* xxx */
3869 if (flags & SCF_DO_SUBSTR) {
304ee84b 3870 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3871 data->pos_min += trie->minlen;
3872 data->pos_delta += (trie->maxlen - trie->minlen);
3873 if (trie->maxlen != trie->minlen)
3874 data->longest = &(data->longest_float);
3875 }
3876 if (trie->jump) /* no more substrings -- for now /grr*/
3877 flags &= ~SCF_DO_SUBSTR;
b515a41d 3878 }
8aa23a47
YO
3879#endif /* old or new */
3880#endif /* TRIE_STUDY_OPT */
3881 /* Else: zero-length, ignore. */
3882 scan = regnext(scan);
3883 }
3884 if (frame) {
3885 last = frame->last;
3886 scan = frame->next;
3887 stopparen = frame->stop;
3888 frame = frame->prev;
3889 goto fake_study_recurse;
c277df42
IZ
3890 }
3891
3892 finish:
8aa23a47 3893 assert(!frame);
304ee84b 3894 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3895
c277df42 3896 *scanp = scan;
aca2d497 3897 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3898 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3899 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3900 if (is_par > (I32)U8_MAX)
c277df42
IZ
3901 is_par = 0;
3902 if (is_par && pars==1 && data) {
3903 data->flags |= SF_IN_PAR;
3904 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3905 }
3906 else if (pars && data) {
c277df42
IZ
3907 data->flags |= SF_HAS_PAR;
3908 data->flags &= ~SF_IN_PAR;
3909 }
653099ff 3910 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3911 cl_and(data->start_class, and_withp);
786e8c11
YO
3912 if (flags & SCF_TRIE_RESTUDY)
3913 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3914
304ee84b 3915 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3916
e2e6a0f1 3917 return min < stopmin ? min : stopmin;
c277df42
IZ
3918}
3919
2eccd3b2
NC
3920STATIC U32
3921S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3922{
4a4e7719
NC
3923 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3924
3925 Renewc(RExC_rxi->data,
3926 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3927 char, struct reg_data);
3928 if(count)
f8fc2ecf 3929 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3930 else
f8fc2ecf 3931 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3932 RExC_rxi->data->count = count + n;
3933 Copy(s, RExC_rxi->data->what + count, n, U8);
3934 return count;
c277df42
IZ
3935}
3936
f8149455 3937/*XXX: todo make this not included in a non debugging perl */
76234dfb 3938#ifndef PERL_IN_XSUB_RE
d88dccdf 3939void
864dbfa3 3940Perl_reginitcolors(pTHX)
d88dccdf 3941{
97aff369 3942 dVAR;
1df70142 3943 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3944 if (s) {
1df70142
AL
3945 char *t = savepv(s);
3946 int i = 0;
3947 PL_colors[0] = t;
d88dccdf 3948 while (++i < 6) {
1df70142
AL
3949 t = strchr(t, '\t');
3950 if (t) {
3951 *t = '\0';
3952 PL_colors[i] = ++t;
d88dccdf
IZ
3953 }
3954 else
1df70142 3955 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3956 }
3957 } else {
1df70142 3958 int i = 0;
b81d288d 3959 while (i < 6)
06b5626a 3960 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3961 }
3962 PL_colorset = 1;
3963}
76234dfb 3964#endif
8615cb43 3965
07be1b83 3966
786e8c11
YO
3967#ifdef TRIE_STUDY_OPT
3968#define CHECK_RESTUDY_GOTO \
3969 if ( \
3970 (data.flags & SCF_TRIE_RESTUDY) \
3971 && ! restudied++ \
3972 ) goto reStudy
3973#else
3974#define CHECK_RESTUDY_GOTO
3975#endif
f9f4320a 3976
a687059c 3977/*
e50aee73 3978 - pregcomp - compile a regular expression into internal code
a687059c
LW
3979 *
3980 * We can't allocate space until we know how big the compiled form will be,
3981 * but we can't compile it (and thus know how big it is) until we've got a
3982 * place to put the code. So we cheat: we compile it twice, once with code
3983 * generation turned off and size counting turned on, and once "for real".
3984 * This also means that we don't allocate space until we are sure that the
3985 * thing really will compile successfully, and we never have to move the
3986 * code and thus invalidate pointers into it. (Note that it has to be in
3987 * one piece because free() must be able to free it all.) [NB: not true in perl]
3988 *
3989 * Beware that the optimization-preparation code in here knows about some
3990 * of the structure of the compiled regexp. [I'll say.]
3991 */
b9b4dddf
YO
3992
3993
3994
f9f4320a 3995#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3996#define RE_ENGINE_PTR &PL_core_reg_engine
3997#else
f9f4320a
YO
3998extern const struct regexp_engine my_reg_engine;
3999#define RE_ENGINE_PTR &my_reg_engine
4000#endif
6d5c990f
RGS
4001
4002#ifndef PERL_IN_XSUB_RE
a687059c 4003regexp *
864dbfa3 4004Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 4005{
97aff369 4006 dVAR;
6d5c990f 4007 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
4008 /* Dispatch a request to compile a regexp to correct
4009 regexp engine. */
f9f4320a
YO
4010 if (table) {
4011 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4012 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4013 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4014 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4015 DEBUG_COMPILE_r({
8d8756e7 4016 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4017 SvIV(*ptr));
4018 });
f2f78491 4019 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 4020 }
b9b4dddf 4021 }
2a5d9b1d
RGS
4022 return Perl_re_compile(aTHX_ exp, xend, pm);
4023}
6d5c990f 4024#endif
2a5d9b1d
RGS
4025
4026regexp *
4027Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4028{
4029 dVAR;
a0d0e21e 4030 register regexp *r;
f8fc2ecf 4031 register regexp_internal *ri;
c277df42 4032 regnode *scan;
c277df42 4033 regnode *first;
a0d0e21e 4034 I32 flags;
a0d0e21e
LW
4035 I32 minlen = 0;
4036 I32 sawplus = 0;
4037 I32 sawopen = 0;
2c2d71f5 4038 scan_data_t data;
830247a4 4039 RExC_state_t RExC_state;
be8e71aa 4040 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4041#ifdef TRIE_STUDY_OPT
4042 int restudied= 0;
4043 RExC_state_t copyRExC_state;
4044#endif
2a5d9b1d 4045 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4046 DEBUG_r(if (!PL_colorset) reginitcolors());
4047
a0d0e21e 4048 if (exp == NULL)
c277df42 4049 FAIL("NULL regexp argument");
a0d0e21e 4050
a5961de5 4051 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4052
5cfc7842 4053 RExC_precomp = exp;
a3621e74 4054 DEBUG_COMPILE_r({
ab3bbdeb
YO
4055 SV *dsv= sv_newmortal();
4056 RE_PV_QUOTED_DECL(s, RExC_utf8,
4057 dsv, RExC_precomp, (xend - exp), 60);
4058 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4059 PL_colors[4],PL_colors[5],s);
a5961de5 4060 });
e2509266 4061 RExC_flags = pm->op_pmflags;
830247a4 4062 RExC_sawback = 0;
bbce6d69 4063
830247a4
IZ
4064 RExC_seen = 0;
4065 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4066 RExC_seen_evals = 0;
4067 RExC_extralen = 0;
c277df42 4068
bbce6d69 4069 /* First pass: determine size, legality. */
830247a4 4070 RExC_parse = exp;
fac92740 4071 RExC_start = exp;
830247a4
IZ
4072 RExC_end = xend;
4073 RExC_naughty = 0;
4074 RExC_npar = 1;
e2e6a0f1 4075 RExC_nestroot = 0;
830247a4
IZ
4076 RExC_size = 0L;
4077 RExC_emit = &PL_regdummy;
4078 RExC_whilem_seen = 0;
fc8cd66c 4079 RExC_charnames = NULL;
40d049e4
YO
4080 RExC_open_parens = NULL;
4081 RExC_close_parens = NULL;
4082 RExC_opend = NULL;
81714fb9 4083 RExC_paren_names = NULL;
1f1031fe
YO
4084#ifdef DEBUGGING
4085 RExC_paren_name_list = NULL;
4086#endif
40d049e4
YO
4087 RExC_recurse = NULL;
4088 RExC_recurse_count = 0;
81714fb9 4089
85ddcde9
JH
4090#if 0 /* REGC() is (currently) a NOP at the first pass.
4091 * Clever compilers notice this and complain. --jhi */
830247a4 4092 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4093#endif
3dab1dad
YO
4094 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4095 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4096 RExC_precomp = NULL;
a0d0e21e
LW
4097 return(NULL);
4098 }
07be1b83 4099 DEBUG_PARSE_r({
81714fb9
YO
4100 PerlIO_printf(Perl_debug_log,
4101 "Required size %"IVdf" nodes\n"
4102 "Starting second pass (creation)\n",
4103 (IV)RExC_size);
07be1b83
YO
4104 RExC_lastnum=0;
4105 RExC_lastparse=NULL;
4106 });
c277df42
IZ
4107 /* Small enough for pointer-storage convention?
4108 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4109 if (RExC_size >= 0x10000L && RExC_extralen)
4110 RExC_size += RExC_extralen;
c277df42 4111 else
830247a4
IZ
4112 RExC_extralen = 0;
4113 if (RExC_whilem_seen > 15)
4114 RExC_whilem_seen = 15;
a0d0e21e 4115
e2e6a0f1
YO
4116#ifdef DEBUGGING
4117 /* Make room for a sentinel value at the end of the program */
4118 RExC_size++;
4119#endif
4120
f9f4320a
YO
4121 /* Allocate space and zero-initialize. Note, the two step process
4122 of zeroing when in debug mode, thus anything assigned has to
4123 happen after that */
f8fc2ecf
YO
4124 Newxz(r, 1, regexp);
4125 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4126 char, regexp_internal);
4127 if ( r == NULL || ri == NULL )
b45f050a 4128 FAIL("Regexp out of space");
0f79a09d
GS
4129#ifdef DEBUGGING
4130 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4131 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4132#else
f8fc2ecf
YO
4133 /* bulk initialize base fields with 0. */
4134 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4135#endif
58e23c8d
YO
4136
4137 /* non-zero initialization begins here */
f8fc2ecf 4138 RXi_SET( r, ri );
f9f4320a 4139 r->engine= RE_ENGINE_PTR;
c277df42 4140 r->refcnt = 1;
bbce6d69 4141 r->prelen = xend - exp;
5cfc7842 4142 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4143 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4144 r->intflags = 0;
830247a4 4145 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4146
6bda09f9 4147 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4148 Newxz(RExC_open_parens, RExC_npar,regnode *);
4149 SAVEFREEPV(RExC_open_parens);
4150 Newxz(RExC_close_parens,RExC_npar,regnode *);
4151 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4152 }
4153
4154 /* Useful during FAIL. */
7122b237
YO
4155#ifdef RE_TRACK_PATTERN_OFFSETS
4156 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4157 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4158 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4159 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4160 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4161#endif
4162 SetProgLen(ri,RExC_size);
830247a4 4163 RExC_rx = r;
f8fc2ecf 4164 RExC_rxi = ri;
bbce6d69 4165
4166 /* Second pass: emit code. */
e2509266 4167 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4168 RExC_parse = exp;
4169 RExC_end = xend;
4170 RExC_naughty = 0;
4171 RExC_npar = 1;
f8fc2ecf
YO
4172 RExC_emit_start = ri->program;
4173 RExC_emit = ri->program;
e2e6a0f1
YO
4174#ifdef DEBUGGING
4175 /* put a sentinal on the end of the program so we can check for
4176 overwrites */
f8fc2ecf 4177 ri->program[RExC_size].type = 255;
e2e6a0f1 4178#endif
2cd61cdb 4179 /* Store the count of eval-groups for security checks: */
f8149455 4180 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4181 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4182 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4183 return(NULL);
6bda09f9 4184
07be1b83
YO
4185 /* XXXX To minimize changes to RE engine we always allocate
4186 3-units-long substrs field. */
4187 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4188 if (RExC_recurse_count) {
4189 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4190 SAVEFREEPV(RExC_recurse);
4191 }
a0d0e21e 4192
07be1b83 4193reStudy:
1de06328 4194 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4195 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4196
07be1b83
YO
4197#ifdef TRIE_STUDY_OPT
4198 if ( restudied ) {
5d458dd8 4199 U32 seen=RExC_seen;
07be1b83 4200 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4201
4202 RExC_state = copyRExC_state;
4203 if (seen & REG_TOP_LEVEL_BRANCHES)
4204 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4205 else
4206 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4207 if (data.last_found) {
07be1b83 4208 SvREFCNT_dec(data.longest_fixed);
07be1b83 4209 SvREFCNT_dec(data.longest_float);
07be1b83 4210 SvREFCNT_dec(data.last_found);
1de06328 4211 }
40d049e4 4212 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4213 } else {
40d049e4 4214 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4215 copyRExC_state = RExC_state;
07be1b83 4216 }
40d049e4
YO
4217#else
4218 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4219#endif
fc8cd66c 4220
a0d0e21e 4221 /* Dig out information for optimizations. */
bbe252da 4222 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4223 pm->op_pmflags = RExC_flags;
a0ed51b3 4224 if (UTF)
bbe252da 4225 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4226 ri->regstclass = NULL;
830247a4 4227 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4228 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4229 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4230
1de06328
YO
4231 /* testing for BRANCH here tells us whether there is "must appear"
4232 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4233 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4234 I32 fake;
c5254dd6 4235 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4236 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4237 int stclass_flag;
07be1b83 4238 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4239
4240 first = scan;
c277df42 4241 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4242 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4243 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4244 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4245 /* for now we can't handle lookbehind IFMATCH*/
4246 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4247 (OP(first) == PLUS) ||
4248 (OP(first) == MINMOD) ||
653099ff 4249 /* An {n,m} with n>0 */
07be1b83
YO
4250 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4251 {
786e8c11 4252
a0d0e21e
LW
4253 if (OP(first) == PLUS)
4254 sawplus = 1;
4255 else
3dab1dad 4256 first += regarglen[OP(first)];
07be1b83
YO
4257 if (OP(first) == IFMATCH) {
4258 first = NEXTOPER(first);
4259 first += EXTRA_STEP_2ARGS;
7c167cea 4260 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4261 first = NEXTOPER(first);
a687059c
LW
4262 }
4263
a0d0e21e
LW
4264 /* Starting-point info. */
4265 again:
786e8c11 4266 DEBUG_PEEP("first:",first,0);
07be1b83 4267 /* Ignore EXACT as we deal with it later. */
3dab1dad 4268 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4269 if (OP(first) == EXACT)
6f207bd3 4270 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4271 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4272 ri->regstclass = first;
b3c9acc1 4273 }
07be1b83 4274#ifdef TRIE_STCLASS
786e8c11 4275 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4276 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4277 {
786e8c11 4278 regnode *trie_op;
07be1b83 4279 /* this can happen only on restudy */
786e8c11 4280 if ( OP(first) == TRIE ) {
c944940b 4281 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4282 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4283 StructCopy(first,trieop,struct regnode_1);
4284 trie_op=(regnode *)trieop;
4285 } else {
c944940b 4286 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4287 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4288 StructCopy(first,trieop,struct regnode_charclass);
4289 trie_op=(regnode *)trieop;
4290 }
1de06328 4291 OP(trie_op)+=2;
786e8c11 4292 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4293 ri->regstclass = trie_op;
07be1b83
YO
4294 }
4295#endif
bfed75c6 4296 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4297 ri->regstclass = first;
3dab1dad
YO
4298 else if (PL_regkind[OP(first)] == BOUND ||
4299 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4300 ri->regstclass = first;
3dab1dad 4301 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4302 r->extflags |= (OP(first) == MBOL
4303 ? RXf_ANCH_MBOL
cad2e5aa 4304 : (OP(first) == SBOL
bbe252da
YO
4305 ? RXf_ANCH_SBOL
4306 : RXf_ANCH_BOL));
a0d0e21e 4307 first = NEXTOPER(first);
774d564b 4308 goto again;
4309 }
4310 else if (OP(first) == GPOS) {
bbe252da 4311 r->extflags |= RXf_ANCH_GPOS;
774d564b 4312 first = NEXTOPER(first);
4313 goto again;
a0d0e21e 4314 }
cf2a2b69
YO
4315 else if ((!sawopen || !RExC_sawback) &&
4316 (OP(first) == STAR &&
3dab1dad 4317 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4318 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4319 {
4320 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4321 const int type =
4322 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4323 ? RXf_ANCH_MBOL
4324 : RXf_ANCH_SBOL;
4325 r->extflags |= type;
4326 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4327 first = NEXTOPER(first);
774d564b 4328 goto again;
a0d0e21e 4329 }
b81d288d 4330 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4331 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4332 /* x+ must match at the 1st pos of run of x's */
bbe252da 4333 r->intflags |= PREGf_SKIP;
a0d0e21e 4334
c277df42 4335 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4336#ifdef TRIE_STUDY_OPT
81714fb9 4337 DEBUG_PARSE_r(
be8e71aa
YO
4338 if (!restudied)
4339 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4340 (IV)(first - scan + 1))
4341 );
4342#else
81714fb9 4343 DEBUG_PARSE_r(
be8e71aa
YO
4344 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4345 (IV)(first - scan + 1))
4346 );
4347#endif
4348
4349
a0d0e21e
LW
4350 /*
4351 * If there's something expensive in the r.e., find the
4352 * longest literal string that must appear and make it the
4353 * regmust. Resolve ties in favor of later strings, since
4354 * the regstart check works with the beginning of the r.e.
4355 * and avoiding duplication strengthens checking. Not a
4356 * strong reason, but sufficient in the absence of others.
4357 * [Now we resolve ties in favor of the earlier string if
c277df42 4358 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4359 * earlier string may buy us something the later one won't.]
4360 */
de8c5301 4361
396482e1
GA
4362 data.longest_fixed = newSVpvs("");
4363 data.longest_float = newSVpvs("");
4364 data.last_found = newSVpvs("");
c277df42
IZ
4365 data.longest = &(data.longest_fixed);
4366 first = scan;
f8fc2ecf 4367 if (!ri->regstclass) {
830247a4 4368 cl_init(pRExC_state, &ch_class);
653099ff
GS
4369 data.start_class = &ch_class;
4370 stclass_flag = SCF_DO_STCLASS_AND;
4371 } else /* XXXX Check for BOUND? */
4372 stclass_flag = 0;
cb434fcc 4373 data.last_closep = &last_close;
de8c5301 4374
1de06328 4375 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4376 &data, -1, NULL, NULL,
4377 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4378
07be1b83 4379
786e8c11
YO
4380 CHECK_RESTUDY_GOTO;
4381
4382
830247a4 4383 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4384 && data.last_start_min == 0 && data.last_end > 0
830247a4 4385 && !RExC_seen_zerolen
2bf803e2 4386 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4387 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4388 r->extflags |= RXf_CHECK_ALL;
304ee84b 4389 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4390 SvREFCNT_dec(data.last_found);
4391
1de06328
YO
4392 /* Note that code very similar to this but for anchored string
4393 follows immediately below, changes may need to be made to both.
4394 Be careful.
4395 */
a0ed51b3 4396 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4397 if (longest_float_length
c277df42
IZ
4398 || (data.flags & SF_FL_BEFORE_EOL
4399 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4400 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4401 {
1182767e 4402 I32 t,ml;
cf93c79d 4403
1de06328 4404 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4405 && data.offset_fixed == data.offset_float_min
4406 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4407 goto remove_float; /* As in (a)+. */
4408
1de06328
YO
4409 /* copy the information about the longest float from the reg_scan_data
4410 over to the program. */
33b8afdf
JH
4411 if (SvUTF8(data.longest_float)) {
4412 r->float_utf8 = data.longest_float;
c445ea15 4413 r->float_substr = NULL;
33b8afdf
JH
4414 } else {
4415 r->float_substr = data.longest_float;
c445ea15 4416 r->float_utf8 = NULL;
33b8afdf 4417 }
1de06328
YO
4418 /* float_end_shift is how many chars that must be matched that
4419 follow this item. We calculate it ahead of time as once the
4420 lookbehind offset is added in we lose the ability to correctly
4421 calculate it.*/
4422 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4423 : (I32)longest_float_length;
1de06328
YO
4424 r->float_end_shift = ml - data.offset_float_min
4425 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4426 + data.lookbehind_float;
4427 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4428 r->float_max_offset = data.offset_float_max;
1182767e 4429 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4430 r->float_max_offset -= data.lookbehind_float;
4431
cf93c79d
IZ
4432 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4433 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4434 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4435 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4436 }
4437 else {
aca2d497 4438 remove_float:
c445ea15 4439 r->float_substr = r->float_utf8 = NULL;
c277df42 4440 SvREFCNT_dec(data.longest_float);
c5254dd6 4441 longest_float_length = 0;
a0d0e21e 4442 }
c277df42 4443
1de06328
YO
4444 /* Note that code very similar to this but for floating string
4445 is immediately above, changes may need to be made to both.
4446 Be careful.
4447 */
a0ed51b3 4448 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4449 if (longest_fixed_length
c277df42
IZ
4450 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4451 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4452 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4453 {
1182767e 4454 I32 t,ml;
cf93c79d 4455
1de06328
YO
4456 /* copy the information about the longest fixed
4457 from the reg_scan_data over to the program. */
33b8afdf
JH
4458 if (SvUTF8(data.longest_fixed)) {
4459 r->anchored_utf8 = data.longest_fixed;
c445ea15 4460 r->anchored_substr = NULL;
33b8afdf
JH
4461 } else {
4462 r->anchored_substr = data.longest_fixed;
c445ea15 4463 r->anchored_utf8 = NULL;
33b8afdf 4464 }
1de06328
YO
4465 /* fixed_end_shift is how many chars that must be matched that
4466 follow this item. We calculate it ahead of time as once the
4467 lookbehind offset is added in we lose the ability to correctly
4468 calculate it.*/
4469 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4470 : (I32)longest_fixed_length;
1de06328
YO
4471 r->anchored_end_shift = ml - data.offset_fixed
4472 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4473 + data.lookbehind_fixed;
4474 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4475
cf93c79d
IZ
4476 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4477 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4478 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4479 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4480 }
4481 else {
c445ea15 4482 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4483 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4484 longest_fixed_length = 0;
a0d0e21e 4485 }
f8fc2ecf
YO
4486 if (ri->regstclass
4487 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4488 ri->regstclass = NULL;
33b8afdf
JH
4489 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4490 && stclass_flag
653099ff 4491 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4492 && !cl_is_anything(data.start_class))
4493 {
2eccd3b2 4494 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4495
f8fc2ecf 4496 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4497 struct regnode_charclass_class);
4498 StructCopy(data.start_class,
f8fc2ecf 4499 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4500 struct regnode_charclass_class);
f8fc2ecf 4501 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4502 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4503 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4504 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4505 PerlIO_printf(Perl_debug_log,
a0288114 4506 "synthetic stclass \"%s\".\n",
3f7c398e 4507 SvPVX_const(sv));});
653099ff 4508 }
c277df42
IZ
4509
4510 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4511 if (longest_fixed_length > longest_float_length) {
1de06328 4512 r->check_end_shift = r->anchored_end_shift;
c277df42 4513 r->check_substr = r->anchored_substr;
33b8afdf 4514 r->check_utf8 = r->anchored_utf8;
c277df42 4515 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4516 if (r->extflags & RXf_ANCH_SINGLE)
4517 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4518 }
4519 else {
1de06328 4520 r->check_end_shift = r->float_end_shift;
c277df42 4521 r->check_substr = r->float_substr;
33b8afdf 4522 r->check_utf8 = r->float_utf8;
1de06328
YO
4523 r->check_offset_min = r->float_min_offset;
4524 r->check_offset_max = r->float_max_offset;
a0d0e21e 4525 }
30382c73
IZ
4526 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4527 This should be changed ASAP! */
bbe252da
YO
4528 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4529 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4530 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4531 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4532 }
1de06328
YO
4533 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4534 if ( (STRLEN)minlen < longest_float_length )
4535 minlen= longest_float_length;
4536 if ( (STRLEN)minlen < longest_fixed_length )
4537 minlen= longest_fixed_length;
4538 */
a0ed51b3
LW
4539 }
4540 else {
c277df42
IZ
4541 /* Several toplevels. Best we can is to set minlen. */
4542 I32 fake;
653099ff 4543 struct regnode_charclass_class ch_class;
cb434fcc 4544 I32 last_close = 0;
c277df42 4545
5d458dd8 4546 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4547
f8fc2ecf 4548 scan = ri->program + 1;
830247a4 4549 cl_init(pRExC_state, &ch_class);
653099ff 4550 data.start_class = &ch_class;
cb434fcc 4551 data.last_closep = &last_close;
07be1b83 4552
de8c5301 4553
1de06328 4554 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4555 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4556
786e8c11 4557 CHECK_RESTUDY_GOTO;
07be1b83 4558
33b8afdf 4559 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4560 = r->float_substr = r->float_utf8 = NULL;
653099ff 4561 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4562 && !cl_is_anything(data.start_class))
4563 {
2eccd3b2 4564 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4565
f8fc2ecf 4566 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4567 struct regnode_charclass_class);
4568 StructCopy(data.start_class,
f8fc2ecf 4569 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4570 struct regnode_charclass_class);
f8fc2ecf 4571 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4572 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4573 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4574 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4575 PerlIO_printf(Perl_debug_log,
a0288114 4576 "synthetic stclass \"%s\".\n",
3f7c398e 4577 SvPVX_const(sv));});
653099ff 4578 }
a0d0e21e
LW
4579 }
4580
1de06328
YO
4581 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4582 the "real" pattern. */
cf9788e3
RGS
4583 DEBUG_OPTIMISE_r({
4584 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4585 (IV)minlen, (IV)r->minlen);
cf9788e3 4586 });
de8c5301 4587 r->minlenret = minlen;
1de06328
YO
4588 if (r->minlen < minlen)
4589 r->minlen = minlen;
4590
b81d288d 4591 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4592 r->extflags |= RXf_GPOS_SEEN;
830247a4 4593 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4594 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4595 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4596 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4597 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4598 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4599 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4600 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4601 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4602 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4603 if (RExC_paren_names)
4604 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4605 else
4606 r->paren_names = NULL;
e357fc67
YO
4607 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4608 r->extflags |= RXf_WHITE;
4609 else if (r->prelen == 1 && r->precomp[0] == '^')
4610 r->extflags |= RXf_START_ONLY;
4611
1f1031fe
YO
4612#ifdef DEBUGGING
4613 if (RExC_paren_names) {
4614 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4615 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4616 } else
1f1031fe 4617#endif
cde0cee5 4618 ri->name_list_idx = 0;
1f1031fe 4619
40d049e4
YO
4620 if (RExC_recurse_count) {
4621 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4622 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4623 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4624 }
4625 }
a02a5408
JC
4626 Newxz(r->startp, RExC_npar, I32);
4627 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4628 /* assume we don't need to swap parens around before we match */
4629
be8e71aa
YO
4630 DEBUG_DUMP_r({
4631 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4632 regdump(r);
4633 });
7122b237
YO
4634#ifdef RE_TRACK_PATTERN_OFFSETS
4635 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4636 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4637 U32 i;
4638 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4639 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4640 for (i = 1; i <= len; i++) {
7122b237 4641 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4642 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4643 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4644 }
4645 PerlIO_printf(Perl_debug_log, "\n");
4646 });
7122b237 4647#endif
a0d0e21e 4648 return(r);
a687059c
LW
4649}
4650
f9f4320a 4651#undef CORE_ONLY_BLOCK
f9f4320a 4652#undef RE_ENGINE_PTR
3dab1dad 4653
9af228c6 4654#ifndef PERL_IN_XSUB_RE
81714fb9 4655SV*
44a2ac75 4656Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
81714fb9 4657{
44a2ac75
YO
4658 AV *retarray = NULL;
4659 SV *ret;
4660 if (flags & 1)
4661 retarray=newAV();
4662
4663 if (from_re || PL_curpm) {
4664 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
81714fb9
YO
4665 if (rx && rx->paren_names) {
4666 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4667 if (he_str) {
4668 IV i;
4669 SV* sv_dat=HeVAL(he_str);
4670 I32 *nums=(I32*)SvPVX(sv_dat);
4671 for ( i=0; i<SvIVX(sv_dat); i++ ) {
ded05c2a
YO
4672 if ((I32)(rx->nparens) >= nums[i]
4673 && rx->startp[nums[i]] != -1
4674 && rx->endp[nums[i]] != -1)
81714fb9 4675 {
44a2ac75
YO
4676 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4677 if (!retarray)
4678 return ret;
4679 } else {
4680 ret = newSVsv(&PL_sv_undef);
4681 }
4682 if (retarray) {
4683 SvREFCNT_inc(ret);
4684 av_push(retarray, ret);
81714fb9
YO
4685 }
4686 }
44a2ac75
YO
4687 if (retarray)
4688 return (SV*)retarray;
81714fb9
YO
4689 }
4690 }
4691 }
44a2ac75
YO
4692 return NULL;
4693}
4694
4695SV*
4696Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4697{
4698 char *s = NULL;
a9d504c3 4699 I32 i = 0;
44a2ac75
YO
4700 I32 s1, t1;
4701 SV *sv = usesv ? usesv : newSVpvs("");
cb5c6874 4702 PERL_UNUSED_ARG(flags);
44a2ac75 4703
cde0cee5
YO
4704 if (!rx->subbeg) {
4705 sv_setsv(sv,&PL_sv_undef);
4706 return sv;
4707 }
4708 else
4709 if (paren == -2 && rx->startp[0] != -1) {
44a2ac75
YO
4710 /* $` */
4711 i = rx->startp[0];
cde0cee5 4712 s = rx->subbeg;
44a2ac75
YO
4713 }
4714 else
cde0cee5 4715 if (paren == -1 && rx->endp[0] != -1) {
44a2ac75
YO
4716 /* $' */
4717 s = rx->subbeg + rx->endp[0];
4718 i = rx->sublen - rx->endp[0];
4719 }
4720 else
4721 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4722 (s1 = rx->startp[paren]) != -1 &&
4723 (t1 = rx->endp[paren]) != -1)
4724 {
4725 /* $& $1 ... */
4726 i = t1 - s1;
4727 s = rx->subbeg + s1;
cde0cee5
YO
4728 } else {
4729 sv_setsv(sv,&PL_sv_undef);
4730 return sv;
4731 }
4732 assert(rx->sublen >= (s - rx->subbeg) + i );
4733 if (i >= 0) {
4734 const int oldtainted = PL_tainted;
4735 TAINT_NOT;
4736 sv_setpvn(sv, s, i);
4737 PL_tainted = oldtainted;
4738 if ( (rx->extflags & RXf_CANY_SEEN)
4739 ? (RX_MATCH_UTF8(rx)
4740 && (!i || is_utf8_string((U8*)s, i)))
4741 : (RX_MATCH_UTF8(rx)) )
4742 {
4743 SvUTF8_on(sv);
4744 }
4745 else
4746 SvUTF8_off(sv);
4747 if (PL_tainting) {
4748 if (RX_MATCH_TAINTED(rx)) {
4749 if (SvTYPE(sv) >= SVt_PVMG) {
4750 MAGIC* const mg = SvMAGIC(sv);
4751 MAGIC* mgt;
4752 PL_tainted = 1;
4753 SvMAGIC_set(sv, mg->mg_moremagic);
4754 SvTAINT(sv);
4755 if ((mgt = SvMAGIC(sv))) {
4756 mg->mg_moremagic = mgt;
4757 SvMAGIC_set(sv, mg);
44a2ac75 4758 }
cde0cee5
YO
4759 } else {
4760 PL_tainted = 1;
4761 SvTAINT(sv);
4762 }
4763 } else
4764 SvTAINTED_off(sv);
44a2ac75 4765 }
81714fb9 4766 } else {
44a2ac75 4767 sv_setsv(sv,&PL_sv_undef);
81714fb9 4768 }
44a2ac75 4769 return sv;
81714fb9 4770}
9af228c6 4771#endif
0a4db386 4772
894be9b7 4773/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4774 * If flags is REG_RSN_RETURN_NULL returns null.
4775 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4776 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4777 * to the parsed name as looked up in the RExC_paren_names hash.
4778 * If there is an error throws a vFAIL().. type exception.
894be9b7 4779 */
0a4db386
YO
4780
4781#define REG_RSN_RETURN_NULL 0
4782#define REG_RSN_RETURN_NAME 1
4783#define REG_RSN_RETURN_DATA 2
4784
894be9b7
YO
4785STATIC SV*
4786S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4787 char *name_start = RExC_parse;
1f1031fe
YO
4788
4789 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4790 /* skip IDFIRST by using do...while */
4791 if (UTF)
4792 do {
4793 RExC_parse += UTF8SKIP(RExC_parse);
4794 } while (isALNUM_utf8((U8*)RExC_parse));
4795 else
4796 do {
4797 RExC_parse++;
4798 } while (isALNUM(*RExC_parse));
894be9b7 4799 }
1f1031fe 4800
0a4db386
YO
4801 if ( flags ) {
4802 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4803 (int)(RExC_parse - name_start)));
894be9b7 4804 if (UTF)
0a4db386
YO
4805 SvUTF8_on(sv_name);
4806 if ( flags == REG_RSN_RETURN_NAME)
4807 return sv_name;
4808 else if (flags==REG_RSN_RETURN_DATA) {
4809 HE *he_str = NULL;
4810 SV *sv_dat = NULL;
4811 if ( ! sv_name ) /* should not happen*/
4812 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4813 if (RExC_paren_names)
4814 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4815 if ( he_str )
4816 sv_dat = HeVAL(he_str);
4817 if ( ! sv_dat )
4818 vFAIL("Reference to nonexistent named group");
4819 return sv_dat;
4820 }
4821 else {
4822 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4823 }
4824 /* NOT REACHED */
894be9b7 4825 }
0a4db386 4826 return NULL;
894be9b7
YO
4827}
4828
3dab1dad
YO
4829#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4830 int rem=(int)(RExC_end - RExC_parse); \
4831 int cut; \
4832 int num; \
4833 int iscut=0; \
4834 if (rem>10) { \
4835 rem=10; \
4836 iscut=1; \
4837 } \
4838 cut=10-rem; \
4839 if (RExC_lastparse!=RExC_parse) \
4840 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4841 rem, RExC_parse, \
4842 cut + 4, \
4843 iscut ? "..." : "<" \
4844 ); \
4845 else \
4846 PerlIO_printf(Perl_debug_log,"%16s",""); \
4847 \
4848 if (SIZE_ONLY) \
4849 num=RExC_size; \
4850 else \
4851 num=REG_NODE_NUM(RExC_emit); \
4852 if (RExC_lastnum!=num) \
0a4db386 4853 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4854 else \
0a4db386 4855 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4856 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4857 (int)((depth*2)), "", \
3dab1dad
YO
4858 (funcname) \
4859 ); \
4860 RExC_lastnum=num; \
4861 RExC_lastparse=RExC_parse; \
4862})
4863
07be1b83
YO
4864
4865
3dab1dad
YO
4866#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4867 DEBUG_PARSE_MSG((funcname)); \
4868 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4869})
6bda09f9
YO
4870#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4871 DEBUG_PARSE_MSG((funcname)); \
4872 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4873})
a687059c
LW
4874/*
4875 - reg - regular expression, i.e. main body or parenthesized thing
4876 *
4877 * Caller must absorb opening parenthesis.
4878 *
4879 * Combining parenthesis handling with the base level of regular expression
4880 * is a trifle forced, but the need to tie the tails of the branches to what
4881 * follows makes it hard to avoid.
4882 */
07be1b83
YO
4883#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4884#ifdef DEBUGGING
4885#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4886#else
4887#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4888#endif
3dab1dad 4889
e2e6a0f1
YO
4890/* this idea is borrowed from STR_WITH_LEN in handy.h */
4891#define CHECK_WORD(s,v,l) \
4892 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4893
76e3520e 4894STATIC regnode *
3dab1dad 4895S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4896 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4897{
27da23d5 4898 dVAR;
c277df42
IZ
4899 register regnode *ret; /* Will be the head of the group. */
4900 register regnode *br;
4901 register regnode *lastbr;
cbbf8932 4902 register regnode *ender = NULL;
a0d0e21e 4903 register I32 parno = 0;
cbbf8932
AL
4904 I32 flags;
4905 const I32 oregflags = RExC_flags;
6136c704
AL
4906 bool have_branch = 0;
4907 bool is_open = 0;
9d1d55b5
JP
4908
4909 /* for (?g), (?gc), and (?o) warnings; warning
4910 about (?c) will warn about (?g) -- japhy */
4911
6136c704
AL
4912#define WASTED_O 0x01
4913#define WASTED_G 0x02
4914#define WASTED_C 0x04
4915#define WASTED_GC (0x02|0x04)
cbbf8932 4916 I32 wastedflags = 0x00;
9d1d55b5 4917
fac92740 4918 char * parse_start = RExC_parse; /* MJD */
a28509cc 4919 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4920
3dab1dad
YO
4921 GET_RE_DEBUG_FLAGS_DECL;
4922 DEBUG_PARSE("reg ");
4923
4924
821b33a5 4925 *flagp = 0; /* Tentatively. */
a0d0e21e 4926
9d1d55b5 4927
a0d0e21e
LW
4928 /* Make an OPEN node, if parenthesized. */
4929 if (paren) {
e2e6a0f1
YO
4930 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4931 char *start_verb = RExC_parse;
4932 STRLEN verb_len = 0;
4933 char *start_arg = NULL;
4934 unsigned char op = 0;
4935 int argok = 1;
4936 int internal_argval = 0; /* internal_argval is only useful if !argok */
4937 while ( *RExC_parse && *RExC_parse != ')' ) {
4938 if ( *RExC_parse == ':' ) {
4939 start_arg = RExC_parse + 1;
4940 break;
4941 }
4942 RExC_parse++;
4943 }
4944 ++start_verb;
4945 verb_len = RExC_parse - start_verb;
4946 if ( start_arg ) {
4947 RExC_parse++;
4948 while ( *RExC_parse && *RExC_parse != ')' )
4949 RExC_parse++;
4950 if ( *RExC_parse != ')' )
4951 vFAIL("Unterminated verb pattern argument");
4952 if ( RExC_parse == start_arg )
4953 start_arg = NULL;
4954 } else {
4955 if ( *RExC_parse != ')' )
4956 vFAIL("Unterminated verb pattern");
4957 }
5d458dd8 4958
e2e6a0f1
YO
4959 switch ( *start_verb ) {
4960 case 'A': /* (*ACCEPT) */
4961 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4962 op = ACCEPT;
4963 internal_argval = RExC_nestroot;
4964 }
4965 break;
4966 case 'C': /* (*COMMIT) */
4967 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4968 op = COMMIT;
e2e6a0f1
YO
4969 break;
4970 case 'F': /* (*FAIL) */
4971 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4972 op = OPFAIL;
4973 argok = 0;
4974 }
4975 break;
5d458dd8
YO
4976 case ':': /* (*:NAME) */
4977 case 'M': /* (*MARK:NAME) */
4978 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4979 op = MARKPOINT;
5d458dd8
YO
4980 argok = -1;
4981 }
4982 break;
4983 case 'P': /* (*PRUNE) */
4984 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4985 op = PRUNE;
e2e6a0f1 4986 break;
5d458dd8
YO
4987 case 'S': /* (*SKIP) */
4988 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4989 op = SKIP;
4990 break;
4991 case 'T': /* (*THEN) */
4992 /* [19:06] <TimToady> :: is then */
4993 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4994 op = CUTGROUP;
4995 RExC_seen |= REG_SEEN_CUTGROUP;
4996 }
e2e6a0f1
YO
4997 break;
4998 }
4999 if ( ! op ) {
5000 RExC_parse++;
5001 vFAIL3("Unknown verb pattern '%.*s'",
5002 verb_len, start_verb);
5003 }
5004 if ( argok ) {
5005 if ( start_arg && internal_argval ) {
5006 vFAIL3("Verb pattern '%.*s' may not have an argument",
5007 verb_len, start_verb);
5008 } else if ( argok < 0 && !start_arg ) {
5009 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5010 verb_len, start_verb);
5011 } else {
5012 ret = reganode(pRExC_state, op, internal_argval);
5013 if ( ! internal_argval && ! SIZE_ONLY ) {
5014 if (start_arg) {
5015 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5016 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5017 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5018 ret->flags = 0;
5019 } else {
5020 ret->flags = 1;
5021 }
5022 }
5023 }
5024 if (!internal_argval)
5025 RExC_seen |= REG_SEEN_VERBARG;
5026 } else if ( start_arg ) {
5027 vFAIL3("Verb pattern '%.*s' may not have an argument",
5028 verb_len, start_verb);
5029 } else {
5030 ret = reg_node(pRExC_state, op);
5031 }
5032 nextchar(pRExC_state);
5033 return ret;
5034 } else
fac92740 5035 if (*RExC_parse == '?') { /* (?...) */
6136c704 5036 bool is_logical = 0;
a28509cc 5037 const char * const seqstart = RExC_parse;
ca9dfc88 5038
830247a4
IZ
5039 RExC_parse++;
5040 paren = *RExC_parse++;
c277df42 5041 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5042 switch (paren) {
894be9b7 5043
1f1031fe
YO
5044 case 'P': /* (?P...) variants for those used to PCRE/Python */
5045 paren = *RExC_parse++;
5046 if ( paren == '<') /* (?P<...>) named capture */
5047 goto named_capture;
5048 else if (paren == '>') { /* (?P>name) named recursion */
5049 goto named_recursion;
5050 }
5051 else if (paren == '=') { /* (?P=...) named backref */
5052 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5053 you change this make sure you change that */
5054 char* name_start = RExC_parse;
5055 U32 num = 0;
5056 SV *sv_dat = reg_scan_name(pRExC_state,
5057 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5058 if (RExC_parse == name_start || *RExC_parse != ')')
5059 vFAIL2("Sequence %.3s... not terminated",parse_start);
5060
5061 if (!SIZE_ONLY) {
5062 num = add_data( pRExC_state, 1, "S" );
5063 RExC_rxi->data->data[num]=(void*)sv_dat;
5064 SvREFCNT_inc(sv_dat);
5065 }
5066 RExC_sawback = 1;
5067 ret = reganode(pRExC_state,
5068 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5069 num);
5070 *flagp |= HASWIDTH;
5071
5072 Set_Node_Offset(ret, parse_start+1);
5073 Set_Node_Cur_Length(ret); /* MJD */
5074
5075 nextchar(pRExC_state);
5076 return ret;
5077 }
57b84237
YO
5078 RExC_parse++;
5079 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5080 /*NOTREACHED*/
5081 case '<': /* (?<...) */
b81d288d 5082 if (*RExC_parse == '!')
c277df42 5083 paren = ',';
0a4db386 5084 else if (*RExC_parse != '=')
1f1031fe 5085 named_capture:
0a4db386 5086 { /* (?<...>) */
81714fb9 5087 char *name_start;
894be9b7 5088 SV *svname;
81714fb9
YO
5089 paren= '>';
5090 case '\'': /* (?'...') */
5091 name_start= RExC_parse;
0a4db386
YO
5092 svname = reg_scan_name(pRExC_state,
5093 SIZE_ONLY ? /* reverse test from the others */
5094 REG_RSN_RETURN_NAME :
5095 REG_RSN_RETURN_NULL);
57b84237
YO
5096 if (RExC_parse == name_start) {
5097 RExC_parse++;
5098 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5099 /*NOTREACHED*/
5100 }
81714fb9
YO
5101 if (*RExC_parse != paren)
5102 vFAIL2("Sequence (?%c... not terminated",
5103 paren=='>' ? '<' : paren);
5104 if (SIZE_ONLY) {
e62cc96a
YO
5105 HE *he_str;
5106 SV *sv_dat = NULL;
894be9b7
YO
5107 if (!svname) /* shouldnt happen */
5108 Perl_croak(aTHX_
5109 "panic: reg_scan_name returned NULL");
81714fb9
YO
5110 if (!RExC_paren_names) {
5111 RExC_paren_names= newHV();
5112 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5113#ifdef DEBUGGING
5114 RExC_paren_name_list= newAV();
5115 sv_2mortal((SV*)RExC_paren_name_list);
5116#endif
81714fb9
YO
5117 }
5118 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5119 if ( he_str )
81714fb9 5120 sv_dat = HeVAL(he_str);
e62cc96a 5121 if ( ! sv_dat ) {
81714fb9 5122 /* croak baby croak */
e62cc96a
YO
5123 Perl_croak(aTHX_
5124 "panic: paren_name hash element allocation failed");
5125 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
5126 IV count=SvIV(sv_dat);
5127 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5128 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5129 pv[count]=RExC_npar;
5130 SvIVX(sv_dat)++;
5131 } else {
5132 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5133 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5134 SvIOK_on(sv_dat);
5135 SvIVX(sv_dat)= 1;
e62cc96a 5136 }
1f1031fe
YO
5137#ifdef DEBUGGING
5138 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5139 SvREFCNT_dec(svname);
5140#endif
e62cc96a 5141
81714fb9
YO
5142 /*sv_dump(sv_dat);*/
5143 }
5144 nextchar(pRExC_state);
5145 paren = 1;
5146 goto capturing_parens;
5147 }
5148 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5149 RExC_parse++;
fac92740
MJD
5150 case '=': /* (?=...) */
5151 case '!': /* (?!...) */
830247a4 5152 RExC_seen_zerolen++;
e2e6a0f1
YO
5153 if (*RExC_parse == ')') {
5154 ret=reg_node(pRExC_state, OPFAIL);
5155 nextchar(pRExC_state);
5156 return ret;
5157 }
fac92740
MJD
5158 case ':': /* (?:...) */
5159 case '>': /* (?>...) */
a0d0e21e 5160 break;
fac92740
MJD
5161 case '$': /* (?$...) */
5162 case '@': /* (?@...) */
8615cb43 5163 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5164 break;
fac92740 5165 case '#': /* (?#...) */
830247a4
IZ
5166 while (*RExC_parse && *RExC_parse != ')')
5167 RExC_parse++;
5168 if (*RExC_parse != ')')
c277df42 5169 FAIL("Sequence (?#... not terminated");
830247a4 5170 nextchar(pRExC_state);
a0d0e21e
LW
5171 *flagp = TRYAGAIN;
5172 return NULL;
894be9b7
YO
5173 case '0' : /* (?0) */
5174 case 'R' : /* (?R) */
5175 if (*RExC_parse != ')')
6bda09f9 5176 FAIL("Sequence (?R) not terminated");
1a147d38 5177 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
5178 nextchar(pRExC_state);
5179 return ret;
5180 /*notreached*/
894be9b7
YO
5181 { /* named and numeric backreferences */
5182 I32 num;
894be9b7
YO
5183 case '&': /* (?&NAME) */
5184 parse_start = RExC_parse - 1;
1f1031fe 5185 named_recursion:
894be9b7 5186 {
0a4db386
YO
5187 SV *sv_dat = reg_scan_name(pRExC_state,
5188 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5189 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5190 }
5191 goto gen_recurse_regop;
5192 /* NOT REACHED */
542fa716
YO
5193 case '+':
5194 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5195 RExC_parse++;
5196 vFAIL("Illegal pattern");
5197 }
5198 goto parse_recursion;
5199 /* NOT REACHED*/
5200 case '-': /* (?-1) */
5201 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5202 RExC_parse--; /* rewind to let it be handled later */
5203 goto parse_flags;
5204 }
5205 /*FALLTHROUGH */
6bda09f9
YO
5206 case '1': case '2': case '3': case '4': /* (?1) */
5207 case '5': case '6': case '7': case '8': case '9':
5208 RExC_parse--;
542fa716 5209 parse_recursion:
894be9b7
YO
5210 num = atoi(RExC_parse);
5211 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5212 if (*RExC_parse == '-')
5213 RExC_parse++;
6bda09f9
YO
5214 while (isDIGIT(*RExC_parse))
5215 RExC_parse++;
5216 if (*RExC_parse!=')')
5217 vFAIL("Expecting close bracket");
894be9b7
YO
5218
5219 gen_recurse_regop:
542fa716
YO
5220 if ( paren == '-' ) {
5221 /*
5222 Diagram of capture buffer numbering.
5223 Top line is the normal capture buffer numbers
5224 Botton line is the negative indexing as from
5225 the X (the (?-2))
5226
5227 + 1 2 3 4 5 X 6 7
5228 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5229 - 5 4 3 2 1 X x x
5230
5231 */
5232 num = RExC_npar + num;
5233 if (num < 1) {
5234 RExC_parse++;
5235 vFAIL("Reference to nonexistent group");
5236 }
5237 } else if ( paren == '+' ) {
5238 num = RExC_npar + num - 1;
5239 }
5240
1a147d38 5241 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5242 if (!SIZE_ONLY) {
5243 if (num > (I32)RExC_rx->nparens) {
5244 RExC_parse++;
5245 vFAIL("Reference to nonexistent group");
5246 }
40d049e4 5247 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5248 RExC_emit++;
226de585 5249 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5250 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5251 } else {
6bda09f9 5252 RExC_size++;
6bda09f9 5253 }
0a4db386 5254 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5255 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5256 Set_Node_Offset(ret, parse_start); /* MJD */
5257
6bda09f9
YO
5258 nextchar(pRExC_state);
5259 return ret;
894be9b7
YO
5260 } /* named and numeric backreferences */
5261 /* NOT REACHED */
5262
fac92740 5263 case 'p': /* (?p...) */
9014280d 5264 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5265 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5266 /* FALL THROUGH*/
fac92740 5267 case '?': /* (??...) */
6136c704 5268 is_logical = 1;
57b84237
YO
5269 if (*RExC_parse != '{') {
5270 RExC_parse++;
5271 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5272 /*NOTREACHED*/
5273 }
830247a4 5274 paren = *RExC_parse++;
0f5d15d6 5275 /* FALL THROUGH */
fac92740 5276 case '{': /* (?{...}) */
c277df42 5277 {
2eccd3b2
NC
5278 I32 count = 1;
5279 U32 n = 0;
c277df42 5280 char c;
830247a4 5281 char *s = RExC_parse;
c277df42 5282
830247a4
IZ
5283 RExC_seen_zerolen++;
5284 RExC_seen |= REG_SEEN_EVAL;
5285 while (count && (c = *RExC_parse)) {
6136c704
AL
5286 if (c == '\\') {
5287 if (RExC_parse[1])
5288 RExC_parse++;
5289 }
b81d288d 5290 else if (c == '{')
c277df42 5291 count++;
b81d288d 5292 else if (c == '}')
c277df42 5293 count--;
830247a4 5294 RExC_parse++;
c277df42 5295 }
6136c704 5296 if (*RExC_parse != ')') {
b81d288d 5297 RExC_parse = s;
b45f050a
JF
5298 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5299 }
c277df42 5300 if (!SIZE_ONLY) {
f3548bdc 5301 PAD *pad;
6136c704
AL
5302 OP_4tree *sop, *rop;
5303 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5304
569233ed
SB
5305 ENTER;
5306 Perl_save_re_context(aTHX);
f3548bdc 5307 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5308 sop->op_private |= OPpREFCOUNTED;
5309 /* re_dup will OpREFCNT_inc */
5310 OpREFCNT_set(sop, 1);
569233ed 5311 LEAVE;
c277df42 5312
830247a4 5313 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5314 RExC_rxi->data->data[n] = (void*)rop;
5315 RExC_rxi->data->data[n+1] = (void*)sop;
5316 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5317 SvREFCNT_dec(sv);
a0ed51b3 5318 }
e24b16f9 5319 else { /* First pass */
830247a4 5320 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5321 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5322 /* No compiled RE interpolated, has runtime
5323 components ===> unsafe. */
5324 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5325 if (PL_tainting && PL_tainted)
cc6b7395 5326 FAIL("Eval-group in insecure regular expression");
54df2634 5327#if PERL_VERSION > 8
923e4eb5 5328 if (IN_PERL_COMPILETIME)
b5c19bd7 5329 PL_cv_has_eval = 1;
54df2634 5330#endif
c277df42 5331 }
b5c19bd7 5332
830247a4 5333 nextchar(pRExC_state);
6136c704 5334 if (is_logical) {
830247a4 5335 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5336 if (!SIZE_ONLY)
5337 ret->flags = 2;
3dab1dad 5338 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5339 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5340 return ret;
5341 }
ccb2c380
MP
5342 ret = reganode(pRExC_state, EVAL, n);
5343 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5344 Set_Node_Offset(ret, parse_start);
5345 return ret;
c277df42 5346 }
fac92740 5347 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5348 {
0a4db386 5349 int is_define= 0;
fac92740 5350 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5351 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5352 || RExC_parse[1] == '<'
830247a4 5353 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5354 I32 flag;
5355
830247a4 5356 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5357 if (!SIZE_ONLY)
5358 ret->flags = 1;
3dab1dad 5359 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5360 goto insert_if;
b81d288d 5361 }
a0ed51b3 5362 }
0a4db386
YO
5363 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5364 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5365 {
5366 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5367 char *name_start= RExC_parse++;
2eccd3b2 5368 U32 num = 0;
0a4db386
YO
5369 SV *sv_dat=reg_scan_name(pRExC_state,
5370 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5371 if (RExC_parse == name_start || *RExC_parse != ch)
5372 vFAIL2("Sequence (?(%c... not terminated",
5373 (ch == '>' ? '<' : ch));
5374 RExC_parse++;
5375 if (!SIZE_ONLY) {
5376 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5377 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5378 SvREFCNT_inc(sv_dat);
5379 }
5380 ret = reganode(pRExC_state,NGROUPP,num);
5381 goto insert_if_check_paren;
5382 }
5383 else if (RExC_parse[0] == 'D' &&
5384 RExC_parse[1] == 'E' &&
5385 RExC_parse[2] == 'F' &&
5386 RExC_parse[3] == 'I' &&
5387 RExC_parse[4] == 'N' &&
5388 RExC_parse[5] == 'E')
5389 {
5390 ret = reganode(pRExC_state,DEFINEP,0);
5391 RExC_parse +=6 ;
5392 is_define = 1;
5393 goto insert_if_check_paren;
5394 }
5395 else if (RExC_parse[0] == 'R') {
5396 RExC_parse++;
5397 parno = 0;
5398 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5399 parno = atoi(RExC_parse++);
5400 while (isDIGIT(*RExC_parse))
5401 RExC_parse++;
5402 } else if (RExC_parse[0] == '&') {
5403 SV *sv_dat;
5404 RExC_parse++;
5405 sv_dat = reg_scan_name(pRExC_state,
5406 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5407 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5408 }
1a147d38 5409 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5410 goto insert_if_check_paren;
5411 }
830247a4 5412 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5413 /* (?(1)...) */
6136c704 5414 char c;
830247a4 5415 parno = atoi(RExC_parse++);
c277df42 5416
830247a4
IZ
5417 while (isDIGIT(*RExC_parse))
5418 RExC_parse++;
fac92740 5419 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5420
0a4db386 5421 insert_if_check_paren:
830247a4 5422 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5423 vFAIL("Switch condition not recognized");
c277df42 5424 insert_if:
3dab1dad
YO
5425 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5426 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5427 if (br == NULL)
830247a4 5428 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5429 else
3dab1dad 5430 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5431 c = *nextchar(pRExC_state);
d1b80229
IZ
5432 if (flags&HASWIDTH)
5433 *flagp |= HASWIDTH;
c277df42 5434 if (c == '|') {
0a4db386
YO
5435 if (is_define)
5436 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5437 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5438 regbranch(pRExC_state, &flags, 1,depth+1);
5439 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5440 if (flags&HASWIDTH)
5441 *flagp |= HASWIDTH;
830247a4 5442 c = *nextchar(pRExC_state);
a0ed51b3
LW
5443 }
5444 else
c277df42
IZ
5445 lastbr = NULL;
5446 if (c != ')')
8615cb43 5447 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5448 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5449 REGTAIL(pRExC_state, br, ender);
c277df42 5450 if (lastbr) {
3dab1dad
YO
5451 REGTAIL(pRExC_state, lastbr, ender);
5452 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5453 }
5454 else
3dab1dad 5455 REGTAIL(pRExC_state, ret, ender);
c277df42 5456 return ret;
a0ed51b3
LW
5457 }
5458 else {
830247a4 5459 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5460 }
5461 }
1b1626e4 5462 case 0:
830247a4 5463 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5464 vFAIL("Sequence (? incomplete");
1b1626e4 5465 break;
a0d0e21e 5466 default:
cde0cee5
YO
5467 --RExC_parse;
5468 parse_flags: /* (?i) */
5469 {
5470 U32 posflags = 0, negflags = 0;
5471 U32 *flagsp = &posflags;
5472
5473 while (*RExC_parse) {
5474 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
5475 /* (?g), (?gc) and (?o) are useless here
5476 and must be globally applied -- japhy */
cde0cee5
YO
5477 switch (*RExC_parse) {
5478 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5479 case 'o':
5480 case 'g':
9d1d55b5 5481 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5482 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5483 if (! (wastedflags & wflagbit) ) {
5484 wastedflags |= wflagbit;
5485 vWARN5(
5486 RExC_parse + 1,
5487 "Useless (%s%c) - %suse /%c modifier",
5488 flagsp == &negflags ? "?-" : "?",
5489 *RExC_parse,
5490 flagsp == &negflags ? "don't " : "",
5491 *RExC_parse
5492 );
5493 }
5494 }
cde0cee5
YO
5495 break;
5496
5497 case 'c':
9d1d55b5 5498 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5499 if (! (wastedflags & WASTED_C) ) {
5500 wastedflags |= WASTED_GC;
9d1d55b5
JP
5501 vWARN3(
5502 RExC_parse + 1,
5503 "Useless (%sc) - %suse /gc modifier",
5504 flagsp == &negflags ? "?-" : "?",
5505 flagsp == &negflags ? "don't " : ""
5506 );
5507 }
5508 }
cde0cee5
YO
5509 break;
5510 case 'k':
5511 if (flagsp == &negflags) {
5512 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5513 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5514 } else {
5515 *flagsp |= RXf_PMf_KEEPCOPY;
5516 }
5517 break;
5518 case '-':
57b84237
YO
5519 if (flagsp == &negflags) {
5520 RExC_parse++;
5521 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5522 /*NOTREACHED*/
5523 }
cde0cee5
YO
5524 flagsp = &negflags;
5525 wastedflags = 0; /* reset so (?g-c) warns twice */
5526 break;
5527 case ':':
5528 paren = ':';
5529 /*FALLTHROUGH*/
5530 case ')':
5531 RExC_flags |= posflags;
5532 RExC_flags &= ~negflags;
5533 nextchar(pRExC_state);
5534 if (paren != ':') {
5535 *flagp = TRYAGAIN;
5536 return NULL;
5537 } else {
5538 ret = NULL;
5539 goto parse_rest;
5540 }
5541 /*NOTREACHED*/
5542 default:
cde0cee5
YO
5543 RExC_parse++;
5544 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5545 /*NOTREACHED*/
5546 }
830247a4 5547 ++RExC_parse;
48c036b1 5548 }
cde0cee5 5549 }} /* one for the default block, one for the switch */
a0d0e21e 5550 }
fac92740 5551 else { /* (...) */
81714fb9 5552 capturing_parens:
830247a4
IZ
5553 parno = RExC_npar;
5554 RExC_npar++;
e2e6a0f1 5555
830247a4 5556 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5557 if (!SIZE_ONLY ){
5558 if (!RExC_nestroot)
5559 RExC_nestroot = parno;
5560 if (RExC_seen & REG_SEEN_RECURSE) {
5561 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5562 "Setting open paren #%"IVdf" to %d\n",
5563 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5564 RExC_open_parens[parno-1]= ret;
5565 }
6bda09f9 5566 }
fac92740
MJD
5567 Set_Node_Length(ret, 1); /* MJD */
5568 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5569 is_open = 1;
a0d0e21e 5570 }
a0ed51b3 5571 }
fac92740 5572 else /* ! paren */
a0d0e21e 5573 ret = NULL;
cde0cee5
YO
5574
5575 parse_rest:
a0d0e21e 5576 /* Pick up the branches, linking them together. */
fac92740 5577 parse_start = RExC_parse; /* MJD */
3dab1dad 5578 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5579 /* branch_len = (paren != 0); */
2af232bd 5580
a0d0e21e
LW
5581 if (br == NULL)
5582 return(NULL);
830247a4
IZ
5583 if (*RExC_parse == '|') {
5584 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5585 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5586 }
fac92740 5587 else { /* MJD */
6bda09f9 5588 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5589 Set_Node_Length(br, paren != 0);
5590 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5591 }
c277df42
IZ
5592 have_branch = 1;
5593 if (SIZE_ONLY)
830247a4 5594 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5595 }
5596 else if (paren == ':') {
c277df42
IZ
5597 *flagp |= flags&SIMPLE;
5598 }
6136c704 5599 if (is_open) { /* Starts with OPEN. */
3dab1dad 5600 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5601 }
5602 else if (paren != '?') /* Not Conditional */
a0d0e21e 5603 ret = br;
32a0ca98 5604 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5605 lastbr = br;
830247a4
IZ
5606 while (*RExC_parse == '|') {
5607 if (!SIZE_ONLY && RExC_extralen) {
5608 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5609 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5610 }
5611 if (SIZE_ONLY)
830247a4
IZ
5612 RExC_extralen += 2; /* Account for LONGJMP. */
5613 nextchar(pRExC_state);
3dab1dad 5614 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5615
a687059c 5616 if (br == NULL)
a0d0e21e 5617 return(NULL);
3dab1dad 5618 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5619 lastbr = br;
821b33a5
IZ
5620 if (flags&HASWIDTH)
5621 *flagp |= HASWIDTH;
a687059c 5622 *flagp |= flags&SPSTART;
a0d0e21e
LW
5623 }
5624
c277df42
IZ
5625 if (have_branch || paren != ':') {
5626 /* Make a closing node, and hook it on the end. */
5627 switch (paren) {
5628 case ':':
830247a4 5629 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5630 break;
5631 case 1:
830247a4 5632 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5633 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5634 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5635 "Setting close paren #%"IVdf" to %d\n",
5636 (IV)parno, REG_NODE_NUM(ender)));
5637 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5638 if (RExC_nestroot == parno)
5639 RExC_nestroot = 0;
40d049e4 5640 }
fac92740
MJD
5641 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5642 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5643 break;
5644 case '<':
c277df42
IZ
5645 case ',':
5646 case '=':
5647 case '!':
c277df42 5648 *flagp &= ~HASWIDTH;
821b33a5
IZ
5649 /* FALL THROUGH */
5650 case '>':
830247a4 5651 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5652 break;
5653 case 0:
830247a4 5654 ender = reg_node(pRExC_state, END);
40d049e4
YO
5655 if (!SIZE_ONLY) {
5656 assert(!RExC_opend); /* there can only be one! */
5657 RExC_opend = ender;
5658 }
c277df42
IZ
5659 break;
5660 }
eaf3ca90 5661 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5662
9674d46a 5663 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5664 if (depth==1)
5665 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5666
c277df42 5667 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5668 for (br = ret; br; br = regnext(br)) {
5669 const U8 op = PL_regkind[OP(br)];
5670 if (op == BRANCH) {
07be1b83 5671 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5672 }
5673 else if (op == BRANCHJ) {
07be1b83 5674 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5675 }
c277df42
IZ
5676 }
5677 }
a0d0e21e 5678 }
c277df42
IZ
5679
5680 {
e1ec3a88
AL
5681 const char *p;
5682 static const char parens[] = "=!<,>";
c277df42
IZ
5683
5684 if (paren && (p = strchr(parens, paren))) {
eb160463 5685 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5686 int flag = (p - parens) > 1;
5687
5688 if (paren == '>')
5689 node = SUSPEND, flag = 0;
6bda09f9 5690 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5691 Set_Node_Cur_Length(ret);
5692 Set_Node_Offset(ret, parse_start + 1);
c277df42 5693 ret->flags = flag;
07be1b83 5694 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5695 }
a0d0e21e
LW
5696 }
5697
5698 /* Check for proper termination. */
ce3e6498 5699 if (paren) {
e2509266 5700 RExC_flags = oregflags;
830247a4
IZ
5701 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5702 RExC_parse = oregcomp_parse;
380a0633 5703 vFAIL("Unmatched (");
ce3e6498 5704 }
a0ed51b3 5705 }
830247a4
IZ
5706 else if (!paren && RExC_parse < RExC_end) {
5707 if (*RExC_parse == ')') {
5708 RExC_parse++;
380a0633 5709 vFAIL("Unmatched )");
a0ed51b3
LW
5710 }
5711 else
b45f050a 5712 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5713 /* NOTREACHED */
5714 }
a687059c 5715
a0d0e21e 5716 return(ret);
a687059c
LW
5717}
5718
5719/*
5720 - regbranch - one alternative of an | operator
5721 *
5722 * Implements the concatenation operator.
5723 */
76e3520e 5724STATIC regnode *
3dab1dad 5725S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5726{
97aff369 5727 dVAR;
c277df42
IZ
5728 register regnode *ret;
5729 register regnode *chain = NULL;
5730 register regnode *latest;
5731 I32 flags = 0, c = 0;
3dab1dad
YO
5732 GET_RE_DEBUG_FLAGS_DECL;
5733 DEBUG_PARSE("brnc");
b81d288d 5734 if (first)
c277df42
IZ
5735 ret = NULL;
5736 else {
b81d288d 5737 if (!SIZE_ONLY && RExC_extralen)
830247a4 5738 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5739 else {
830247a4 5740 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5741 Set_Node_Length(ret, 1);
5742 }
c277df42
IZ
5743 }
5744
b81d288d 5745 if (!first && SIZE_ONLY)
830247a4 5746 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5747
c277df42 5748 *flagp = WORST; /* Tentatively. */
a0d0e21e 5749
830247a4
IZ
5750 RExC_parse--;
5751 nextchar(pRExC_state);
5752 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5753 flags &= ~TRYAGAIN;
3dab1dad 5754 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5755 if (latest == NULL) {
5756 if (flags & TRYAGAIN)
5757 continue;
5758 return(NULL);
a0ed51b3
LW
5759 }
5760 else if (ret == NULL)
c277df42 5761 ret = latest;
a0d0e21e 5762 *flagp |= flags&HASWIDTH;
c277df42 5763 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5764 *flagp |= flags&SPSTART;
5765 else {
830247a4 5766 RExC_naughty++;
3dab1dad 5767 REGTAIL(pRExC_state, chain, latest);
a687059c 5768 }
a0d0e21e 5769 chain = latest;
c277df42
IZ
5770 c++;
5771 }
5772 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5773 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5774 if (ret == NULL)
5775 ret = chain;
5776 }
5777 if (c == 1) {
5778 *flagp |= flags&SIMPLE;
a0d0e21e 5779 }
a687059c 5780
d4c19fe8 5781 return ret;
a687059c
LW
5782}
5783
5784/*
5785 - regpiece - something followed by possible [*+?]
5786 *
5787 * Note that the branching code sequences used for ? and the general cases
5788 * of * and + are somewhat optimized: they use the same NOTHING node as
5789 * both the endmarker for their branch list and the body of the last branch.
5790 * It might seem that this node could be dispensed with entirely, but the
5791 * endmarker role is not redundant.
5792 */
76e3520e 5793STATIC regnode *
3dab1dad 5794S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5795{
97aff369 5796 dVAR;
c277df42 5797 register regnode *ret;
a0d0e21e
LW
5798 register char op;
5799 register char *next;
5800 I32 flags;
1df70142 5801 const char * const origparse = RExC_parse;
a0d0e21e 5802 I32 min;
c277df42 5803 I32 max = REG_INFTY;
fac92740 5804 char *parse_start;
10edeb5d 5805 const char *maxpos = NULL;
3dab1dad
YO
5806 GET_RE_DEBUG_FLAGS_DECL;
5807 DEBUG_PARSE("piec");
a0d0e21e 5808
3dab1dad 5809 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5810 if (ret == NULL) {
5811 if (flags & TRYAGAIN)
5812 *flagp |= TRYAGAIN;
5813 return(NULL);
5814 }
5815
830247a4 5816 op = *RExC_parse;
a0d0e21e 5817
830247a4 5818 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5819 maxpos = NULL;
fac92740 5820 parse_start = RExC_parse; /* MJD */
830247a4 5821 next = RExC_parse + 1;
a0d0e21e
LW
5822 while (isDIGIT(*next) || *next == ',') {
5823 if (*next == ',') {
5824 if (maxpos)
5825 break;
5826 else
5827 maxpos = next;
a687059c 5828 }
a0d0e21e
LW
5829 next++;
5830 }
5831 if (*next == '}') { /* got one */
5832 if (!maxpos)
5833 maxpos = next;
830247a4
IZ
5834 RExC_parse++;
5835 min = atoi(RExC_parse);
a0d0e21e
LW
5836 if (*maxpos == ',')
5837 maxpos++;
5838 else
830247a4 5839 maxpos = RExC_parse;
a0d0e21e
LW
5840 max = atoi(maxpos);
5841 if (!max && *maxpos != '0')
c277df42
IZ
5842 max = REG_INFTY; /* meaning "infinity" */
5843 else if (max >= REG_INFTY)
8615cb43 5844 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5845 RExC_parse = next;
5846 nextchar(pRExC_state);
a0d0e21e
LW
5847
5848 do_curly:
5849 if ((flags&SIMPLE)) {
830247a4 5850 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5851 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5852 Set_Node_Offset(ret, parse_start+1); /* MJD */
5853 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5854 }
5855 else {
3dab1dad 5856 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5857
5858 w->flags = 0;
3dab1dad 5859 REGTAIL(pRExC_state, ret, w);
830247a4 5860 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5861 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5862 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5863 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5864 }
6bda09f9 5865 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5866 /* MJD hk */
5867 Set_Node_Offset(ret, parse_start+1);
2af232bd 5868 Set_Node_Length(ret,
fac92740 5869 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5870
830247a4 5871 if (!SIZE_ONLY && RExC_extralen)
c277df42 5872 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5873 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5874 if (SIZE_ONLY)
830247a4
IZ
5875 RExC_whilem_seen++, RExC_extralen += 3;
5876 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5877 }
c277df42 5878 ret->flags = 0;
a0d0e21e
LW
5879
5880 if (min > 0)
821b33a5
IZ
5881 *flagp = WORST;
5882 if (max > 0)
5883 *flagp |= HASWIDTH;
a0d0e21e 5884 if (max && max < min)
8615cb43 5885 vFAIL("Can't do {n,m} with n > m");
c277df42 5886 if (!SIZE_ONLY) {
eb160463
GS
5887 ARG1_SET(ret, (U16)min);
5888 ARG2_SET(ret, (U16)max);
a687059c 5889 }
a687059c 5890
a0d0e21e 5891 goto nest_check;
a687059c 5892 }
a0d0e21e 5893 }
a687059c 5894
a0d0e21e
LW
5895 if (!ISMULT1(op)) {
5896 *flagp = flags;
a687059c 5897 return(ret);
a0d0e21e 5898 }
bb20fd44 5899
c277df42 5900#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5901
5902 /* if this is reinstated, don't forget to put this back into perldiag:
5903
5904 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5905
5906 (F) The part of the regexp subject to either the * or + quantifier
5907 could match an empty string. The {#} shows in the regular
5908 expression about where the problem was discovered.
5909
5910 */
5911
bb20fd44 5912 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5913 vFAIL("Regexp *+ operand could be empty");
b81d288d 5914#endif
bb20fd44 5915
fac92740 5916 parse_start = RExC_parse;
830247a4 5917 nextchar(pRExC_state);
a0d0e21e 5918
821b33a5 5919 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5920
5921 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5922 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5923 ret->flags = 0;
830247a4 5924 RExC_naughty += 4;
a0d0e21e
LW
5925 }
5926 else if (op == '*') {
5927 min = 0;
5928 goto do_curly;
a0ed51b3
LW
5929 }
5930 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5931 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5932 ret->flags = 0;
830247a4 5933 RExC_naughty += 3;
a0d0e21e
LW
5934 }
5935 else if (op == '+') {
5936 min = 1;
5937 goto do_curly;
a0ed51b3
LW
5938 }
5939 else if (op == '?') {
a0d0e21e
LW
5940 min = 0; max = 1;
5941 goto do_curly;
5942 }
5943 nest_check:
041457d9 5944 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5945 vWARN3(RExC_parse,
b45f050a 5946 "%.*s matches null string many times",
afd78fd5 5947 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5948 origparse);
a0d0e21e
LW
5949 }
5950
b9b4dddf 5951 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5952 nextchar(pRExC_state);
6bda09f9 5953 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5954 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5955 }
b9b4dddf
YO
5956#ifndef REG_ALLOW_MINMOD_SUSPEND
5957 else
5958#endif
5959 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5960 regnode *ender;
5961 nextchar(pRExC_state);
5962 ender = reg_node(pRExC_state, SUCCEED);
5963 REGTAIL(pRExC_state, ret, ender);
5964 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5965 ret->flags = 0;
5966 ender = reg_node(pRExC_state, TAIL);
5967 REGTAIL(pRExC_state, ret, ender);
5968 /*ret= ender;*/
5969 }
5970
5971 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5972 RExC_parse++;
b45f050a
JF
5973 vFAIL("Nested quantifiers");
5974 }
a0d0e21e
LW
5975
5976 return(ret);
a687059c
LW
5977}
5978
fc8cd66c
YO
5979
5980/* reg_namedseq(pRExC_state,UVp)
5981
5982 This is expected to be called by a parser routine that has
5983 recognized'\N' and needs to handle the rest. RExC_parse is
5984 expected to point at the first char following the N at the time
5985 of the call.
5986
5987 If valuep is non-null then it is assumed that we are parsing inside
5988 of a charclass definition and the first codepoint in the resolved
5989 string is returned via *valuep and the routine will return NULL.
5990 In this mode if a multichar string is returned from the charnames
5991 handler a warning will be issued, and only the first char in the
5992 sequence will be examined. If the string returned is zero length
5993 then the value of *valuep is undefined and NON-NULL will
5994 be returned to indicate failure. (This will NOT be a valid pointer
5995 to a regnode.)
5996
5997 If value is null then it is assumed that we are parsing normal text
5998 and inserts a new EXACT node into the program containing the resolved
5999 string and returns a pointer to the new node. If the string is
6000 zerolength a NOTHING node is emitted.
6001
6002 On success RExC_parse is set to the char following the endbrace.
6003 Parsing failures will generate a fatal errorvia vFAIL(...)
6004
6005 NOTE: We cache all results from the charnames handler locally in
6006 the RExC_charnames hash (created on first use) to prevent a charnames
6007 handler from playing silly-buggers and returning a short string and
6008 then a long string for a given pattern. Since the regexp program
6009 size is calculated during an initial parse this would result
6010 in a buffer overrun so we cache to prevent the charname result from
6011 changing during the course of the parse.
6012
6013 */
6014STATIC regnode *
6015S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6016{
6017 char * name; /* start of the content of the name */
6018 char * endbrace; /* endbrace following the name */
6019 SV *sv_str = NULL;
6020 SV *sv_name = NULL;
6021 STRLEN len; /* this has various purposes throughout the code */
6022 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6023 regnode *ret = NULL;
6024
6025 if (*RExC_parse != '{') {
6026 vFAIL("Missing braces on \\N{}");
6027 }
6028 name = RExC_parse+1;
6029 endbrace = strchr(RExC_parse, '}');
6030 if ( ! endbrace ) {
6031 RExC_parse++;
6032 vFAIL("Missing right brace on \\N{}");
6033 }
6034 RExC_parse = endbrace + 1;
6035
6036
6037 /* RExC_parse points at the beginning brace,
6038 endbrace points at the last */
6039 if ( name[0]=='U' && name[1]=='+' ) {
6040 /* its a "unicode hex" notation {U+89AB} */
6041 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6042 | PERL_SCAN_DISALLOW_PREFIX
6043 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6044 UV cp;
196f1508 6045 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6046 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6047 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6048 cp = 0xFFFD;
6049 }
6050 if (cp > 0xff)
6051 RExC_utf8 = 1;
6052 if ( valuep ) {
6053 *valuep = cp;
6054 return NULL;
6055 }
6056 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6057 } else {
6058 /* fetch the charnames handler for this scope */
6059 HV * const table = GvHV(PL_hintgv);
6060 SV **cvp= table ?
6061 hv_fetchs(table, "charnames", FALSE) :
6062 NULL;
6063 SV *cv= cvp ? *cvp : NULL;
6064 HE *he_str;
6065 int count;
6066 /* create an SV with the name as argument */
6067 sv_name = newSVpvn(name, endbrace - name);
6068
6069 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6070 vFAIL2("Constant(\\N{%s}) unknown: "
6071 "(possibly a missing \"use charnames ...\")",
6072 SvPVX(sv_name));
6073 }
6074 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6075 vFAIL2("Constant(\\N{%s}): "
6076 "$^H{charnames} is not defined",SvPVX(sv_name));
6077 }
6078
6079
6080
6081 if (!RExC_charnames) {
6082 /* make sure our cache is allocated */
6083 RExC_charnames = newHV();
6bda09f9 6084 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6085 }
6086 /* see if we have looked this one up before */
6087 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6088 if ( he_str ) {
6089 sv_str = HeVAL(he_str);
6090 cached = 1;
6091 } else {
6092 dSP ;
6093
6094 ENTER ;
6095 SAVETMPS ;
6096 PUSHMARK(SP) ;
6097
6098 XPUSHs(sv_name);
6099
6100 PUTBACK ;
6101
6102 count= call_sv(cv, G_SCALAR);
6103
6104 if (count == 1) { /* XXXX is this right? dmq */
6105 sv_str = POPs;
6106 SvREFCNT_inc_simple_void(sv_str);
6107 }
6108
6109 SPAGAIN ;
6110 PUTBACK ;
6111 FREETMPS ;
6112 LEAVE ;
6113
6114 if ( !sv_str || !SvOK(sv_str) ) {
6115 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6116 "did not return a defined value",SvPVX(sv_name));
6117 }
6118 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6119 cached = 1;
6120 }
6121 }
6122 if (valuep) {
6123 char *p = SvPV(sv_str, len);
6124 if (len) {
6125 STRLEN numlen = 1;
6126 if ( SvUTF8(sv_str) ) {
196f1508 6127 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6128 if (*valuep > 0x7F)
6129 RExC_utf8 = 1;
6130 /* XXXX
6131 We have to turn on utf8 for high bit chars otherwise
6132 we get failures with
6133
6134 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6135 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6136
6137 This is different from what \x{} would do with the same
6138 codepoint, where the condition is > 0xFF.
6139 - dmq
6140 */
6141
6142
6143 } else {
6144 *valuep = (UV)*p;
6145 /* warn if we havent used the whole string? */
6146 }
6147 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6148 vWARN2(RExC_parse,
6149 "Ignoring excess chars from \\N{%s} in character class",
6150 SvPVX(sv_name)
6151 );
6152 }
6153 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6154 vWARN2(RExC_parse,
6155 "Ignoring zero length \\N{%s} in character class",
6156 SvPVX(sv_name)
6157 );
6158 }
6159 if (sv_name)
6160 SvREFCNT_dec(sv_name);
6161 if (!cached)
6162 SvREFCNT_dec(sv_str);
6163 return len ? NULL : (regnode *)&len;
6164 } else if(SvCUR(sv_str)) {
6165
6166 char *s;
6167 char *p, *pend;
6168 STRLEN charlen = 1;
d008bc60 6169#ifdef DEBUGGING
fc8cd66c 6170 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6171#endif
fc8cd66c
YO
6172 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6173
6174 ret = reg_node(pRExC_state,
6175 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6176 s= STRING(ret);
6177
6178 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6179 sv_utf8_upgrade(sv_str);
6180 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6181 RExC_utf8= 1;
6182 }
6183
6184 p = SvPV(sv_str, len);
6185 pend = p + len;
6186 /* len is the length written, charlen is the size the char read */
6187 for ( len = 0; p < pend; p += charlen ) {
6188 if (UTF) {
196f1508 6189 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6190 if (FOLD) {
6191 STRLEN foldlen,numlen;
6192 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6193 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6194 /* Emit all the Unicode characters. */
6195
6196 for (foldbuf = tmpbuf;
6197 foldlen;
6198 foldlen -= numlen)
6199 {
6200 uvc = utf8_to_uvchr(foldbuf, &numlen);
6201 if (numlen > 0) {
6202 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6203 s += unilen;
6204 len += unilen;
6205 /* In EBCDIC the numlen
6206 * and unilen can differ. */
6207 foldbuf += numlen;
6208 if (numlen >= foldlen)
6209 break;
6210 }
6211 else
6212 break; /* "Can't happen." */
6213 }
6214 } else {
6215 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6216 if (unilen > 0) {
6217 s += unilen;
6218 len += unilen;
6219 }
6220 }
6221 } else {
6222 len++;
6223 REGC(*p, s++);
6224 }
6225 }
6226 if (SIZE_ONLY) {
6227 RExC_size += STR_SZ(len);
6228 } else {
6229 STR_LEN(ret) = len;
6230 RExC_emit += STR_SZ(len);
6231 }
6232 Set_Node_Cur_Length(ret); /* MJD */
6233 RExC_parse--;
6234 nextchar(pRExC_state);
6235 } else {
6236 ret = reg_node(pRExC_state,NOTHING);
6237 }
6238 if (!cached) {
6239 SvREFCNT_dec(sv_str);
6240 }
6241 if (sv_name) {
6242 SvREFCNT_dec(sv_name);
6243 }
6244 return ret;
6245
6246}
6247
6248
9e08bc66
TS
6249/*
6250 * reg_recode
6251 *
6252 * It returns the code point in utf8 for the value in *encp.
6253 * value: a code value in the source encoding
6254 * encp: a pointer to an Encode object
6255 *
6256 * If the result from Encode is not a single character,
6257 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6258 */
6259STATIC UV
6260S_reg_recode(pTHX_ const char value, SV **encp)
6261{
6262 STRLEN numlen = 1;
6263 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6264 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6265 : SvPVX(sv);
6266 const STRLEN newlen = SvCUR(sv);
6267 UV uv = UNICODE_REPLACEMENT;
6268
6269 if (newlen)
6270 uv = SvUTF8(sv)
6271 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6272 : *(U8*)s;
6273
6274 if (!newlen || numlen != newlen) {
6275 uv = UNICODE_REPLACEMENT;
6276 if (encp)
6277 *encp = NULL;
6278 }
6279 return uv;
6280}
6281
fc8cd66c 6282
a687059c
LW
6283/*
6284 - regatom - the lowest level
ee9b8eae
YO
6285
6286 Try to identify anything special at the start of the pattern. If there
6287 is, then handle it as required. This may involve generating a single regop,
6288 such as for an assertion; or it may involve recursing, such as to
6289 handle a () structure.
6290
6291 If the string doesn't start with something special then we gobble up
6292 as much literal text as we can.
6293
6294 Once we have been able to handle whatever type of thing started the
6295 sequence, we return.
6296
6297 Note: we have to be careful with escapes, as they can be both literal
6298 and special, and in the case of \10 and friends can either, depending
6299 on context. Specifically there are two seperate switches for handling
6300 escape sequences, with the one for handling literal escapes requiring
6301 a dummy entry for all of the special escapes that are actually handled
6302 by the other.
6303*/
6304
76e3520e 6305STATIC regnode *
3dab1dad 6306S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6307{
97aff369 6308 dVAR;
cbbf8932 6309 register regnode *ret = NULL;
a0d0e21e 6310 I32 flags;
45948336 6311 char *parse_start = RExC_parse;
3dab1dad
YO
6312 GET_RE_DEBUG_FLAGS_DECL;
6313 DEBUG_PARSE("atom");
a0d0e21e
LW
6314 *flagp = WORST; /* Tentatively. */
6315
ee9b8eae 6316
a0d0e21e 6317tryagain:
830247a4 6318 switch (*RExC_parse) {
a0d0e21e 6319 case '^':
830247a4
IZ
6320 RExC_seen_zerolen++;
6321 nextchar(pRExC_state);
bbe252da 6322 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6323 ret = reg_node(pRExC_state, MBOL);
bbe252da 6324 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6325 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6326 else
830247a4 6327 ret = reg_node(pRExC_state, BOL);
fac92740 6328 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6329 break;
6330 case '$':
830247a4 6331 nextchar(pRExC_state);
b81d288d 6332 if (*RExC_parse)
830247a4 6333 RExC_seen_zerolen++;
bbe252da 6334 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6335 ret = reg_node(pRExC_state, MEOL);
bbe252da 6336 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6337 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6338 else
830247a4 6339 ret = reg_node(pRExC_state, EOL);
fac92740 6340 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6341 break;
6342 case '.':
830247a4 6343 nextchar(pRExC_state);
bbe252da 6344 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6345 ret = reg_node(pRExC_state, SANY);
6346 else
6347 ret = reg_node(pRExC_state, REG_ANY);
6348 *flagp |= HASWIDTH|SIMPLE;
830247a4 6349 RExC_naughty++;
fac92740 6350 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6351 break;
6352 case '[':
b45f050a 6353 {
3dab1dad
YO
6354 char * const oregcomp_parse = ++RExC_parse;
6355 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6356 if (*RExC_parse != ']') {
6357 RExC_parse = oregcomp_parse;
b45f050a
JF
6358 vFAIL("Unmatched [");
6359 }
830247a4 6360 nextchar(pRExC_state);
a0d0e21e 6361 *flagp |= HASWIDTH|SIMPLE;
fac92740 6362 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6363 break;
b45f050a 6364 }
a0d0e21e 6365 case '(':
830247a4 6366 nextchar(pRExC_state);
3dab1dad 6367 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6368 if (ret == NULL) {
bf93d4cc 6369 if (flags & TRYAGAIN) {
830247a4 6370 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6371 /* Make parent create an empty node if needed. */
6372 *flagp |= TRYAGAIN;
6373 return(NULL);
6374 }
a0d0e21e 6375 goto tryagain;
bf93d4cc 6376 }
a0d0e21e
LW
6377 return(NULL);
6378 }
c277df42 6379 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6380 break;
6381 case '|':
6382 case ')':
6383 if (flags & TRYAGAIN) {
6384 *flagp |= TRYAGAIN;
6385 return NULL;
6386 }
b45f050a 6387 vFAIL("Internal urp");
a0d0e21e
LW
6388 /* Supposed to be caught earlier. */
6389 break;
85afd4ae 6390 case '{':
830247a4
IZ
6391 if (!regcurly(RExC_parse)) {
6392 RExC_parse++;
85afd4ae
CS
6393 goto defchar;
6394 }
6395 /* FALL THROUGH */
a0d0e21e
LW
6396 case '?':
6397 case '+':
6398 case '*':
830247a4 6399 RExC_parse++;
b45f050a 6400 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6401 break;
6402 case '\\':
ee9b8eae
YO
6403 /* Special Escapes
6404
6405 This switch handles escape sequences that resolve to some kind
6406 of special regop and not to literal text. Escape sequnces that
6407 resolve to literal text are handled below in the switch marked
6408 "Literal Escapes".
6409
6410 Every entry in this switch *must* have a corresponding entry
6411 in the literal escape switch. However, the opposite is not
6412 required, as the default for this switch is to jump to the
6413 literal text handling code.
6414 */
830247a4 6415 switch (*++RExC_parse) {
ee9b8eae 6416 /* Special Escapes */
a0d0e21e 6417 case 'A':
830247a4
IZ
6418 RExC_seen_zerolen++;
6419 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6420 *flagp |= SIMPLE;
ee9b8eae 6421 goto finish_meta_pat;
a0d0e21e 6422 case 'G':
830247a4
IZ
6423 ret = reg_node(pRExC_state, GPOS);
6424 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6425 *flagp |= SIMPLE;
ee9b8eae
YO
6426 goto finish_meta_pat;
6427 case 'K':
6428 RExC_seen_zerolen++;
6429 ret = reg_node(pRExC_state, KEEPS);
6430 *flagp |= SIMPLE;
6431 goto finish_meta_pat;
a0d0e21e 6432 case 'Z':
830247a4 6433 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6434 *flagp |= SIMPLE;
a1917ab9 6435 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6436 goto finish_meta_pat;
b85d18e9 6437 case 'z':
830247a4 6438 ret = reg_node(pRExC_state, EOS);
b85d18e9 6439 *flagp |= SIMPLE;
830247a4 6440 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6441 goto finish_meta_pat;
4a2d328f 6442 case 'C':
f33976b4
DB
6443 ret = reg_node(pRExC_state, CANY);
6444 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6445 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6446 goto finish_meta_pat;
a0ed51b3 6447 case 'X':
830247a4 6448 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6449 *flagp |= HASWIDTH;
ee9b8eae 6450 goto finish_meta_pat;
a0d0e21e 6451 case 'w':
eb160463 6452 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6453 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6454 goto finish_meta_pat;
a0d0e21e 6455 case 'W':
eb160463 6456 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6457 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6458 goto finish_meta_pat;
a0d0e21e 6459 case 'b':
830247a4
IZ
6460 RExC_seen_zerolen++;
6461 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6462 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6463 *flagp |= SIMPLE;
ee9b8eae 6464 goto finish_meta_pat;
a0d0e21e 6465 case 'B':
830247a4
IZ
6466 RExC_seen_zerolen++;
6467 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6468 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6469 *flagp |= SIMPLE;
ee9b8eae 6470 goto finish_meta_pat;
a0d0e21e 6471 case 's':
eb160463 6472 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6473 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6474 goto finish_meta_pat;
a0d0e21e 6475 case 'S':
eb160463 6476 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6477 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6478 goto finish_meta_pat;
a0d0e21e 6479 case 'd':
ffc61ed2 6480 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6481 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6482 goto finish_meta_pat;
a0d0e21e 6483 case 'D':
ffc61ed2 6484 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6485 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
6486 goto finish_meta_pat;
6487 case 'v':
6488 ret = reganode(pRExC_state, PRUNE, 0);
6489 ret->flags = 1;
6490 *flagp |= SIMPLE;
6491 goto finish_meta_pat;
6492 case 'V':
6493 ret = reganode(pRExC_state, SKIP, 0);
6494 ret->flags = 1;
6495 *flagp |= SIMPLE;
6496 finish_meta_pat:
830247a4 6497 nextchar(pRExC_state);
fac92740 6498 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 6499 break;
a14b48bc
LW
6500 case 'p':
6501 case 'P':
3568d838 6502 {
3dab1dad 6503 char* const oldregxend = RExC_end;
d008bc60 6504#ifdef DEBUGGING
ccb2c380 6505 char* parse_start = RExC_parse - 2;
d008bc60 6506#endif
a14b48bc 6507
830247a4 6508 if (RExC_parse[1] == '{') {
3568d838 6509 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6510 RExC_end = strchr(RExC_parse, '}');
6511 if (!RExC_end) {
3dab1dad 6512 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6513 RExC_parse += 2;
6514 RExC_end = oldregxend;
0da60cf5 6515 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6516 }
830247a4 6517 RExC_end++;
a14b48bc 6518 }
af6f566e 6519 else {
830247a4 6520 RExC_end = RExC_parse + 2;
af6f566e
HS
6521 if (RExC_end > oldregxend)
6522 RExC_end = oldregxend;
6523 }
830247a4 6524 RExC_parse--;
a14b48bc 6525
3dab1dad 6526 ret = regclass(pRExC_state,depth+1);
a14b48bc 6527
830247a4
IZ
6528 RExC_end = oldregxend;
6529 RExC_parse--;
ccb2c380
MP
6530
6531 Set_Node_Offset(ret, parse_start + 2);
6532 Set_Node_Cur_Length(ret);
830247a4 6533 nextchar(pRExC_state);
a14b48bc
LW
6534 *flagp |= HASWIDTH|SIMPLE;
6535 }
6536 break;
fc8cd66c
YO
6537 case 'N':
6538 /* Handle \N{NAME} here and not below because it can be
6539 multicharacter. join_exact() will join them up later on.
6540 Also this makes sure that things like /\N{BLAH}+/ and
6541 \N{BLAH} being multi char Just Happen. dmq*/
6542 ++RExC_parse;
6543 ret= reg_namedseq(pRExC_state, NULL);
6544 break;
0a4db386 6545 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 6546 parse_named_seq:
81714fb9
YO
6547 {
6548 char ch= RExC_parse[1];
1f1031fe
YO
6549 if (ch != '<' && ch != '\'' && ch != '{') {
6550 RExC_parse++;
6551 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 6552 } else {
1f1031fe
YO
6553 /* this pretty much dupes the code for (?P=...) in reg(), if
6554 you change this make sure you change that */
81714fb9 6555 char* name_start = (RExC_parse += 2);
2eccd3b2 6556 U32 num = 0;
0a4db386
YO
6557 SV *sv_dat = reg_scan_name(pRExC_state,
6558 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 6559 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 6560 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
6561 vFAIL2("Sequence %.3s... not terminated",parse_start);
6562
6563 if (!SIZE_ONLY) {
6564 num = add_data( pRExC_state, 1, "S" );
6565 RExC_rxi->data->data[num]=(void*)sv_dat;
6566 SvREFCNT_inc(sv_dat);
6567 }
6568
81714fb9
YO
6569 RExC_sawback = 1;
6570 ret = reganode(pRExC_state,
6571 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6572 num);
6573 *flagp |= HASWIDTH;
1f1031fe 6574
81714fb9
YO
6575 /* override incorrect value set in reganode MJD */
6576 Set_Node_Offset(ret, parse_start+1);
6577 Set_Node_Cur_Length(ret); /* MJD */
6578 nextchar(pRExC_state);
1f1031fe 6579
81714fb9
YO
6580 }
6581 break;
1f1031fe 6582 }
2bf803e2 6583 case 'g':
a0d0e21e
LW
6584 case '1': case '2': case '3': case '4':
6585 case '5': case '6': case '7': case '8': case '9':
6586 {
c74340f9 6587 I32 num;
2bf803e2
YO
6588 bool isg = *RExC_parse == 'g';
6589 bool isrel = 0;
6590 bool hasbrace = 0;
6591 if (isg) {
c74340f9 6592 RExC_parse++;
2bf803e2
YO
6593 if (*RExC_parse == '{') {
6594 RExC_parse++;
6595 hasbrace = 1;
6596 }
6597 if (*RExC_parse == '-') {
6598 RExC_parse++;
6599 isrel = 1;
6600 }
1f1031fe
YO
6601 if (hasbrace && !isDIGIT(*RExC_parse)) {
6602 if (isrel) RExC_parse--;
6603 RExC_parse -= 2;
6604 goto parse_named_seq;
6605 } }
c74340f9
YO
6606 num = atoi(RExC_parse);
6607 if (isrel) {
5624f11d 6608 num = RExC_npar - num;
c74340f9
YO
6609 if (num < 1)
6610 vFAIL("Reference to nonexistent or unclosed group");
6611 }
2bf803e2 6612 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6613 goto defchar;
6614 else {
3dab1dad 6615 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6616 while (isDIGIT(*RExC_parse))
6617 RExC_parse++;
1f1031fe
YO
6618 if (parse_start == RExC_parse - 1)
6619 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
6620 if (hasbrace) {
6621 if (*RExC_parse != '}')
6622 vFAIL("Unterminated \\g{...} pattern");
6623 RExC_parse++;
6624 }
c74340f9
YO
6625 if (!SIZE_ONLY) {
6626 if (num > (I32)RExC_rx->nparens)
6627 vFAIL("Reference to nonexistent group");
c74340f9 6628 }
830247a4 6629 RExC_sawback = 1;
eb160463
GS
6630 ret = reganode(pRExC_state,
6631 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6632 num);
a0d0e21e 6633 *flagp |= HASWIDTH;
2af232bd 6634
fac92740 6635 /* override incorrect value set in reganode MJD */
2af232bd 6636 Set_Node_Offset(ret, parse_start+1);
fac92740 6637 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6638 RExC_parse--;
6639 nextchar(pRExC_state);
a0d0e21e
LW
6640 }
6641 }
6642 break;
6643 case '\0':
830247a4 6644 if (RExC_parse >= RExC_end)
b45f050a 6645 FAIL("Trailing \\");
a0d0e21e
LW
6646 /* FALL THROUGH */
6647 default:
a0288114 6648 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6649 back into the quick-grab loop below */
45948336 6650 parse_start--;
a0d0e21e
LW
6651 goto defchar;
6652 }
6653 break;
4633a7c4
LW
6654
6655 case '#':
bbe252da 6656 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6657 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6658 RExC_parse++;
830247a4 6659 if (RExC_parse < RExC_end)
4633a7c4
LW
6660 goto tryagain;
6661 }
6662 /* FALL THROUGH */
6663
a0d0e21e 6664 default: {
ba210ebe 6665 register STRLEN len;
58ae7d3f 6666 register UV ender;
a0d0e21e 6667 register char *p;
3dab1dad 6668 char *s;
80aecb99 6669 STRLEN foldlen;
89ebb4a3 6670 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6671
6672 parse_start = RExC_parse - 1;
a0d0e21e 6673
830247a4 6674 RExC_parse++;
a0d0e21e
LW
6675
6676 defchar:
58ae7d3f 6677 ender = 0;
eb160463
GS
6678 ret = reg_node(pRExC_state,
6679 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6680 s = STRING(ret);
830247a4
IZ
6681 for (len = 0, p = RExC_parse - 1;
6682 len < 127 && p < RExC_end;
a0d0e21e
LW
6683 len++)
6684 {
3dab1dad 6685 char * const oldp = p;
5b5a24f7 6686
bbe252da 6687 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6688 p = regwhite(p, RExC_end);
a0d0e21e
LW
6689 switch (*p) {
6690 case '^':
6691 case '$':
6692 case '.':
6693 case '[':
6694 case '(':
6695 case ')':
6696 case '|':
6697 goto loopdone;
6698 case '\\':
ee9b8eae
YO
6699 /* Literal Escapes Switch
6700
6701 This switch is meant to handle escape sequences that
6702 resolve to a literal character.
6703
6704 Every escape sequence that represents something
6705 else, like an assertion or a char class, is handled
6706 in the switch marked 'Special Escapes' above in this
6707 routine, but also has an entry here as anything that
6708 isn't explicitly mentioned here will be treated as
6709 an unescaped equivalent literal.
6710 */
6711
a0d0e21e 6712 switch (*++p) {
ee9b8eae
YO
6713 /* These are all the special escapes. */
6714 case 'A': /* Start assertion */
6715 case 'b': case 'B': /* Word-boundary assertion*/
6716 case 'C': /* Single char !DANGEROUS! */
6717 case 'd': case 'D': /* digit class */
6718 case 'g': case 'G': /* generic-backref, pos assertion */
6719 case 'k': case 'K': /* named backref, keep marker */
6720 case 'N': /* named char sequence */
6721 case 'p': case 'P': /* unicode property */
6722 case 's': case 'S': /* space class */
6723 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6724 case 'w': case 'W': /* word class */
6725 case 'X': /* eXtended Unicode "combining character sequence" */
6726 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
6727 --p;
6728 goto loopdone;
ee9b8eae
YO
6729
6730 /* Anything after here is an escape that resolves to a
6731 literal. (Except digits, which may or may not)
6732 */
a0d0e21e
LW
6733 case 'n':
6734 ender = '\n';
6735 p++;
a687059c 6736 break;
a0d0e21e
LW
6737 case 'r':
6738 ender = '\r';
6739 p++;
a687059c 6740 break;
a0d0e21e
LW
6741 case 't':
6742 ender = '\t';
6743 p++;
a687059c 6744 break;
a0d0e21e
LW
6745 case 'f':
6746 ender = '\f';
6747 p++;
a687059c 6748 break;
a0d0e21e 6749 case 'e':
c7f1f016 6750 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6751 p++;
a687059c 6752 break;
a0d0e21e 6753 case 'a':
c7f1f016 6754 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6755 p++;
a687059c 6756 break;
a0d0e21e 6757 case 'x':
a0ed51b3 6758 if (*++p == '{') {
1df70142 6759 char* const e = strchr(p, '}');
b81d288d 6760
b45f050a 6761 if (!e) {
830247a4 6762 RExC_parse = p + 1;
b45f050a
JF
6763 vFAIL("Missing right brace on \\x{}");
6764 }
de5f0749 6765 else {
a4c04bdc
NC
6766 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6767 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6768 STRLEN numlen = e - p - 1;
53305cf1 6769 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6770 if (ender > 0xff)
6771 RExC_utf8 = 1;
a0ed51b3
LW
6772 p = e + 1;
6773 }
a0ed51b3
LW
6774 }
6775 else {
a4c04bdc 6776 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6777 STRLEN numlen = 2;
53305cf1 6778 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6779 p += numlen;
6780 }
9e08bc66
TS
6781 if (PL_encoding && ender < 0x100)
6782 goto recode_encoding;
a687059c 6783 break;
a0d0e21e
LW
6784 case 'c':
6785 p++;
bbce6d69 6786 ender = UCHARAT(p++);
6787 ender = toCTRL(ender);
a687059c 6788 break;
a0d0e21e
LW
6789 case '0': case '1': case '2': case '3':case '4':
6790 case '5': case '6': case '7': case '8':case '9':
6791 if (*p == '0' ||
830247a4 6792 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6793 I32 flags = 0;
1df70142 6794 STRLEN numlen = 3;
53305cf1 6795 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6796 p += numlen;
6797 }
6798 else {
6799 --p;
6800 goto loopdone;
a687059c 6801 }
9e08bc66
TS
6802 if (PL_encoding && ender < 0x100)
6803 goto recode_encoding;
6804 break;
6805 recode_encoding:
6806 {
6807 SV* enc = PL_encoding;
6808 ender = reg_recode((const char)(U8)ender, &enc);
6809 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6810 vWARN(p, "Invalid escape in the specified encoding");
6811 RExC_utf8 = 1;
6812 }
a687059c 6813 break;
a0d0e21e 6814 case '\0':
830247a4 6815 if (p >= RExC_end)
b45f050a 6816 FAIL("Trailing \\");
a687059c 6817 /* FALL THROUGH */
a0d0e21e 6818 default:
041457d9 6819 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6820 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6821 goto normal_default;
a0d0e21e
LW
6822 }
6823 break;
a687059c 6824 default:
a0ed51b3 6825 normal_default:
fd400ab9 6826 if (UTF8_IS_START(*p) && UTF) {
1df70142 6827 STRLEN numlen;
5e12f4fb 6828 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6829 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6830 p += numlen;
6831 }
6832 else
6833 ender = *p++;
a0d0e21e 6834 break;
a687059c 6835 }
bbe252da 6836 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6837 p = regwhite(p, RExC_end);
60a8b682
JH
6838 if (UTF && FOLD) {
6839 /* Prime the casefolded buffer. */
ac7e0132 6840 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6841 }
a0d0e21e
LW
6842 if (ISMULT2(p)) { /* Back off on ?+*. */
6843 if (len)
6844 p = oldp;
16ea2a2e 6845 else if (UTF) {
80aecb99 6846 if (FOLD) {
60a8b682 6847 /* Emit all the Unicode characters. */
1df70142 6848 STRLEN numlen;
80aecb99
JH
6849 for (foldbuf = tmpbuf;
6850 foldlen;
6851 foldlen -= numlen) {
6852 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6853 if (numlen > 0) {
71207a34 6854 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6855 s += unilen;
6856 len += unilen;
6857 /* In EBCDIC the numlen
6858 * and unilen can differ. */
9dc45d57 6859 foldbuf += numlen;
47654450
JH
6860 if (numlen >= foldlen)
6861 break;
9dc45d57
JH
6862 }
6863 else
6864 break; /* "Can't happen." */
80aecb99
JH
6865 }
6866 }
6867 else {
71207a34 6868 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6869 if (unilen > 0) {
0ebc6274
JH
6870 s += unilen;
6871 len += unilen;
9dc45d57 6872 }
80aecb99 6873 }
a0ed51b3 6874 }
a0d0e21e
LW
6875 else {
6876 len++;
eb160463 6877 REGC((char)ender, s++);
a0d0e21e
LW
6878 }
6879 break;
a687059c 6880 }
16ea2a2e 6881 if (UTF) {
80aecb99 6882 if (FOLD) {
60a8b682 6883 /* Emit all the Unicode characters. */
1df70142 6884 STRLEN numlen;
80aecb99
JH
6885 for (foldbuf = tmpbuf;
6886 foldlen;
6887 foldlen -= numlen) {
6888 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6889 if (numlen > 0) {
71207a34 6890 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6891 len += unilen;
6892 s += unilen;
6893 /* In EBCDIC the numlen
6894 * and unilen can differ. */
9dc45d57 6895 foldbuf += numlen;
47654450
JH
6896 if (numlen >= foldlen)
6897 break;
9dc45d57
JH
6898 }
6899 else
6900 break;
80aecb99
JH
6901 }
6902 }
6903 else {
71207a34 6904 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6905 if (unilen > 0) {
0ebc6274
JH
6906 s += unilen;
6907 len += unilen;
9dc45d57 6908 }
80aecb99
JH
6909 }
6910 len--;
a0ed51b3
LW
6911 }
6912 else
eb160463 6913 REGC((char)ender, s++);
a0d0e21e
LW
6914 }
6915 loopdone:
830247a4 6916 RExC_parse = p - 1;
fac92740 6917 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6918 nextchar(pRExC_state);
793db0cb
JH
6919 {
6920 /* len is STRLEN which is unsigned, need to copy to signed */
6921 IV iv = len;
6922 if (iv < 0)
6923 vFAIL("Internal disaster");
6924 }
a0d0e21e
LW
6925 if (len > 0)
6926 *flagp |= HASWIDTH;
090f7165 6927 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6928 *flagp |= SIMPLE;
3dab1dad 6929
cd439c50 6930 if (SIZE_ONLY)
830247a4 6931 RExC_size += STR_SZ(len);
3dab1dad
YO
6932 else {
6933 STR_LEN(ret) = len;
830247a4 6934 RExC_emit += STR_SZ(len);
07be1b83 6935 }
3dab1dad 6936 }
a0d0e21e
LW
6937 break;
6938 }
a687059c 6939
a0d0e21e 6940 return(ret);
a687059c
LW
6941}
6942
873ef191 6943STATIC char *
5f66b61c 6944S_regwhite(char *p, const char *e)
5b5a24f7
CS
6945{
6946 while (p < e) {
6947 if (isSPACE(*p))
6948 ++p;
6949 else if (*p == '#') {
6950 do {
6951 p++;
6952 } while (p < e && *p != '\n');
6953 }
6954 else
6955 break;
6956 }
6957 return p;
6958}
6959
b8c5462f
JH
6960/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6961 Character classes ([:foo:]) can also be negated ([:^foo:]).
6962 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6963 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6964 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6965
6966#define POSIXCC_DONE(c) ((c) == ':')
6967#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6968#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6969
b8c5462f 6970STATIC I32
830247a4 6971S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6972{
97aff369 6973 dVAR;
936ed897 6974 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6975
830247a4 6976 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6977 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6978 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6979 const char c = UCHARAT(RExC_parse);
097eb12c 6980 char* const s = RExC_parse++;
b81d288d 6981
9a86a77b 6982 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6983 RExC_parse++;
6984 if (RExC_parse == RExC_end)
620e46c5 6985 /* Grandfather lone [:, [=, [. */
830247a4 6986 RExC_parse = s;
620e46c5 6987 else {
3dab1dad 6988 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6989 assert(*t == c);
6990
9a86a77b 6991 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6992 const char *posixcc = s + 1;
830247a4 6993 RExC_parse++; /* skip over the ending ] */
3dab1dad 6994
b8c5462f 6995 if (*s == ':') {
1df70142
AL
6996 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6997 const I32 skip = t - posixcc;
80916619
NC
6998
6999 /* Initially switch on the length of the name. */
7000 switch (skip) {
7001 case 4:
3dab1dad
YO
7002 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7003 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7004 break;
80916619
NC
7005 case 5:
7006 /* Names all of length 5. */
7007 /* alnum alpha ascii blank cntrl digit graph lower
7008 print punct space upper */
7009 /* Offset 4 gives the best switch position. */
7010 switch (posixcc[4]) {
7011 case 'a':
3dab1dad
YO
7012 if (memEQ(posixcc, "alph", 4)) /* alpha */
7013 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7014 break;
7015 case 'e':
3dab1dad
YO
7016 if (memEQ(posixcc, "spac", 4)) /* space */
7017 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7018 break;
7019 case 'h':
3dab1dad
YO
7020 if (memEQ(posixcc, "grap", 4)) /* graph */
7021 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7022 break;
7023 case 'i':
3dab1dad
YO
7024 if (memEQ(posixcc, "asci", 4)) /* ascii */
7025 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7026 break;
7027 case 'k':
3dab1dad
YO
7028 if (memEQ(posixcc, "blan", 4)) /* blank */
7029 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7030 break;
7031 case 'l':
3dab1dad
YO
7032 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7033 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7034 break;
7035 case 'm':
3dab1dad
YO
7036 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7037 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7038 break;
7039 case 'r':
3dab1dad
YO
7040 if (memEQ(posixcc, "lowe", 4)) /* lower */
7041 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7042 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7043 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7044 break;
7045 case 't':
3dab1dad
YO
7046 if (memEQ(posixcc, "digi", 4)) /* digit */
7047 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7048 else if (memEQ(posixcc, "prin", 4)) /* print */
7049 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7050 else if (memEQ(posixcc, "punc", 4)) /* punct */
7051 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7052 break;
b8c5462f
JH
7053 }
7054 break;
80916619 7055 case 6:
3dab1dad
YO
7056 if (memEQ(posixcc, "xdigit", 6))
7057 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7058 break;
7059 }
80916619
NC
7060
7061 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7062 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7063 t - s - 1, s + 1);
80916619
NC
7064 assert (posixcc[skip] == ':');
7065 assert (posixcc[skip+1] == ']');
b45f050a 7066 } else if (!SIZE_ONLY) {
b8c5462f 7067 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7068
830247a4 7069 /* adjust RExC_parse so the warning shows after
b45f050a 7070 the class closes */
9a86a77b 7071 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7072 RExC_parse++;
b45f050a
JF
7073 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7074 }
b8c5462f
JH
7075 } else {
7076 /* Maternal grandfather:
7077 * "[:" ending in ":" but not in ":]" */
830247a4 7078 RExC_parse = s;
767d463e 7079 }
620e46c5
JH
7080 }
7081 }
7082
b8c5462f
JH
7083 return namedclass;
7084}
7085
7086STATIC void
830247a4 7087S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7088{
97aff369 7089 dVAR;
3dab1dad 7090 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7091 const char *s = RExC_parse;
7092 const char c = *s++;
b8c5462f 7093
3dab1dad 7094 while (isALNUM(*s))
b8c5462f
JH
7095 s++;
7096 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7097 if (ckWARN(WARN_REGEXP))
7098 vWARN3(s+2,
7099 "POSIX syntax [%c %c] belongs inside character classes",
7100 c, c);
b45f050a
JF
7101
7102 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7103 if (POSIXCC_NOTYET(c)) {
830247a4 7104 /* adjust RExC_parse so the error shows after
b45f050a 7105 the class closes */
9a86a77b 7106 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7107 NOOP;
b45f050a
JF
7108 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7109 }
b8c5462f
JH
7110 }
7111 }
620e46c5
JH
7112}
7113
7f6f358c 7114
89836f1f
YO
7115#define _C_C_T_(NAME,TEST,WORD) \
7116ANYOF_##NAME: \
7117 if (LOC) \
7118 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7119 else { \
7120 for (value = 0; value < 256; value++) \
7121 if (TEST) \
7122 ANYOF_BITMAP_SET(ret, value); \
7123 } \
7124 yesno = '+'; \
7125 what = WORD; \
7126 break; \
7127case ANYOF_N##NAME: \
7128 if (LOC) \
7129 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7130 else { \
7131 for (value = 0; value < 256; value++) \
7132 if (!TEST) \
7133 ANYOF_BITMAP_SET(ret, value); \
7134 } \
7135 yesno = '!'; \
7136 what = WORD; \
7137 break
7138
7139
7f6f358c
YO
7140/*
7141 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7142 matches the pattern or if the pattern matches a single char only and
7143 that char is < 256 and we are case insensitive then we produce an
7144 EXACT node instead.
7f6f358c 7145*/
89836f1f 7146
76e3520e 7147STATIC regnode *
3dab1dad 7148S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7149{
97aff369 7150 dVAR;
9ef43ace 7151 register UV value = 0;
9a86a77b 7152 register UV nextvalue;
3568d838 7153 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7154 register IV range = 0;
c277df42 7155 register regnode *ret;
ba210ebe 7156 STRLEN numlen;
ffc61ed2 7157 IV namedclass;
cbbf8932 7158 char *rangebegin = NULL;
936ed897 7159 bool need_class = 0;
c445ea15 7160 SV *listsv = NULL;
ffc61ed2 7161 UV n;
9e55ce06 7162 bool optimize_invert = TRUE;
cbbf8932 7163 AV* unicode_alternate = NULL;
1b2d223b
JH
7164#ifdef EBCDIC
7165 UV literal_endpoint = 0;
7166#endif
7f6f358c 7167 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7168
3dab1dad 7169 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7170 case we need to change the emitted regop to an EXACT. */
07be1b83 7171 const char * orig_parse = RExC_parse;
72f13be8 7172 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7173#ifndef DEBUGGING
7174 PERL_UNUSED_ARG(depth);
7175#endif
72f13be8 7176
3dab1dad 7177 DEBUG_PARSE("clas");
7f6f358c
YO
7178
7179 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7180 ret = reganode(pRExC_state, ANYOF, 0);
7181
7182 if (!SIZE_ONLY)
7183 ANYOF_FLAGS(ret) = 0;
7184
9a86a77b 7185 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7186 RExC_naughty++;
7187 RExC_parse++;
7188 if (!SIZE_ONLY)
7189 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7190 }
a0d0e21e 7191
73060fc4 7192 if (SIZE_ONLY) {
830247a4 7193 RExC_size += ANYOF_SKIP;
73060fc4
JH
7194 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7195 }
936ed897 7196 else {
830247a4 7197 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7198 if (FOLD)
7199 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7200 if (LOC)
7201 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7202 ANYOF_BITMAP_ZERO(ret);
396482e1 7203 listsv = newSVpvs("# comment\n");
a0d0e21e 7204 }
b8c5462f 7205
9a86a77b
JH
7206 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7207
b938889d 7208 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7209 checkposixcc(pRExC_state);
b8c5462f 7210
f064b6ad
HS
7211 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7212 if (UCHARAT(RExC_parse) == ']')
7213 goto charclassloop;
ffc61ed2 7214
fc8cd66c 7215parseit:
9a86a77b 7216 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7217
7218 charclassloop:
7219
7220 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7221
73b437c8 7222 if (!range)
830247a4 7223 rangebegin = RExC_parse;
ffc61ed2 7224 if (UTF) {
5e12f4fb 7225 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7226 RExC_end - RExC_parse,
9f7f3913 7227 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7228 RExC_parse += numlen;
7229 }
7230 else
7231 value = UCHARAT(RExC_parse++);
7f6f358c 7232
9a86a77b
JH
7233 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7234 if (value == '[' && POSIXCC(nextvalue))
830247a4 7235 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7236 else if (value == '\\') {
ffc61ed2 7237 if (UTF) {
5e12f4fb 7238 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7239 RExC_end - RExC_parse,
9f7f3913 7240 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7241 RExC_parse += numlen;
7242 }
7243 else
7244 value = UCHARAT(RExC_parse++);
470c3474 7245 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7246 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7247 * be a problem later if we want switch on Unicode.
7248 * A similar issue a little bit later when switching on
7249 * namedclass. --jhi */
ffc61ed2 7250 switch ((I32)value) {
b8c5462f
JH
7251 case 'w': namedclass = ANYOF_ALNUM; break;
7252 case 'W': namedclass = ANYOF_NALNUM; break;
7253 case 's': namedclass = ANYOF_SPACE; break;
7254 case 'S': namedclass = ANYOF_NSPACE; break;
7255 case 'd': namedclass = ANYOF_DIGIT; break;
7256 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7257 case 'N': /* Handle \N{NAME} in class */
7258 {
7259 /* We only pay attention to the first char of
7260 multichar strings being returned. I kinda wonder
7261 if this makes sense as it does change the behaviour
7262 from earlier versions, OTOH that behaviour was broken
7263 as well. */
7264 UV v; /* value is register so we cant & it /grrr */
7265 if (reg_namedseq(pRExC_state, &v)) {
7266 goto parseit;
7267 }
7268 value= v;
7269 }
7270 break;
ffc61ed2
JH
7271 case 'p':
7272 case 'P':
3dab1dad
YO
7273 {
7274 char *e;
af6f566e 7275 if (RExC_parse >= RExC_end)
2a4859cd 7276 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7277 if (*RExC_parse == '{') {
1df70142 7278 const U8 c = (U8)value;
ffc61ed2
JH
7279 e = strchr(RExC_parse++, '}');
7280 if (!e)
0da60cf5 7281 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7282 while (isSPACE(UCHARAT(RExC_parse)))
7283 RExC_parse++;
7284 if (e == RExC_parse)
0da60cf5 7285 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7286 n = e - RExC_parse;
ab13f0c7
JH
7287 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7288 n--;
ffc61ed2
JH
7289 }
7290 else {
7291 e = RExC_parse;
7292 n = 1;
7293 }
7294 if (!SIZE_ONLY) {
ab13f0c7
JH
7295 if (UCHARAT(RExC_parse) == '^') {
7296 RExC_parse++;
7297 n--;
7298 value = value == 'p' ? 'P' : 'p'; /* toggle */
7299 while (isSPACE(UCHARAT(RExC_parse))) {
7300 RExC_parse++;
7301 n--;
7302 }
7303 }
097eb12c
AL
7304 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7305 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7306 }
7307 RExC_parse = e + 1;
7308 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7309 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7310 }
f81125e2 7311 break;
b8c5462f
JH
7312 case 'n': value = '\n'; break;
7313 case 'r': value = '\r'; break;
7314 case 't': value = '\t'; break;
7315 case 'f': value = '\f'; break;
7316 case 'b': value = '\b'; break;
c7f1f016
NIS
7317 case 'e': value = ASCII_TO_NATIVE('\033');break;
7318 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7319 case 'x':
ffc61ed2 7320 if (*RExC_parse == '{') {
a4c04bdc
NC
7321 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7322 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7323 char * const e = strchr(RExC_parse++, '}');
b81d288d 7324 if (!e)
ffc61ed2 7325 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7326
7327 numlen = e - RExC_parse;
7328 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7329 RExC_parse = e + 1;
7330 }
7331 else {
a4c04bdc 7332 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7333 numlen = 2;
7334 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7335 RExC_parse += numlen;
7336 }
9e08bc66
TS
7337 if (PL_encoding && value < 0x100)
7338 goto recode_encoding;
b8c5462f
JH
7339 break;
7340 case 'c':
830247a4 7341 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7342 value = toCTRL(value);
7343 break;
7344 case '0': case '1': case '2': case '3': case '4':
7345 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7346 {
7347 I32 flags = 0;
7348 numlen = 3;
7349 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7350 RExC_parse += numlen;
7351 if (PL_encoding && value < 0x100)
7352 goto recode_encoding;
7353 break;
7354 }
7355 recode_encoding:
7356 {
7357 SV* enc = PL_encoding;
7358 value = reg_recode((const char)(U8)value, &enc);
7359 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7360 vWARN(RExC_parse,
7361 "Invalid escape in the specified encoding");
7362 break;
7363 }
1028017a 7364 default:
041457d9 7365 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7366 vWARN2(RExC_parse,
7367 "Unrecognized escape \\%c in character class passed through",
7368 (int)value);
1028017a 7369 break;
b8c5462f 7370 }
ffc61ed2 7371 } /* end of \blah */
1b2d223b
JH
7372#ifdef EBCDIC
7373 else
7374 literal_endpoint++;
7375#endif
ffc61ed2
JH
7376
7377 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7378
7379 if (!SIZE_ONLY && !need_class)
936ed897 7380 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7381
936ed897 7382 need_class = 1;
ffc61ed2
JH
7383
7384 /* a bad range like a-\d, a-[:digit:] ? */
7385 if (range) {
73b437c8 7386 if (!SIZE_ONLY) {
afd78fd5 7387 if (ckWARN(WARN_REGEXP)) {
097eb12c 7388 const int w =
afd78fd5
JH
7389 RExC_parse >= rangebegin ?
7390 RExC_parse - rangebegin : 0;
830247a4 7391 vWARN4(RExC_parse,
b45f050a 7392 "False [] range \"%*.*s\"",
097eb12c 7393 w, w, rangebegin);
afd78fd5 7394 }
3568d838
JH
7395 if (prevvalue < 256) {
7396 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7397 ANYOF_BITMAP_SET(ret, '-');
7398 }
7399 else {
7400 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7401 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7402 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7403 }
b8c5462f 7404 }
ffc61ed2
JH
7405
7406 range = 0; /* this was not a true range */
73b437c8 7407 }
ffc61ed2 7408
89836f1f
YO
7409
7410
73b437c8 7411 if (!SIZE_ONLY) {
c49a72a9
NC
7412 const char *what = NULL;
7413 char yesno = 0;
7414
3568d838
JH
7415 if (namedclass > OOB_NAMEDCLASS)
7416 optimize_invert = FALSE;
e2962f66
JH
7417 /* Possible truncation here but in some 64-bit environments
7418 * the compiler gets heartburn about switch on 64-bit values.
7419 * A similar issue a little earlier when switching on value.
98f323fa 7420 * --jhi */
e2962f66 7421 switch ((I32)namedclass) {
89836f1f
YO
7422 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7423 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7424 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7425 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7426 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7427 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7428 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7429 case _C_C_T_(PRINT, isPRINT(value), "Print");
7430 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7431 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7432 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7433 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7434 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
73b437c8
JH
7435 case ANYOF_ASCII:
7436 if (LOC)
936ed897 7437 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7438 else {
c7f1f016 7439#ifndef EBCDIC
1ba5c669
JH
7440 for (value = 0; value < 128; value++)
7441 ANYOF_BITMAP_SET(ret, value);
7442#else /* EBCDIC */
ffbc6a93 7443 for (value = 0; value < 256; value++) {
3a3c4447
JH
7444 if (isASCII(value))
7445 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7446 }
1ba5c669 7447#endif /* EBCDIC */
73b437c8 7448 }
c49a72a9
NC
7449 yesno = '+';
7450 what = "ASCII";
73b437c8
JH
7451 break;
7452 case ANYOF_NASCII:
7453 if (LOC)
936ed897 7454 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7455 else {
c7f1f016 7456#ifndef EBCDIC
1ba5c669
JH
7457 for (value = 128; value < 256; value++)
7458 ANYOF_BITMAP_SET(ret, value);
7459#else /* EBCDIC */
ffbc6a93 7460 for (value = 0; value < 256; value++) {
3a3c4447
JH
7461 if (!isASCII(value))
7462 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7463 }
1ba5c669 7464#endif /* EBCDIC */
73b437c8 7465 }
c49a72a9
NC
7466 yesno = '!';
7467 what = "ASCII";
89836f1f 7468 break;
ffc61ed2
JH
7469 case ANYOF_DIGIT:
7470 if (LOC)
7471 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7472 else {
7473 /* consecutive digits assumed */
7474 for (value = '0'; value <= '9'; value++)
7475 ANYOF_BITMAP_SET(ret, value);
7476 }
c49a72a9
NC
7477 yesno = '+';
7478 what = "Digit";
ffc61ed2
JH
7479 break;
7480 case ANYOF_NDIGIT:
7481 if (LOC)
7482 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7483 else {
7484 /* consecutive digits assumed */
7485 for (value = 0; value < '0'; value++)
7486 ANYOF_BITMAP_SET(ret, value);
7487 for (value = '9' + 1; value < 256; value++)
7488 ANYOF_BITMAP_SET(ret, value);
7489 }
c49a72a9
NC
7490 yesno = '!';
7491 what = "Digit";
89836f1f 7492 break;
f81125e2
JP
7493 case ANYOF_MAX:
7494 /* this is to handle \p and \P */
7495 break;
73b437c8 7496 default:
b45f050a 7497 vFAIL("Invalid [::] class");
73b437c8 7498 break;
b8c5462f 7499 }
c49a72a9
NC
7500 if (what) {
7501 /* Strings such as "+utf8::isWord\n" */
7502 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7503 }
b8c5462f 7504 if (LOC)
936ed897 7505 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7506 continue;
a0d0e21e 7507 }
ffc61ed2
JH
7508 } /* end of namedclass \blah */
7509
a0d0e21e 7510 if (range) {
eb160463 7511 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7512 const int w = RExC_parse - rangebegin;
7513 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7514 range = 0; /* not a valid range */
73b437c8 7515 }
a0d0e21e
LW
7516 }
7517 else {
3568d838 7518 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7519 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7520 RExC_parse[1] != ']') {
7521 RExC_parse++;
ffc61ed2
JH
7522
7523 /* a bad range like \w-, [:word:]- ? */
7524 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7525 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7526 const int w =
afd78fd5
JH
7527 RExC_parse >= rangebegin ?
7528 RExC_parse - rangebegin : 0;
830247a4 7529 vWARN4(RExC_parse,
b45f050a 7530 "False [] range \"%*.*s\"",
097eb12c 7531 w, w, rangebegin);
afd78fd5 7532 }
73b437c8 7533 if (!SIZE_ONLY)
936ed897 7534 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7535 } else
ffc61ed2
JH
7536 range = 1; /* yeah, it's a range! */
7537 continue; /* but do it the next time */
a0d0e21e 7538 }
a687059c 7539 }
ffc61ed2 7540
93733859 7541 /* now is the next time */
07be1b83 7542 /*stored += (value - prevvalue + 1);*/
ae5c130c 7543 if (!SIZE_ONLY) {
3568d838 7544 if (prevvalue < 256) {
1df70142 7545 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7546 IV i;
3568d838 7547#ifdef EBCDIC
1b2d223b
JH
7548 /* In EBCDIC [\x89-\x91] should include
7549 * the \x8e but [i-j] should not. */
7550 if (literal_endpoint == 2 &&
7551 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7552 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7553 {
3568d838
JH
7554 if (isLOWER(prevvalue)) {
7555 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7556 if (isLOWER(i))
7557 ANYOF_BITMAP_SET(ret, i);
7558 } else {
3568d838 7559 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7560 if (isUPPER(i))
7561 ANYOF_BITMAP_SET(ret, i);
7562 }
8ada0baa 7563 }
ffc61ed2 7564 else
8ada0baa 7565#endif
07be1b83
YO
7566 for (i = prevvalue; i <= ceilvalue; i++) {
7567 if (!ANYOF_BITMAP_TEST(ret,i)) {
7568 stored++;
7569 ANYOF_BITMAP_SET(ret, i);
7570 }
7571 }
3568d838 7572 }
a5961de5 7573 if (value > 255 || UTF) {
1df70142
AL
7574 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7575 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7576 stored+=2; /* can't optimize this class */
ffc61ed2 7577 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7578 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7579 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7580 prevnatvalue, natvalue);
7581 }
7582 else if (prevnatvalue == natvalue) {
7583 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7584 if (FOLD) {
89ebb4a3 7585 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7586 STRLEN foldlen;
1df70142 7587 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7588
e294cc5d
JH
7589#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7590 if (RExC_precomp[0] == ':' &&
7591 RExC_precomp[1] == '[' &&
7592 (f == 0xDF || f == 0x92)) {
7593 f = NATIVE_TO_UNI(f);
7594 }
7595#endif
c840d2a2
JH
7596 /* If folding and foldable and a single
7597 * character, insert also the folded version
7598 * to the charclass. */
9e55ce06 7599 if (f != value) {
e294cc5d
JH
7600#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7601 if ((RExC_precomp[0] == ':' &&
7602 RExC_precomp[1] == '[' &&
7603 (f == 0xA2 &&
7604 (value == 0xFB05 || value == 0xFB06))) ?
7605 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7606 foldlen == (STRLEN)UNISKIP(f) )
7607#else
eb160463 7608 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7609#endif
9e55ce06
JH
7610 Perl_sv_catpvf(aTHX_ listsv,
7611 "%04"UVxf"\n", f);
7612 else {
7613 /* Any multicharacter foldings
7614 * require the following transform:
7615 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7616 * where E folds into "pq" and F folds
7617 * into "rst", all other characters
7618 * fold to single characters. We save
7619 * away these multicharacter foldings,
7620 * to be later saved as part of the
7621 * additional "s" data. */
7622 SV *sv;
7623
7624 if (!unicode_alternate)
7625 unicode_alternate = newAV();
7626 sv = newSVpvn((char*)foldbuf, foldlen);
7627 SvUTF8_on(sv);
7628 av_push(unicode_alternate, sv);
7629 }
7630 }
254ba52a 7631
60a8b682
JH
7632 /* If folding and the value is one of the Greek
7633 * sigmas insert a few more sigmas to make the
7634 * folding rules of the sigmas to work right.
7635 * Note that not all the possible combinations
7636 * are handled here: some of them are handled
9e55ce06
JH
7637 * by the standard folding rules, and some of
7638 * them (literal or EXACTF cases) are handled
7639 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7640 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7641 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7642 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7643 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7644 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7645 }
7646 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7647 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7648 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7649 }
7650 }
ffc61ed2 7651 }
1b2d223b
JH
7652#ifdef EBCDIC
7653 literal_endpoint = 0;
7654#endif
8ada0baa 7655 }
ffc61ed2
JH
7656
7657 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7658 }
ffc61ed2 7659
936ed897 7660 if (need_class) {
4f66b38d 7661 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7662 if (SIZE_ONLY)
830247a4 7663 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7664 else
830247a4 7665 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7666 }
ffc61ed2 7667
7f6f358c
YO
7668
7669 if (SIZE_ONLY)
7670 return ret;
7671 /****** !SIZE_ONLY AFTER HERE *********/
7672
7673 if( stored == 1 && value < 256
7674 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7675 ) {
7676 /* optimize single char class to an EXACT node
7677 but *only* when its not a UTF/high char */
07be1b83
YO
7678 const char * cur_parse= RExC_parse;
7679 RExC_emit = (regnode *)orig_emit;
7680 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7681 ret = reg_node(pRExC_state,
7682 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7683 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7684 *STRING(ret)= (char)value;
7685 STR_LEN(ret)= 1;
7686 RExC_emit += STR_SZ(1);
7687 return ret;
7688 }
ae5c130c 7689 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7690 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7691 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7692 ) {
a0ed51b3 7693 for (value = 0; value < 256; ++value) {
936ed897 7694 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7695 UV fold = PL_fold[value];
ffc61ed2
JH
7696
7697 if (fold != value)
7698 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7699 }
7700 }
936ed897 7701 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7702 }
ffc61ed2 7703
ae5c130c 7704 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7705 if (optimize_invert &&
ffc61ed2
JH
7706 /* If the only flag is inversion. */
7707 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7708 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7709 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7710 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7711 }
7f6f358c 7712 {
097eb12c 7713 AV * const av = newAV();
ffc61ed2 7714 SV *rv;
9e55ce06 7715 /* The 0th element stores the character class description
6a0407ee 7716 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7717 * to initialize the appropriate swash (which gets stored in
7718 * the 1st element), and also useful for dumping the regnode.
7719 * The 2nd element stores the multicharacter foldings,
6a0407ee 7720 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7721 av_store(av, 0, listsv);
7722 av_store(av, 1, NULL);
9e55ce06 7723 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7724 rv = newRV_noinc((SV*)av);
19860706 7725 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7726 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7727 ARG_SET(ret, n);
a0ed51b3 7728 }
a0ed51b3
LW
7729 return ret;
7730}
89836f1f
YO
7731#undef _C_C_T_
7732
a0ed51b3 7733
76e3520e 7734STATIC char*
830247a4 7735S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7736{
097eb12c 7737 char* const retval = RExC_parse++;
a0d0e21e 7738
4633a7c4 7739 for (;;) {
830247a4
IZ
7740 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7741 RExC_parse[2] == '#') {
e994fd66
AE
7742 while (*RExC_parse != ')') {
7743 if (RExC_parse == RExC_end)
7744 FAIL("Sequence (?#... not terminated");
830247a4 7745 RExC_parse++;
e994fd66 7746 }
830247a4 7747 RExC_parse++;
4633a7c4
LW
7748 continue;
7749 }
bbe252da 7750 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7751 if (isSPACE(*RExC_parse)) {
7752 RExC_parse++;
748a9306
LW
7753 continue;
7754 }
830247a4 7755 else if (*RExC_parse == '#') {
e994fd66
AE
7756 while (RExC_parse < RExC_end)
7757 if (*RExC_parse++ == '\n') break;
748a9306
LW
7758 continue;
7759 }
748a9306 7760 }
4633a7c4 7761 return retval;
a0d0e21e 7762 }
a687059c
LW
7763}
7764
7765/*
c277df42 7766- reg_node - emit a node
a0d0e21e 7767*/
76e3520e 7768STATIC regnode * /* Location. */
830247a4 7769S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7770{
97aff369 7771 dVAR;
c277df42 7772 register regnode *ptr;
504618e9 7773 regnode * const ret = RExC_emit;
07be1b83 7774 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7775
c277df42 7776 if (SIZE_ONLY) {
830247a4
IZ
7777 SIZE_ALIGN(RExC_size);
7778 RExC_size += 1;
a0d0e21e
LW
7779 return(ret);
7780 }
e2e6a0f1
YO
7781#ifdef DEBUGGING
7782 if (OP(RExC_emit) == 255)
7783 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7784 reg_name[op], OP(RExC_emit));
7785#endif
c277df42 7786 NODE_ALIGN_FILL(ret);
a0d0e21e 7787 ptr = ret;
c277df42 7788 FILL_ADVANCE_NODE(ptr, op);
7122b237 7789#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7790 if (RExC_offsets) { /* MJD */
07be1b83 7791 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7792 "reg_node", __LINE__,
7793 reg_name[op],
07be1b83
YO
7794 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7795 ? "Overwriting end of array!\n" : "OK",
7796 (UV)(RExC_emit - RExC_emit_start),
7797 (UV)(RExC_parse - RExC_start),
7798 (UV)RExC_offsets[0]));
ccb2c380 7799 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7800 }
7122b237 7801#endif
830247a4 7802 RExC_emit = ptr;
a0d0e21e 7803 return(ret);
a687059c
LW
7804}
7805
7806/*
a0d0e21e
LW
7807- reganode - emit a node with an argument
7808*/
76e3520e 7809STATIC regnode * /* Location. */
830247a4 7810S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7811{
97aff369 7812 dVAR;
c277df42 7813 register regnode *ptr;
504618e9 7814 regnode * const ret = RExC_emit;
07be1b83 7815 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7816
c277df42 7817 if (SIZE_ONLY) {
830247a4
IZ
7818 SIZE_ALIGN(RExC_size);
7819 RExC_size += 2;
6bda09f9
YO
7820 /*
7821 We can't do this:
7822
7823 assert(2==regarglen[op]+1);
7824
7825 Anything larger than this has to allocate the extra amount.
7826 If we changed this to be:
7827
7828 RExC_size += (1 + regarglen[op]);
7829
7830 then it wouldn't matter. Its not clear what side effect
7831 might come from that so its not done so far.
7832 -- dmq
7833 */
a0d0e21e
LW
7834 return(ret);
7835 }
e2e6a0f1
YO
7836#ifdef DEBUGGING
7837 if (OP(RExC_emit) == 255)
7838 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7839#endif
c277df42 7840 NODE_ALIGN_FILL(ret);
a0d0e21e 7841 ptr = ret;
c277df42 7842 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 7843#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7844 if (RExC_offsets) { /* MJD */
07be1b83 7845 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7846 "reganode",
ccb2c380
MP
7847 __LINE__,
7848 reg_name[op],
07be1b83 7849 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7850 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7851 (UV)(RExC_emit - RExC_emit_start),
7852 (UV)(RExC_parse - RExC_start),
7853 (UV)RExC_offsets[0]));
ccb2c380 7854 Set_Cur_Node_Offset;
fac92740 7855 }
7122b237 7856#endif
830247a4 7857 RExC_emit = ptr;
a0d0e21e 7858 return(ret);
fe14fcc3
LW
7859}
7860
7861/*
cd439c50 7862- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7863*/
71207a34
AL
7864STATIC STRLEN
7865S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7866{
97aff369 7867 dVAR;
71207a34 7868 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7869}
7870
7871/*
a0d0e21e
LW
7872- reginsert - insert an operator in front of already-emitted operand
7873*
7874* Means relocating the operand.
7875*/
76e3520e 7876STATIC void
6bda09f9 7877S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7878{
97aff369 7879 dVAR;
c277df42
IZ
7880 register regnode *src;
7881 register regnode *dst;
7882 register regnode *place;
504618e9 7883 const int offset = regarglen[(U8)op];
6bda09f9 7884 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7885 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7886/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7887 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7888 if (SIZE_ONLY) {
6bda09f9 7889 RExC_size += size;
a0d0e21e
LW
7890 return;
7891 }
a687059c 7892
830247a4 7893 src = RExC_emit;
6bda09f9 7894 RExC_emit += size;
830247a4 7895 dst = RExC_emit;
40d049e4 7896 if (RExC_open_parens) {
6bda09f9 7897 int paren;
6d99fb9b 7898 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7899 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7900 if ( RExC_open_parens[paren] >= opnd ) {
7901 DEBUG_PARSE_FMT("open"," - %d",size);
7902 RExC_open_parens[paren] += size;
7903 } else {
7904 DEBUG_PARSE_FMT("open"," - %s","ok");
7905 }
7906 if ( RExC_close_parens[paren] >= opnd ) {
7907 DEBUG_PARSE_FMT("close"," - %d",size);
7908 RExC_close_parens[paren] += size;
7909 } else {
7910 DEBUG_PARSE_FMT("close"," - %s","ok");
7911 }
7912 }
6bda09f9 7913 }
40d049e4 7914
fac92740 7915 while (src > opnd) {
c277df42 7916 StructCopy(--src, --dst, regnode);
7122b237 7917#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7918 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7919 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7920 "reg_insert",
ccb2c380
MP
7921 __LINE__,
7922 reg_name[op],
07be1b83
YO
7923 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7924 ? "Overwriting end of array!\n" : "OK",
7925 (UV)(src - RExC_emit_start),
7926 (UV)(dst - RExC_emit_start),
7927 (UV)RExC_offsets[0]));
ccb2c380
MP
7928 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7929 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 7930 }
7122b237 7931#endif
fac92740
MJD
7932 }
7933
a0d0e21e
LW
7934
7935 place = opnd; /* Op node, where operand used to be. */
7122b237 7936#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7937 if (RExC_offsets) { /* MJD */
07be1b83 7938 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7939 "reginsert",
ccb2c380
MP
7940 __LINE__,
7941 reg_name[op],
07be1b83 7942 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7943 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7944 (UV)(place - RExC_emit_start),
7945 (UV)(RExC_parse - RExC_start),
786e8c11 7946 (UV)RExC_offsets[0]));
ccb2c380 7947 Set_Node_Offset(place, RExC_parse);
45948336 7948 Set_Node_Length(place, 1);
fac92740 7949 }
7122b237 7950#endif
c277df42
IZ
7951 src = NEXTOPER(place);
7952 FILL_ADVANCE_NODE(place, op);
7953 Zero(src, offset, regnode);
a687059c
LW
7954}
7955
7956/*
c277df42 7957- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7958- SEE ALSO: regtail_study
a0d0e21e 7959*/
097eb12c 7960/* TODO: All three parms should be const */
76e3520e 7961STATIC void
3dab1dad 7962S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7963{
97aff369 7964 dVAR;
c277df42 7965 register regnode *scan;
72f13be8 7966 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7967#ifndef DEBUGGING
7968 PERL_UNUSED_ARG(depth);
7969#endif
a0d0e21e 7970
c277df42 7971 if (SIZE_ONLY)
a0d0e21e
LW
7972 return;
7973
7974 /* Find last node. */
7975 scan = p;
7976 for (;;) {
504618e9 7977 regnode * const temp = regnext(scan);
3dab1dad
YO
7978 DEBUG_PARSE_r({
7979 SV * const mysv=sv_newmortal();
7980 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7981 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7982 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7983 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7984 (temp == NULL ? "->" : ""),
7985 (temp == NULL ? reg_name[OP(val)] : "")
7986 );
3dab1dad
YO
7987 });
7988 if (temp == NULL)
7989 break;
7990 scan = temp;
7991 }
7992
7993 if (reg_off_by_arg[OP(scan)]) {
7994 ARG_SET(scan, val - scan);
7995 }
7996 else {
7997 NEXT_OFF(scan) = val - scan;
7998 }
7999}
8000
07be1b83 8001#ifdef DEBUGGING
3dab1dad
YO
8002/*
8003- regtail_study - set the next-pointer at the end of a node chain of p to val.
8004- Look for optimizable sequences at the same time.
8005- currently only looks for EXACT chains.
07be1b83
YO
8006
8007This is expermental code. The idea is to use this routine to perform
8008in place optimizations on branches and groups as they are constructed,
8009with the long term intention of removing optimization from study_chunk so
8010that it is purely analytical.
8011
8012Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8013to control which is which.
8014
3dab1dad
YO
8015*/
8016/* TODO: All four parms should be const */
07be1b83 8017
3dab1dad
YO
8018STATIC U8
8019S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8020{
8021 dVAR;
8022 register regnode *scan;
07be1b83
YO
8023 U8 exact = PSEUDO;
8024#ifdef EXPERIMENTAL_INPLACESCAN
8025 I32 min = 0;
8026#endif
8027
3dab1dad
YO
8028 GET_RE_DEBUG_FLAGS_DECL;
8029
07be1b83 8030
3dab1dad
YO
8031 if (SIZE_ONLY)
8032 return exact;
8033
8034 /* Find last node. */
8035
8036 scan = p;
8037 for (;;) {
8038 regnode * const temp = regnext(scan);
07be1b83
YO
8039#ifdef EXPERIMENTAL_INPLACESCAN
8040 if (PL_regkind[OP(scan)] == EXACT)
8041 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8042 return EXACT;
8043#endif
3dab1dad
YO
8044 if ( exact ) {
8045 switch (OP(scan)) {
8046 case EXACT:
8047 case EXACTF:
8048 case EXACTFL:
8049 if( exact == PSEUDO )
8050 exact= OP(scan);
07be1b83
YO
8051 else if ( exact != OP(scan) )
8052 exact= 0;
3dab1dad
YO
8053 case NOTHING:
8054 break;
8055 default:
8056 exact= 0;
8057 }
8058 }
8059 DEBUG_PARSE_r({
8060 SV * const mysv=sv_newmortal();
8061 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8062 regprop(RExC_rx, mysv, scan);
eaf3ca90 8063 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8064 SvPV_nolen_const(mysv),
eaf3ca90
YO
8065 REG_NODE_NUM(scan),
8066 reg_name[exact]);
3dab1dad 8067 });
a0d0e21e
LW
8068 if (temp == NULL)
8069 break;
8070 scan = temp;
8071 }
07be1b83
YO
8072 DEBUG_PARSE_r({
8073 SV * const mysv_val=sv_newmortal();
8074 DEBUG_PARSE_MSG("");
8075 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8076 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8077 SvPV_nolen_const(mysv_val),
8078 (IV)REG_NODE_NUM(val),
8079 (IV)(val - scan)
07be1b83
YO
8080 );
8081 });
c277df42
IZ
8082 if (reg_off_by_arg[OP(scan)]) {
8083 ARG_SET(scan, val - scan);
a0ed51b3
LW
8084 }
8085 else {
c277df42
IZ
8086 NEXT_OFF(scan) = val - scan;
8087 }
3dab1dad
YO
8088
8089 return exact;
a687059c 8090}
07be1b83 8091#endif
a687059c
LW
8092
8093/*
a687059c
LW
8094 - regcurly - a little FSA that accepts {\d+,?\d*}
8095 */
79072805 8096STATIC I32
5f66b61c 8097S_regcurly(register const char *s)
a687059c
LW
8098{
8099 if (*s++ != '{')
8100 return FALSE;
f0fcb552 8101 if (!isDIGIT(*s))
a687059c 8102 return FALSE;
f0fcb552 8103 while (isDIGIT(*s))
a687059c
LW
8104 s++;
8105 if (*s == ',')
8106 s++;
f0fcb552 8107 while (isDIGIT(*s))
a687059c
LW
8108 s++;
8109 if (*s != '}')
8110 return FALSE;
8111 return TRUE;
8112}
8113
a687059c
LW
8114
8115/*
fd181c75 8116 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8117 */
8118void
097eb12c 8119Perl_regdump(pTHX_ const regexp *r)
a687059c 8120{
35ff7856 8121#ifdef DEBUGGING
97aff369 8122 dVAR;
c445ea15 8123 SV * const sv = sv_newmortal();
ab3bbdeb 8124 SV *dsv= sv_newmortal();
f8fc2ecf 8125 RXi_GET_DECL(r,ri);
a687059c 8126
f8fc2ecf 8127 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8128
8129 /* Header fields of interest. */
ab3bbdeb
YO
8130 if (r->anchored_substr) {
8131 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8132 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8133 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8134 "anchored %s%s at %"IVdf" ",
8135 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8136 (IV)r->anchored_offset);
ab3bbdeb
YO
8137 } else if (r->anchored_utf8) {
8138 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8139 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8140 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8141 "anchored utf8 %s%s at %"IVdf" ",
8142 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8143 (IV)r->anchored_offset);
ab3bbdeb
YO
8144 }
8145 if (r->float_substr) {
8146 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8147 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8148 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8149 "floating %s%s at %"IVdf"..%"UVuf" ",
8150 s, RE_SV_TAIL(r->float_substr),
7b0972df 8151 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8152 } else if (r->float_utf8) {
8153 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8154 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8155 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8156 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8157 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8158 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8159 }
33b8afdf 8160 if (r->check_substr || r->check_utf8)
b81d288d 8161 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8162 (const char *)
8163 (r->check_substr == r->float_substr
8164 && r->check_utf8 == r->float_utf8
8165 ? "(checking floating" : "(checking anchored"));
bbe252da 8166 if (r->extflags & RXf_NOSCAN)
c277df42 8167 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8168 if (r->extflags & RXf_CHECK_ALL)
c277df42 8169 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8170 if (r->check_substr || r->check_utf8)
c277df42
IZ
8171 PerlIO_printf(Perl_debug_log, ") ");
8172
f8fc2ecf
YO
8173 if (ri->regstclass) {
8174 regprop(r, sv, ri->regstclass);
1de06328 8175 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8176 }
bbe252da 8177 if (r->extflags & RXf_ANCH) {
774d564b 8178 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8179 if (r->extflags & RXf_ANCH_BOL)
774d564b 8180 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8181 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8182 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8183 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8184 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8185 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8186 PerlIO_printf(Perl_debug_log, "(GPOS)");
8187 PerlIO_putc(Perl_debug_log, ' ');
8188 }
bbe252da 8189 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8190 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8191 if (r->intflags & PREGf_SKIP)
760ac839 8192 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8193 if (r->intflags & PREGf_IMPLICIT)
760ac839 8194 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8195 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8196 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8197 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8198 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8199#else
96a5add6 8200 PERL_UNUSED_CONTEXT;
65e66c80 8201 PERL_UNUSED_ARG(r);
17c3b450 8202#endif /* DEBUGGING */
a687059c
LW
8203}
8204
8205/*
a0d0e21e
LW
8206- regprop - printable representation of opcode
8207*/
46fc3d4c 8208void
32fc9b6a 8209Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8210{
35ff7856 8211#ifdef DEBUGGING
97aff369 8212 dVAR;
9b155405 8213 register int k;
f8fc2ecf 8214 RXi_GET_DECL(prog,progi);
1de06328 8215 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8216
a0d0e21e 8217
54dc92de 8218 sv_setpvn(sv, "", 0);
8aa23a47 8219
03363afd 8220 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8221 /* It would be nice to FAIL() here, but this may be called from
8222 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8223 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8224 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8225
3dab1dad 8226 k = PL_regkind[OP(o)];
9b155405 8227
2a782b5b 8228 if (k == EXACT) {
396482e1 8229 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8230 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8231 * is a crude hack but it may be the best for now since
8232 * we have no flag "this EXACTish node was UTF-8"
8233 * --jhi */
8234 const char * const s =
ddc5bc0f 8235 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8236 PL_colors[0], PL_colors[1],
8237 PERL_PV_ESCAPE_UNI_DETECT |
8238 PERL_PV_PRETTY_ELIPSES |
8239 PERL_PV_PRETTY_LTGT
8240 );
8241 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8242 } else if (k == TRIE) {
3dab1dad 8243 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8244 * progi->data isn't available here */
1de06328 8245 const char op = OP(o);
647f639f 8246 const U32 n = ARG(o);
1de06328 8247 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8248 (reg_ac_data *)progi->data->data[n] :
1de06328 8249 NULL;
3251b653
NC
8250 const reg_trie_data * const trie
8251 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8252
8253 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8254 DEBUG_TRIE_COMPILE_r(
8255 Perl_sv_catpvf(aTHX_ sv,
8256 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8257 (UV)trie->startstate,
1e2e3d02 8258 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8259 (UV)trie->wordcount,
8260 (UV)trie->minlen,
8261 (UV)trie->maxlen,
8262 (UV)TRIE_CHARCOUNT(trie),
8263 (UV)trie->uniquecharcount
8264 )
8265 );
8266 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8267 int i;
8268 int rangestart = -1;
f46cb337 8269 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8270 Perl_sv_catpvf(aTHX_ sv, "[");
8271 for (i = 0; i <= 256; i++) {
8272 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8273 if (rangestart == -1)
8274 rangestart = i;
8275 } else if (rangestart != -1) {
8276 if (i <= rangestart + 3)
8277 for (; rangestart < i; rangestart++)
8278 put_byte(sv, rangestart);
8279 else {
8280 put_byte(sv, rangestart);
8281 sv_catpvs(sv, "-");
8282 put_byte(sv, i - 1);
8283 }
8284 rangestart = -1;
8285 }
8286 }
8287 Perl_sv_catpvf(aTHX_ sv, "]");
8288 }
8289
a3621e74 8290 } else if (k == CURLY) {
cb434fcc 8291 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8292 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8293 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8294 }
2c2d71f5
JH
8295 else if (k == WHILEM && o->flags) /* Ordinal/of */
8296 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8297 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8298 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1f1031fe 8299 if ( prog->paren_names ) {
ee9b8eae
YO
8300 if ( k != REF || OP(o) < NREF) {
8301 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8302 SV **name= av_fetch(list, ARG(o), 0 );
8303 if (name)
8304 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8305 }
8306 else {
8307 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8308 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8309 I32 *nums=(I32*)SvPVX(sv_dat);
8310 SV **name= av_fetch(list, nums[0], 0 );
8311 I32 n;
8312 if (name) {
8313 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8314 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8315 (n ? "," : ""), (IV)nums[n]);
8316 }
8317 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 8318 }
1f1031fe 8319 }
ee9b8eae 8320 }
1f1031fe 8321 } else if (k == GOSUB)
6bda09f9 8322 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8323 else if (k == VERB) {
8324 if (!o->flags)
8325 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8326 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8327 } else if (k == LOGICAL)
04ebc1ab 8328 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8329 else if (k == ANYOF) {
8330 int i, rangestart = -1;
2d03de9c 8331 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8332
8333 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8334 static const char * const anyofs[] = {
653099ff
GS
8335 "\\w",
8336 "\\W",
8337 "\\s",
8338 "\\S",
8339 "\\d",
8340 "\\D",
8341 "[:alnum:]",
8342 "[:^alnum:]",
8343 "[:alpha:]",
8344 "[:^alpha:]",
8345 "[:ascii:]",
8346 "[:^ascii:]",
8347 "[:ctrl:]",
8348 "[:^ctrl:]",
8349 "[:graph:]",
8350 "[:^graph:]",
8351 "[:lower:]",
8352 "[:^lower:]",
8353 "[:print:]",
8354 "[:^print:]",
8355 "[:punct:]",
8356 "[:^punct:]",
8357 "[:upper:]",
aaa51d5e 8358 "[:^upper:]",
653099ff 8359 "[:xdigit:]",
aaa51d5e
JF
8360 "[:^xdigit:]",
8361 "[:space:]",
8362 "[:^space:]",
8363 "[:blank:]",
8364 "[:^blank:]"
653099ff
GS
8365 };
8366
19860706 8367 if (flags & ANYOF_LOCALE)
396482e1 8368 sv_catpvs(sv, "{loc}");
19860706 8369 if (flags & ANYOF_FOLD)
396482e1 8370 sv_catpvs(sv, "{i}");
653099ff 8371 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8372 if (flags & ANYOF_INVERT)
396482e1 8373 sv_catpvs(sv, "^");
ffc61ed2
JH
8374 for (i = 0; i <= 256; i++) {
8375 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8376 if (rangestart == -1)
8377 rangestart = i;
8378 } else if (rangestart != -1) {
8379 if (i <= rangestart + 3)
8380 for (; rangestart < i; rangestart++)
653099ff 8381 put_byte(sv, rangestart);
ffc61ed2
JH
8382 else {
8383 put_byte(sv, rangestart);
396482e1 8384 sv_catpvs(sv, "-");
ffc61ed2 8385 put_byte(sv, i - 1);
653099ff 8386 }
ffc61ed2 8387 rangestart = -1;
653099ff 8388 }
847a199f 8389 }
ffc61ed2
JH
8390
8391 if (o->flags & ANYOF_CLASS)
bb7a0f54 8392 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8393 if (ANYOF_CLASS_TEST(o,i))
8394 sv_catpv(sv, anyofs[i]);
8395
8396 if (flags & ANYOF_UNICODE)
396482e1 8397 sv_catpvs(sv, "{unicode}");
1aa99e6b 8398 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8399 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8400
8401 {
8402 SV *lv;
32fc9b6a 8403 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8404
ffc61ed2
JH
8405 if (lv) {
8406 if (sw) {
89ebb4a3 8407 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8408
ffc61ed2 8409 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8410 uvchr_to_utf8(s, i);
ffc61ed2 8411
3568d838 8412 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8413 if (rangestart == -1)
8414 rangestart = i;
8415 } else if (rangestart != -1) {
ffc61ed2
JH
8416 if (i <= rangestart + 3)
8417 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8418 const U8 * const e = uvchr_to_utf8(s,rangestart);
8419 U8 *p;
8420 for(p = s; p < e; p++)
ffc61ed2
JH
8421 put_byte(sv, *p);
8422 }
8423 else {
2d03de9c
AL
8424 const U8 *e = uvchr_to_utf8(s,rangestart);
8425 U8 *p;
8426 for (p = s; p < e; p++)
ffc61ed2 8427 put_byte(sv, *p);
396482e1 8428 sv_catpvs(sv, "-");
2d03de9c
AL
8429 e = uvchr_to_utf8(s, i-1);
8430 for (p = s; p < e; p++)
1df70142 8431 put_byte(sv, *p);
ffc61ed2
JH
8432 }
8433 rangestart = -1;
8434 }
19860706 8435 }
ffc61ed2 8436
396482e1 8437 sv_catpvs(sv, "..."); /* et cetera */
19860706 8438 }
fde631ed 8439
ffc61ed2 8440 {
2e0de35c 8441 char *s = savesvpv(lv);
c445ea15 8442 char * const origs = s;
b81d288d 8443
3dab1dad
YO
8444 while (*s && *s != '\n')
8445 s++;
b81d288d 8446
ffc61ed2 8447 if (*s == '\n') {
2d03de9c 8448 const char * const t = ++s;
ffc61ed2
JH
8449
8450 while (*s) {
8451 if (*s == '\n')
8452 *s = ' ';
8453 s++;
8454 }
8455 if (s[-1] == ' ')
8456 s[-1] = 0;
8457
8458 sv_catpv(sv, t);
fde631ed 8459 }
b81d288d 8460
ffc61ed2 8461 Safefree(origs);
fde631ed
JH
8462 }
8463 }
653099ff 8464 }
ffc61ed2 8465
653099ff
GS
8466 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8467 }
9b155405 8468 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8469 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8470#else
96a5add6 8471 PERL_UNUSED_CONTEXT;
65e66c80
SP
8472 PERL_UNUSED_ARG(sv);
8473 PERL_UNUSED_ARG(o);
f9049ba1 8474 PERL_UNUSED_ARG(prog);
17c3b450 8475#endif /* DEBUGGING */
35ff7856 8476}
a687059c 8477
cad2e5aa
JH
8478SV *
8479Perl_re_intuit_string(pTHX_ regexp *prog)
8480{ /* Assume that RE_INTUIT is set */
97aff369 8481 dVAR;
a3621e74 8482 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8483 PERL_UNUSED_CONTEXT;
8484
a3621e74 8485 DEBUG_COMPILE_r(
cfd0369c 8486 {
2d03de9c 8487 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8488 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8489
8490 if (!PL_colorset) reginitcolors();
8491 PerlIO_printf(Perl_debug_log,
a0288114 8492 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8493 PL_colors[4],
8494 prog->check_substr ? "" : "utf8 ",
8495 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8496 s,
8497 PL_colors[1],
8498 (strlen(s) > 60 ? "..." : ""));
8499 } );
8500
33b8afdf 8501 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8502}
8503
84da74a7 8504/*
f8149455 8505 pregfree()
84da74a7 8506
f8149455
YO
8507 handles refcounting and freeing the perl core regexp structure. When
8508 it is necessary to actually free the structure the first thing it
8509 does is call the 'free' method of the regexp_engine associated to to
8510 the regexp, allowing the handling of the void *pprivate; member
8511 first. (This routine is not overridable by extensions, which is why
8512 the extensions free is called first.)
8513
8514 See regdupe and regdupe_internal if you change anything here.
84da74a7 8515*/
f8149455 8516#ifndef PERL_IN_XSUB_RE
2b69d0c2 8517void
864dbfa3 8518Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8519{
27da23d5 8520 dVAR;
fc32ee4a 8521 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8522
7821416a
IZ
8523 if (!r || (--r->refcnt > 0))
8524 return;
f8149455
YO
8525
8526 CALLREGFREE_PVT(r); /* free the private data */
8527
43c5f42d
NC
8528 /* gcov results gave these as non-null 100% of the time, so there's no
8529 optimisation in checking them before calling Safefree */
8530 Safefree(r->precomp);
ed252734 8531 RX_MATCH_COPY_FREE(r);
f8c7b90f 8532#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8533 if (r->saved_copy)
8534 SvREFCNT_dec(r->saved_copy);
8535#endif
a193d654
GS
8536 if (r->substrs) {
8537 if (r->anchored_substr)
8538 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8539 if (r->anchored_utf8)
8540 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8541 if (r->float_substr)
8542 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8543 if (r->float_utf8)
8544 SvREFCNT_dec(r->float_utf8);
2779dcf1 8545 Safefree(r->substrs);
a193d654 8546 }
81714fb9
YO
8547 if (r->paren_names)
8548 SvREFCNT_dec(r->paren_names);
f8149455
YO
8549
8550 Safefree(r->startp);
8551 Safefree(r->endp);
8552 Safefree(r);
8553}
8554#endif
8555
8556/* regfree_internal()
8557
8558 Free the private data in a regexp. This is overloadable by
8559 extensions. Perl takes care of the regexp structure in pregfree(),
8560 this covers the *pprivate pointer which technically perldoesnt
8561 know about, however of course we have to handle the
8562 regexp_internal structure when no extension is in use.
8563
8564 Note this is called before freeing anything in the regexp
8565 structure.
8566 */
8567
8568void
8569Perl_regfree_internal(pTHX_ struct regexp *r)
8570{
8571 dVAR;
8572 RXi_GET_DECL(r,ri);
8573 GET_RE_DEBUG_FLAGS_DECL;
8574
8575 DEBUG_COMPILE_r({
8576 if (!PL_colorset)
8577 reginitcolors();
8578 {
8579 SV *dsv= sv_newmortal();
8580 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8581 dsv, r->precomp, r->prelen, 60);
8582 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8583 PL_colors[4],PL_colors[5],s);
8584 }
8585 });
7122b237
YO
8586#ifdef RE_TRACK_PATTERN_OFFSETS
8587 if (ri->u.offsets)
8588 Safefree(ri->u.offsets); /* 20010421 MJD */
8589#endif
f8fc2ecf
YO
8590 if (ri->data) {
8591 int n = ri->data->count;
f3548bdc
DM
8592 PAD* new_comppad = NULL;
8593 PAD* old_comppad;
4026c95a 8594 PADOFFSET refcnt;
dfad63ad 8595
c277df42 8596 while (--n >= 0) {
261faec3 8597 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8598 switch (ri->data->what[n]) {
c277df42 8599 case 's':
81714fb9 8600 case 'S':
55eed653 8601 case 'u':
f8fc2ecf 8602 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8603 break;
653099ff 8604 case 'f':
f8fc2ecf 8605 Safefree(ri->data->data[n]);
653099ff 8606 break;
dfad63ad 8607 case 'p':
f8fc2ecf 8608 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8609 break;
c277df42 8610 case 'o':
dfad63ad 8611 if (new_comppad == NULL)
cea2e8a9 8612 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8613 PAD_SAVE_LOCAL(old_comppad,
8614 /* Watch out for global destruction's random ordering. */
c445ea15 8615 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8616 );
b34c0dd4 8617 OP_REFCNT_LOCK;
f8fc2ecf 8618 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8619 OP_REFCNT_UNLOCK;
8620 if (!refcnt)
f8fc2ecf 8621 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8622
f3548bdc 8623 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8624 SvREFCNT_dec((SV*)new_comppad);
8625 new_comppad = NULL;
c277df42
IZ
8626 break;
8627 case 'n':
9e55ce06 8628 break;
07be1b83 8629 case 'T':
be8e71aa
YO
8630 { /* Aho Corasick add-on structure for a trie node.
8631 Used in stclass optimization only */
07be1b83 8632 U32 refcount;
f8fc2ecf 8633 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8634 OP_REFCNT_LOCK;
8635 refcount = --aho->refcount;
8636 OP_REFCNT_UNLOCK;
8637 if ( !refcount ) {
446bd890
NC
8638 PerlMemShared_free(aho->states);
8639 PerlMemShared_free(aho->fail);
446bd890
NC
8640 /* do this last!!!! */
8641 PerlMemShared_free(ri->data->data[n]);
8642 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8643 }
8644 }
8645 break;
a3621e74 8646 case 't':
07be1b83 8647 {
be8e71aa 8648 /* trie structure. */
07be1b83 8649 U32 refcount;
f8fc2ecf 8650 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8651 OP_REFCNT_LOCK;
8652 refcount = --trie->refcount;
8653 OP_REFCNT_UNLOCK;
8654 if ( !refcount ) {
446bd890 8655 PerlMemShared_free(trie->charmap);
446bd890
NC
8656 PerlMemShared_free(trie->states);
8657 PerlMemShared_free(trie->trans);
07be1b83 8658 if (trie->bitmap)
446bd890 8659 PerlMemShared_free(trie->bitmap);
07be1b83 8660 if (trie->wordlen)
446bd890 8661 PerlMemShared_free(trie->wordlen);
786e8c11 8662 if (trie->jump)
446bd890 8663 PerlMemShared_free(trie->jump);
786e8c11 8664 if (trie->nextword)
446bd890 8665 PerlMemShared_free(trie->nextword);
446bd890
NC
8666 /* do this last!!!! */
8667 PerlMemShared_free(ri->data->data[n]);
a3621e74 8668 }
07be1b83
YO
8669 }
8670 break;
c277df42 8671 default:
f8fc2ecf 8672 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8673 }
8674 }
f8fc2ecf
YO
8675 Safefree(ri->data->what);
8676 Safefree(ri->data);
a0d0e21e 8677 }
f8fc2ecf
YO
8678 if (ri->swap) {
8679 Safefree(ri->swap->startp);
8680 Safefree(ri->swap->endp);
8681 Safefree(ri->swap);
c74340f9 8682 }
f8fc2ecf 8683 Safefree(ri);
a687059c 8684}
c277df42 8685
84da74a7
YO
8686#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8687#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8688#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8689#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8690
8691/*
8692 regdupe - duplicate a regexp.
8693
8694 This routine is called by sv.c's re_dup and is expected to clone a
8695 given regexp structure. It is a no-op when not under USE_ITHREADS.
8696 (Originally this *was* re_dup() for change history see sv.c)
8697
f8149455
YO
8698 After all of the core data stored in struct regexp is duplicated
8699 the regexp_engine.dupe method is used to copy any private data
8700 stored in the *pprivate pointer. This allows extensions to handle
8701 any duplication it needs to do.
8702
8703 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8704*/
a3c0e9ca 8705#if defined(USE_ITHREADS)
f8149455 8706#ifndef PERL_IN_XSUB_RE
84da74a7 8707regexp *
f8149455 8708Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8709{
84da74a7 8710 dVAR;
f8fc2ecf 8711 regexp *ret;
f8149455 8712 int i, npar;
84da74a7 8713 struct reg_substr_datum *s;
644c02aa 8714
84da74a7
YO
8715 if (!r)
8716 return (REGEXP *)NULL;
8717
8718 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8719 return ret;
8720
f8149455 8721
84da74a7 8722 npar = r->nparens+1;
f8fc2ecf 8723 Newxz(ret, 1, regexp);
84da74a7
YO
8724 Newx(ret->startp, npar, I32);
8725 Copy(r->startp, ret->startp, npar, I32);
8726 Newx(ret->endp, npar, I32);
f8149455 8727 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8728
c945c181 8729 if (r->substrs) {
785a26d5
YO
8730 Newx(ret->substrs, 1, struct reg_substr_data);
8731 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8732 s->min_offset = r->substrs->data[i].min_offset;
8733 s->max_offset = r->substrs->data[i].max_offset;
8734 s->end_shift = r->substrs->data[i].end_shift;
8735 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8736 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8737 }
8738 } else
8739 ret->substrs = NULL;
f8149455
YO
8740
8741 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8742 ret->refcnt = r->refcnt;
8743 ret->minlen = r->minlen;
8744 ret->minlenret = r->minlenret;
8745 ret->prelen = r->prelen;
8746 ret->nparens = r->nparens;
8747 ret->lastparen = r->lastparen;
8748 ret->lastcloseparen = r->lastcloseparen;
8749 ret->intflags = r->intflags;
8750 ret->extflags = r->extflags;
8751
8752 ret->sublen = r->sublen;
8753
8754 ret->engine = r->engine;
8755
8756 ret->paren_names = hv_dup_inc(r->paren_names, param);
8757
8758 if (RX_MATCH_COPIED(ret))
8759 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8760 else
8761 ret->subbeg = NULL;
8762#ifdef PERL_OLD_COPY_ON_WRITE
8763 ret->saved_copy = NULL;
8764#endif
8765
8766 ret->pprivate = r->pprivate;
785a26d5
YO
8767 if (ret->pprivate)
8768 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8769
8770 ptr_table_store(PL_ptr_table, r, ret);
8771 return ret;
8772}
8773#endif /* PERL_IN_XSUB_RE */
8774
8775/*
8776 regdupe_internal()
8777
8778 This is the internal complement to regdupe() which is used to copy
8779 the structure pointed to by the *pprivate pointer in the regexp.
8780 This is the core version of the extension overridable cloning hook.
8781 The regexp structure being duplicated will be copied by perl prior
8782 to this and will be provided as the regexp *r argument, however
8783 with the /old/ structures pprivate pointer value. Thus this routine
8784 may override any copying normally done by perl.
8785
8786 It returns a pointer to the new regexp_internal structure.
8787*/
8788
8789void *
8790Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8791{
8792 dVAR;
8793 regexp_internal *reti;
8794 int len, npar;
8795 RXi_GET_DECL(r,ri);
8796
8797 npar = r->nparens+1;
7122b237 8798 len = ProgLen(ri);
f8149455
YO
8799
8800 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8801 Copy(ri->program, reti->program, len+1, regnode);
8802
8803 if(ri->swap) {
8804 Newx(reti->swap, 1, regexp_paren_ofs);
8805 /* no need to copy these */
8806 Newx(reti->swap->startp, npar, I32);
8807 Newx(reti->swap->endp, npar, I32);
8808 } else {
8809 reti->swap = NULL;
8810 }
8811
84da74a7 8812
f8fc2ecf
YO
8813 reti->regstclass = NULL;
8814 if (ri->data) {
84da74a7 8815 struct reg_data *d;
f8fc2ecf 8816 const int count = ri->data->count;
84da74a7
YO
8817 int i;
8818
8819 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8820 char, struct reg_data);
8821 Newx(d->what, count, U8);
8822
8823 d->count = count;
8824 for (i = 0; i < count; i++) {
f8fc2ecf 8825 d->what[i] = ri->data->what[i];
84da74a7 8826 switch (d->what[i]) {
55eed653 8827 /* legal options are one of: sSfpontTu
84da74a7
YO
8828 see also regcomp.h and pregfree() */
8829 case 's':
81714fb9 8830 case 'S':
0536c0a7 8831 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8832 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8833 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8834 break;
84da74a7
YO
8835 case 'f':
8836 /* This is cheating. */
8837 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8838 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8839 struct regnode_charclass_class);
f8fc2ecf 8840 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8841 break;
8842 case 'o':
bbe252da
YO
8843 /* Compiled op trees are readonly and in shared memory,
8844 and can thus be shared without duplication. */
84da74a7 8845 OP_REFCNT_LOCK;
f8fc2ecf 8846 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8847 OP_REFCNT_UNLOCK;
8848 break;
23eab42c
NC
8849 case 'T':
8850 /* Trie stclasses are readonly and can thus be shared
8851 * without duplication. We free the stclass in pregfree
8852 * when the corresponding reg_ac_data struct is freed.
8853 */
8854 reti->regstclass= ri->regstclass;
8855 /* Fall through */
84da74a7 8856 case 't':
84da74a7 8857 OP_REFCNT_LOCK;
0536c0a7 8858 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8859 OP_REFCNT_UNLOCK;
0536c0a7
NC
8860 /* Fall through */
8861 case 'n':
8862 d->data[i] = ri->data->data[i];
84da74a7 8863 break;
84da74a7 8864 default:
f8fc2ecf 8865 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8866 }
8867 }
8868
f8fc2ecf 8869 reti->data = d;
84da74a7
YO
8870 }
8871 else
f8fc2ecf 8872 reti->data = NULL;
84da74a7 8873
cde0cee5
YO
8874 reti->name_list_idx = ri->name_list_idx;
8875
7122b237
YO
8876#ifdef RE_TRACK_PATTERN_OFFSETS
8877 if (ri->u.offsets) {
8878 Newx(reti->u.offsets, 2*len+1, U32);
8879 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
8880 }
8881#else
8882 SetProgLen(reti,len);
8883#endif
8884
f8149455 8885 return (void*)reti;
84da74a7 8886}
f8149455
YO
8887
8888#endif /* USE_ITHREADS */
84da74a7 8889
de8c5301
YO
8890/*
8891 reg_stringify()
8892
8893 converts a regexp embedded in a MAGIC struct to its stringified form,
8894 caching the converted form in the struct and returns the cached
8895 string.
8896
8897 If lp is nonnull then it is used to return the length of the
8898 resulting string
8899
8900 If flags is nonnull and the returned string contains UTF8 then
f8149455 8901 (*flags & 1) will be true.
de8c5301
YO
8902
8903 If haseval is nonnull then it is used to return whether the pattern
8904 contains evals.
8905
8906 Normally called via macro:
8907
f8149455 8908 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
8909
8910 And internally with
8911
f8149455 8912 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
8913
8914 See sv_2pv_flags() in sv.c for an example of internal usage.
8915
8916 */
f8149455 8917#ifndef PERL_IN_XSUB_RE
de8c5301
YO
8918char *
8919Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8920 dVAR;
8921 const regexp * const re = (regexp *)mg->mg_obj;
f8149455 8922
de8c5301 8923 if (!mg->mg_ptr) {
a20207d7 8924 const char *fptr = STD_PAT_MODS; /*"msix"*/
cde0cee5 8925 char reflags[7];
de8c5301 8926 char ch;
cde0cee5
YO
8927 bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
8928 bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
8929 U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
8930 bool need_newline = 0;
8931 int left = 0;
8932 int right = 4 + hask;
8933 if (hask)
a20207d7 8934 reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/
de8c5301
YO
8935 while((ch = *fptr++)) {
8936 if(reganch & 1) {
8937 reflags[left++] = ch;
8938 }
8939 else {
8940 reflags[right--] = ch;
8941 }
8942 reganch >>= 1;
8943 }
cde0cee5 8944 if(hasm) {
de8c5301 8945 reflags[left] = '-';
cde0cee5 8946 left = 5 + hask;
de8c5301 8947 }
cde0cee5 8948 /* printf("[%*.7s]\n",left,reflags); */
de8c5301
YO
8949 mg->mg_len = re->prelen + 4 + left;
8950 /*
8951 * If /x was used, we have to worry about a regex ending with a
8952 * comment later being embedded within another regex. If so, we don't
8953 * want this regex's "commentization" to leak out to the right part of
8954 * the enclosing regex, we must cap it with a newline.
8955 *
8956 * So, if /x was used, we scan backwards from the end of the regex. If
8957 * we find a '#' before we find a newline, we need to add a newline
8958 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8959 * we don't need to add anything. -jfriedl
8960 */
bbe252da 8961 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8962 const char *endptr = re->precomp + re->prelen;
8963 while (endptr >= re->precomp) {
8964 const char c = *(endptr--);
8965 if (c == '\n')
8966 break; /* don't need another */
8967 if (c == '#') {
8968 /* we end while in a comment, so we need a newline */
8969 mg->mg_len++; /* save space for it */
8970 need_newline = 1; /* note to add it */
8971 break;
8972 }
8973 }
8974 }
8975
8976 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8977 mg->mg_ptr[0] = '(';
8978 mg->mg_ptr[1] = '?';
8979 Copy(reflags, mg->mg_ptr+2, left, char);
8980 *(mg->mg_ptr+left+2) = ':';
8981 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8982 if (need_newline)
8983 mg->mg_ptr[mg->mg_len - 2] = '\n';
8984 mg->mg_ptr[mg->mg_len - 1] = ')';
8985 mg->mg_ptr[mg->mg_len] = 0;
8986 }
8987 if (haseval)
f8149455 8988 *haseval = re->seen_evals;
de8c5301 8989 if (flags)
bbe252da 8990 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8991
8992 if (lp)
8993 *lp = mg->mg_len;
8994 return mg->mg_ptr;
8995}
8996
c277df42
IZ
8997/*
8998 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8999 */
9000regnode *
864dbfa3 9001Perl_regnext(pTHX_ register regnode *p)
c277df42 9002{
97aff369 9003 dVAR;
c277df42
IZ
9004 register I32 offset;
9005
f8fc2ecf 9006 if (!p)
c277df42
IZ
9007 return(NULL);
9008
9009 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9010 if (offset == 0)
9011 return(NULL);
9012
c277df42 9013 return(p+offset);
c277df42 9014}
76234dfb 9015#endif
c277df42 9016
01f988be 9017STATIC void
cea2e8a9 9018S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9019{
9020 va_list args;
9021 STRLEN l1 = strlen(pat1);
9022 STRLEN l2 = strlen(pat2);
9023 char buf[512];
06bf62c7 9024 SV *msv;
73d840c0 9025 const char *message;
c277df42
IZ
9026
9027 if (l1 > 510)
9028 l1 = 510;
9029 if (l1 + l2 > 510)
9030 l2 = 510 - l1;
9031 Copy(pat1, buf, l1 , char);
9032 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9033 buf[l1 + l2] = '\n';
9034 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9035#ifdef I_STDARG
9036 /* ANSI variant takes additional second argument */
c277df42 9037 va_start(args, pat2);
8736538c
AS
9038#else
9039 va_start(args);
9040#endif
5a844595 9041 msv = vmess(buf, &args);
c277df42 9042 va_end(args);
cfd0369c 9043 message = SvPV_const(msv,l1);
c277df42
IZ
9044 if (l1 > 512)
9045 l1 = 512;
9046 Copy(message, buf, l1 , char);
197cf9b9 9047 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9048 Perl_croak(aTHX_ "%s", buf);
c277df42 9049}
a0ed51b3
LW
9050
9051/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9052
76234dfb 9053#ifndef PERL_IN_XSUB_RE
a0ed51b3 9054void
864dbfa3 9055Perl_save_re_context(pTHX)
b81d288d 9056{
97aff369 9057 dVAR;
1ade1aa1
NC
9058
9059 struct re_save_state *state;
9060
9061 SAVEVPTR(PL_curcop);
9062 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9063
9064 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9065 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9066 SSPUSHINT(SAVEt_RE_STATE);
9067
46ab3289 9068 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9069
a0ed51b3 9070 PL_reg_start_tmp = 0;
a0ed51b3 9071 PL_reg_start_tmpl = 0;
c445ea15 9072 PL_reg_oldsaved = NULL;
a5db57d6 9073 PL_reg_oldsavedlen = 0;
a5db57d6 9074 PL_reg_maxiter = 0;
a5db57d6 9075 PL_reg_leftiter = 0;
c445ea15 9076 PL_reg_poscache = NULL;
a5db57d6 9077 PL_reg_poscache_size = 0;
1ade1aa1
NC
9078#ifdef PERL_OLD_COPY_ON_WRITE
9079 PL_nrs = NULL;
9080#endif
ada6e8a9 9081
c445ea15
AL
9082 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9083 if (PL_curpm) {
9084 const REGEXP * const rx = PM_GETRE(PL_curpm);
9085 if (rx) {
1df70142 9086 U32 i;
ada6e8a9 9087 for (i = 1; i <= rx->nparens; i++) {
1df70142 9088 char digits[TYPE_CHARS(long)];
d9fad198 9089 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9090 GV *const *const gvp
9091 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9092
b37c2d43
AL
9093 if (gvp) {
9094 GV * const gv = *gvp;
9095 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9096 save_scalar(gv);
49f27e4b 9097 }
ada6e8a9
AMS
9098 }
9099 }
9100 }
a0ed51b3 9101}
76234dfb 9102#endif
51371543 9103
51371543 9104static void
acfe0abc 9105clear_re(pTHX_ void *r)
51371543 9106{
97aff369 9107 dVAR;
51371543
GS
9108 ReREFCNT_dec((regexp *)r);
9109}
ffbc6a93 9110
a28509cc
AL
9111#ifdef DEBUGGING
9112
9113STATIC void
9114S_put_byte(pTHX_ SV *sv, int c)
9115{
9116 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9117 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9118 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9119 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9120 else
9121 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9122}
9123
786e8c11 9124
3dab1dad
YO
9125#define CLEAR_OPTSTART \
9126 if (optstart) STMT_START { \
70685ca0 9127 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9128 optstart=NULL; \
9129 } STMT_END
9130
786e8c11 9131#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9132
b5a2f8d8
NC
9133STATIC const regnode *
9134S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9135 const regnode *last, const regnode *plast,
9136 SV* sv, I32 indent, U32 depth)
a28509cc 9137{
97aff369 9138 dVAR;
786e8c11 9139 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9140 register const regnode *next;
3dab1dad 9141 const regnode *optstart= NULL;
1f1031fe 9142
f8fc2ecf 9143 RXi_GET_DECL(r,ri);
3dab1dad 9144 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9145
786e8c11
YO
9146#ifdef DEBUG_DUMPUNTIL
9147 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9148 last ? last-start : 0,plast ? plast-start : 0);
9149#endif
9150
9151 if (plast && plast < last)
9152 last= plast;
9153
9154 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9155 /* While that wasn't END last time... */
a28509cc
AL
9156 NODE_ALIGN(node);
9157 op = OP(node);
de734bd5 9158 if (op == CLOSE || op == WHILEM)
786e8c11 9159 indent--;
b5a2f8d8 9160 next = regnext((regnode *)node);
1f1031fe 9161
a28509cc 9162 /* Where, what. */
8e11feef 9163 if (OP(node) == OPTIMIZED) {
e68ec53f 9164 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9165 optstart = node;
3dab1dad 9166 else
8e11feef 9167 goto after_print;
3dab1dad
YO
9168 } else
9169 CLEAR_OPTSTART;
1f1031fe 9170
32fc9b6a 9171 regprop(r, sv, node);
a28509cc 9172 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9173 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9174
9175 if (OP(node) != OPTIMIZED) {
9176 if (next == NULL) /* Next ptr. */
9177 PerlIO_printf(Perl_debug_log, " (0)");
9178 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9179 PerlIO_printf(Perl_debug_log, " (FAIL)");
9180 else
9181 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9182 (void)PerlIO_putc(Perl_debug_log, '\n');
9183 }
9184
a28509cc
AL
9185 after_print:
9186 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9187 assert(next);
9188 {
9189 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9190 ? regnext((regnode *)next)
9191 : next);
be8e71aa
YO
9192 if (last && nnode > last)
9193 nnode = last;
786e8c11 9194 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9195 }
a28509cc
AL
9196 }
9197 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9198 assert(next);
786e8c11 9199 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9200 }
9201 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9202 const regnode *this_trie = node;
1de06328 9203 const char op = OP(node);
647f639f 9204 const U32 n = ARG(node);
1de06328 9205 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9206 (reg_ac_data *)ri->data->data[n] :
1de06328 9207 NULL;
3251b653
NC
9208 const reg_trie_data * const trie =
9209 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9210#ifdef DEBUGGING
9211 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9212#endif
786e8c11 9213 const regnode *nextbranch= NULL;
a28509cc 9214 I32 word_idx;
1de06328 9215 sv_setpvn(sv, "", 0);
786e8c11 9216 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9217 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9218
9219 PerlIO_printf(Perl_debug_log, "%*s%s ",
9220 (int)(2*(indent+3)), "",
9221 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9222 PL_colors[0], PL_colors[1],
9223 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9224 PERL_PV_PRETTY_ELIPSES |
7f69552c 9225 PERL_PV_PRETTY_LTGT
786e8c11
YO
9226 )
9227 : "???"
9228 );
9229 if (trie->jump) {
40d049e4 9230 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9231 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9232 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9233 if (dist) {
9234 if (!nextbranch)
24b23f37 9235 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9236 DUMPUNTIL(this_trie + dist, nextbranch);
9237 }
786e8c11
YO
9238 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9239 nextbranch= regnext((regnode *)nextbranch);
9240 } else {
9241 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9242 }
786e8c11
YO
9243 }
9244 if (last && next > last)
9245 node= last;
9246 else
9247 node= next;
a28509cc 9248 }
786e8c11
YO
9249 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9250 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9251 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9252 }
9253 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9254 assert(next);
786e8c11 9255 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9256 }
9257 else if ( op == PLUS || op == STAR) {
786e8c11 9258 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9259 }
9260 else if (op == ANYOF) {
9261 /* arglen 1 + class block */
9262 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9263 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9264 node = NEXTOPER(node);
9265 }
9266 else if (PL_regkind[(U8)op] == EXACT) {
9267 /* Literal string, where present. */
9268 node += NODE_SZ_STR(node) - 1;
9269 node = NEXTOPER(node);
9270 }
9271 else {
9272 node = NEXTOPER(node);
9273 node += regarglen[(U8)op];
9274 }
9275 if (op == CURLYX || op == OPEN)
786e8c11 9276 indent++;
a28509cc 9277 }
3dab1dad 9278 CLEAR_OPTSTART;
786e8c11 9279#ifdef DEBUG_DUMPUNTIL
70685ca0 9280 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9281#endif
1de06328 9282 return node;
a28509cc
AL
9283}
9284
9285#endif /* DEBUGGING */
9286
241d1a3b
NC
9287/*
9288 * Local variables:
9289 * c-indentation-style: bsd
9290 * c-basic-offset: 4
9291 * indent-tabs-mode: t
9292 * End:
9293 *
37442d52
RGS
9294 * ex: set ts=8 sts=4 sw=4 noet:
9295 */