This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: run/cloexec.t is failing
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
154#ifdef RE_TRACK_PATTERN_OFFSETS
155#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
156#endif
830247a4 157#define RExC_emit (pRExC_state->emit)
fac92740 158#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
159#define RExC_naughty (pRExC_state->naughty)
160#define RExC_sawback (pRExC_state->sawback)
161#define RExC_seen (pRExC_state->seen)
162#define RExC_size (pRExC_state->size)
163#define RExC_npar (pRExC_state->npar)
e2e6a0f1 164#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
165#define RExC_extralen (pRExC_state->extralen)
166#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
167#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 168#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 169#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
170#define RExC_open_parens (pRExC_state->open_parens)
171#define RExC_close_parens (pRExC_state->close_parens)
172#define RExC_opend (pRExC_state->opend)
81714fb9 173#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
174#define RExC_recurse (pRExC_state->recurse)
175#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 176
cde0cee5 177
a687059c
LW
178#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
179#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
180 ((*s) == '{' && regcurly(s)))
a687059c 181
35c8bce7
LW
182#ifdef SPSTART
183#undef SPSTART /* dratted cpp namespace... */
184#endif
a687059c
LW
185/*
186 * Flags to be passed up and down.
187 */
a687059c 188#define WORST 0 /* Worst case. */
821b33a5 189#define HASWIDTH 0x1 /* Known to match non-null strings. */
a0d0e21e
LW
190#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
191#define SPSTART 0x4 /* Starts with * or +. */
192#define TRYAGAIN 0x8 /* Weeded out a declaration. */
a687059c 193
3dab1dad
YO
194#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
195
07be1b83
YO
196/* whether trie related optimizations are enabled */
197#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
198#define TRIE_STUDY_OPT
786e8c11 199#define FULL_TRIE_STUDY
07be1b83
YO
200#define TRIE_STCLASS
201#endif
1de06328
YO
202
203
40d049e4
YO
204
205#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
206#define PBITVAL(paren) (1 << ((paren) & 7))
207#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
208#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
209#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
210
211
1de06328
YO
212/* About scan_data_t.
213
214 During optimisation we recurse through the regexp program performing
215 various inplace (keyhole style) optimisations. In addition study_chunk
216 and scan_commit populate this data structure with information about
217 what strings MUST appear in the pattern. We look for the longest
218 string that must appear for at a fixed location, and we look for the
219 longest string that may appear at a floating location. So for instance
220 in the pattern:
221
222 /FOO[xX]A.*B[xX]BAR/
223
224 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
225 strings (because they follow a .* construct). study_chunk will identify
226 both FOO and BAR as being the longest fixed and floating strings respectively.
227
228 The strings can be composites, for instance
229
230 /(f)(o)(o)/
231
232 will result in a composite fixed substring 'foo'.
233
234 For each string some basic information is maintained:
235
236 - offset or min_offset
237 This is the position the string must appear at, or not before.
238 It also implicitly (when combined with minlenp) tells us how many
239 character must match before the string we are searching.
240 Likewise when combined with minlenp and the length of the string
241 tells us how many characters must appear after the string we have
242 found.
243
244 - max_offset
245 Only used for floating strings. This is the rightmost point that
246 the string can appear at. Ifset to I32 max it indicates that the
247 string can occur infinitely far to the right.
248
249 - minlenp
250 A pointer to the minimum length of the pattern that the string
251 was found inside. This is important as in the case of positive
252 lookahead or positive lookbehind we can have multiple patterns
253 involved. Consider
254
255 /(?=FOO).*F/
256
257 The minimum length of the pattern overall is 3, the minimum length
258 of the lookahead part is 3, but the minimum length of the part that
259 will actually match is 1. So 'FOO's minimum length is 3, but the
260 minimum length for the F is 1. This is important as the minimum length
261 is used to determine offsets in front of and behind the string being
262 looked for. Since strings can be composites this is the length of the
263 pattern at the time it was commited with a scan_commit. Note that
264 the length is calculated by study_chunk, so that the minimum lengths
265 are not known until the full pattern has been compiled, thus the
266 pointer to the value.
267
268 - lookbehind
269
270 In the case of lookbehind the string being searched for can be
271 offset past the start point of the final matching string.
272 If this value was just blithely removed from the min_offset it would
273 invalidate some of the calculations for how many chars must match
274 before or after (as they are derived from min_offset and minlen and
275 the length of the string being searched for).
276 When the final pattern is compiled and the data is moved from the
277 scan_data_t structure into the regexp structure the information
278 about lookbehind is factored in, with the information that would
279 have been lost precalculated in the end_shift field for the
280 associated string.
281
282 The fields pos_min and pos_delta are used to store the minimum offset
283 and the delta to the maximum offset at the current point in the pattern.
284
285*/
2c2d71f5
JH
286
287typedef struct scan_data_t {
1de06328
YO
288 /*I32 len_min; unused */
289 /*I32 len_delta; unused */
2c2d71f5
JH
290 I32 pos_min;
291 I32 pos_delta;
292 SV *last_found;
1de06328 293 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
294 I32 last_start_min;
295 I32 last_start_max;
1de06328
YO
296 SV **longest; /* Either &l_fixed, or &l_float. */
297 SV *longest_fixed; /* longest fixed string found in pattern */
298 I32 offset_fixed; /* offset where it starts */
299 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
300 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
301 SV *longest_float; /* longest floating string found in pattern */
302 I32 offset_float_min; /* earliest point in string it can appear */
303 I32 offset_float_max; /* latest point in string it can appear */
304 I32 *minlen_float; /* pointer to the minlen relevent to the string */
305 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
306 I32 flags;
307 I32 whilem_c;
cb434fcc 308 I32 *last_closep;
653099ff 309 struct regnode_charclass_class *start_class;
2c2d71f5
JH
310} scan_data_t;
311
a687059c 312/*
e50aee73 313 * Forward declarations for pregcomp()'s friends.
a687059c 314 */
a0d0e21e 315
27da23d5 316static const scan_data_t zero_scan_data =
1de06328 317 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
318
319#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
320#define SF_BEFORE_SEOL 0x0001
321#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
322#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
323#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
324
09b7f37c
CB
325#ifdef NO_UNARY_PLUS
326# define SF_FIX_SHIFT_EOL (0+2)
327# define SF_FL_SHIFT_EOL (0+4)
328#else
329# define SF_FIX_SHIFT_EOL (+2)
330# define SF_FL_SHIFT_EOL (+4)
331#endif
c277df42
IZ
332
333#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
334#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
335
336#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
337#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
338#define SF_IS_INF 0x0040
339#define SF_HAS_PAR 0x0080
340#define SF_IN_PAR 0x0100
341#define SF_HAS_EVAL 0x0200
342#define SCF_DO_SUBSTR 0x0400
653099ff
GS
343#define SCF_DO_STCLASS_AND 0x0800
344#define SCF_DO_STCLASS_OR 0x1000
345#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 346#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 347
786e8c11 348#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 349#define SCF_SEEN_ACCEPT 0x8000
07be1b83 350
eb160463 351#define UTF (RExC_utf8 != 0)
bbe252da
YO
352#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
353#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 354
ffc61ed2 355#define OOB_UNICODE 12345678
93733859 356#define OOB_NAMEDCLASS -1
b8c5462f 357
a0ed51b3
LW
358#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
359#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
360
8615cb43 361
b45f050a
JF
362/* length of regex to show in messages that don't mark a position within */
363#define RegexLengthToShowInErrorMessages 127
364
365/*
366 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
367 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
368 * op/pragma/warn/regcomp.
369 */
7253e4e3
RK
370#define MARKER1 "<-- HERE" /* marker as it appears in the description */
371#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 372
7253e4e3 373#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
374
375/*
376 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
377 * arg. Show regex, up to a maximum length. If it's too long, chop and add
378 * "...".
379 */
58e23c8d 380#define _FAIL(code) STMT_START { \
bfed75c6 381 const char *ellipses = ""; \
ccb2c380
MP
382 IV len = RExC_end - RExC_precomp; \
383 \
384 if (!SIZE_ONLY) \
385 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
386 if (len > RegexLengthToShowInErrorMessages) { \
387 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
388 len = RegexLengthToShowInErrorMessages - 10; \
389 ellipses = "..."; \
390 } \
58e23c8d 391 code; \
ccb2c380 392} STMT_END
8615cb43 393
58e23c8d
YO
394#define FAIL(msg) _FAIL( \
395 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
396 msg, (int)len, RExC_precomp, ellipses))
397
398#define FAIL2(msg,arg) _FAIL( \
399 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
400 arg, (int)len, RExC_precomp, ellipses))
401
b45f050a 402/*
b45f050a
JF
403 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
404 */
ccb2c380 405#define Simple_vFAIL(m) STMT_START { \
a28509cc 406 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
407 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
408 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
409} STMT_END
b45f050a
JF
410
411/*
412 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
413 */
ccb2c380
MP
414#define vFAIL(m) STMT_START { \
415 if (!SIZE_ONLY) \
416 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
417 Simple_vFAIL(m); \
418} STMT_END
b45f050a
JF
419
420/*
421 * Like Simple_vFAIL(), but accepts two arguments.
422 */
ccb2c380 423#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 424 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
425 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
426 (int)offset, RExC_precomp, RExC_precomp + offset); \
427} STMT_END
b45f050a
JF
428
429/*
430 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
431 */
ccb2c380
MP
432#define vFAIL2(m,a1) STMT_START { \
433 if (!SIZE_ONLY) \
434 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
435 Simple_vFAIL2(m, a1); \
436} STMT_END
b45f050a
JF
437
438
439/*
440 * Like Simple_vFAIL(), but accepts three arguments.
441 */
ccb2c380 442#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 443 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
444 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
445 (int)offset, RExC_precomp, RExC_precomp + offset); \
446} STMT_END
b45f050a
JF
447
448/*
449 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
450 */
ccb2c380
MP
451#define vFAIL3(m,a1,a2) STMT_START { \
452 if (!SIZE_ONLY) \
453 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
454 Simple_vFAIL3(m, a1, a2); \
455} STMT_END
b45f050a
JF
456
457/*
458 * Like Simple_vFAIL(), but accepts four arguments.
459 */
ccb2c380 460#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 461 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
462 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
463 (int)offset, RExC_precomp, RExC_precomp + offset); \
464} STMT_END
b45f050a 465
ccb2c380 466#define vWARN(loc,m) STMT_START { \
a28509cc 467 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
468 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
469 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
470} STMT_END
471
472#define vWARNdep(loc,m) STMT_START { \
a28509cc 473 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
474 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
475 "%s" REPORT_LOCATION, \
476 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
477} STMT_END
478
479
480#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 481 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
482 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
483 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
485
486#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 487 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
488 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
489 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
490} STMT_END
491
492#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
497
498#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 499 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
500 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
501 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
9d1d55b5 503
8615cb43 504
cd439c50 505/* Allow for side effects in s */
ccb2c380
MP
506#define REGC(c,s) STMT_START { \
507 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
508} STMT_END
cd439c50 509
fac92740
MJD
510/* Macros for recording node offsets. 20001227 mjd@plover.com
511 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
512 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
513 * Element 0 holds the number n.
07be1b83 514 * Position is 1 indexed.
fac92740 515 */
7122b237
YO
516#ifndef RE_TRACK_PATTERN_OFFSETS
517#define Set_Node_Offset_To_R(node,byte)
518#define Set_Node_Offset(node,byte)
519#define Set_Cur_Node_Offset
520#define Set_Node_Length_To_R(node,len)
521#define Set_Node_Length(node,len)
522#define Set_Node_Cur_Length(node)
523#define Node_Offset(n)
524#define Node_Length(n)
525#define Set_Node_Offset_Length(node,offset,len)
526#define ProgLen(ri) ri->u.proglen
527#define SetProgLen(ri,x) ri->u.proglen = x
528#else
529#define ProgLen(ri) ri->u.offsets[0]
530#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
531#define Set_Node_Offset_To_R(node,byte) STMT_START { \
532 if (! SIZE_ONLY) { \
533 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 534 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 535 if((node) < 0) { \
551405c4 536 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
537 } else { \
538 RExC_offsets[2*(node)-1] = (byte); \
539 } \
540 } \
541} STMT_END
542
543#define Set_Node_Offset(node,byte) \
544 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
545#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
546
547#define Set_Node_Length_To_R(node,len) STMT_START { \
548 if (! SIZE_ONLY) { \
549 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 550 __LINE__, (int)(node), (int)(len))); \
ccb2c380 551 if((node) < 0) { \
551405c4 552 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
553 } else { \
554 RExC_offsets[2*(node)] = (len); \
555 } \
556 } \
557} STMT_END
558
559#define Set_Node_Length(node,len) \
560 Set_Node_Length_To_R((node)-RExC_emit_start, len)
561#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
562#define Set_Node_Cur_Length(node) \
563 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
564
565/* Get offsets and lengths */
566#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
567#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
568
07be1b83
YO
569#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
570 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
571 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
572} STMT_END
7122b237 573#endif
07be1b83
YO
574
575#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
576#define EXPERIMENTAL_INPLACESCAN
7122b237 577#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 578
304ee84b
YO
579#define DEBUG_STUDYDATA(str,data,depth) \
580DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 581 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
582 "%*s" str "Pos:%"IVdf"/%"IVdf \
583 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
584 (int)(depth)*2, "", \
585 (IV)((data)->pos_min), \
586 (IV)((data)->pos_delta), \
304ee84b 587 (UV)((data)->flags), \
1de06328 588 (IV)((data)->whilem_c), \
304ee84b
YO
589 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
590 is_inf ? "INF " : "" \
1de06328
YO
591 ); \
592 if ((data)->last_found) \
593 PerlIO_printf(Perl_debug_log, \
594 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
595 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
596 SvPVX_const((data)->last_found), \
597 (IV)((data)->last_end), \
598 (IV)((data)->last_start_min), \
599 (IV)((data)->last_start_max), \
600 ((data)->longest && \
601 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
602 SvPVX_const((data)->longest_fixed), \
603 (IV)((data)->offset_fixed), \
604 ((data)->longest && \
605 (data)->longest==&((data)->longest_float)) ? "*" : "", \
606 SvPVX_const((data)->longest_float), \
607 (IV)((data)->offset_float_min), \
608 (IV)((data)->offset_float_max) \
609 ); \
610 PerlIO_printf(Perl_debug_log,"\n"); \
611});
612
acfe0abc 613static void clear_re(pTHX_ void *r);
4327152a 614
653099ff 615/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 616 Update the longest found anchored substring and the longest found
653099ff
GS
617 floating substrings if needed. */
618
4327152a 619STATIC void
304ee84b 620S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 621{
e1ec3a88
AL
622 const STRLEN l = CHR_SVLEN(data->last_found);
623 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 624 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 625
c277df42 626 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 627 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
628 if (*data->longest == data->longest_fixed) {
629 data->offset_fixed = l ? data->last_start_min : data->pos_min;
630 if (data->flags & SF_BEFORE_EOL)
b81d288d 631 data->flags
c277df42
IZ
632 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
633 else
634 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
635 data->minlen_fixed=minlenp;
636 data->lookbehind_fixed=0;
a0ed51b3 637 }
304ee84b 638 else { /* *data->longest == data->longest_float */
c277df42 639 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
640 data->offset_float_max = (l
641 ? data->last_start_max
c277df42 642 : data->pos_min + data->pos_delta);
304ee84b 643 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 644 data->offset_float_max = I32_MAX;
c277df42 645 if (data->flags & SF_BEFORE_EOL)
b81d288d 646 data->flags
c277df42
IZ
647 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
648 else
649 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
650 data->minlen_float=minlenp;
651 data->lookbehind_float=0;
c277df42
IZ
652 }
653 }
654 SvCUR_set(data->last_found, 0);
0eda9292 655 {
a28509cc 656 SV * const sv = data->last_found;
097eb12c
AL
657 if (SvUTF8(sv) && SvMAGICAL(sv)) {
658 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
659 if (mg)
660 mg->mg_len = 0;
661 }
0eda9292 662 }
c277df42
IZ
663 data->last_end = -1;
664 data->flags &= ~SF_BEFORE_EOL;
304ee84b 665 DEBUG_STUDYDATA("cl_anything: ",data,0);
c277df42
IZ
666}
667
653099ff
GS
668/* Can match anything (initialization) */
669STATIC void
097eb12c 670S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 671{
653099ff 672 ANYOF_CLASS_ZERO(cl);
f8bef550 673 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 674 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
675 if (LOC)
676 cl->flags |= ANYOF_LOCALE;
677}
678
679/* Can match anything (initialization) */
680STATIC int
5f66b61c 681S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
682{
683 int value;
684
aaa51d5e 685 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
686 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
687 return 1;
1aa99e6b
IH
688 if (!(cl->flags & ANYOF_UNICODE_ALL))
689 return 0;
10edeb5d 690 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 691 return 0;
653099ff
GS
692 return 1;
693}
694
695/* Can match anything (initialization) */
696STATIC void
097eb12c 697S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 698{
8ecf7187 699 Zero(cl, 1, struct regnode_charclass_class);
653099ff 700 cl->type = ANYOF;
830247a4 701 cl_anything(pRExC_state, cl);
653099ff
GS
702}
703
704STATIC void
097eb12c 705S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 706{
8ecf7187 707 Zero(cl, 1, struct regnode_charclass_class);
653099ff 708 cl->type = ANYOF;
830247a4 709 cl_anything(pRExC_state, cl);
653099ff
GS
710 if (LOC)
711 cl->flags |= ANYOF_LOCALE;
712}
713
714/* 'And' a given class with another one. Can create false positives */
715/* We assume that cl is not inverted */
716STATIC void
5f66b61c 717S_cl_and(struct regnode_charclass_class *cl,
a28509cc 718 const struct regnode_charclass_class *and_with)
653099ff 719{
40d049e4
YO
720
721 assert(and_with->type == ANYOF);
653099ff
GS
722 if (!(and_with->flags & ANYOF_CLASS)
723 && !(cl->flags & ANYOF_CLASS)
724 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
725 && !(and_with->flags & ANYOF_FOLD)
726 && !(cl->flags & ANYOF_FOLD)) {
727 int i;
728
729 if (and_with->flags & ANYOF_INVERT)
730 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
731 cl->bitmap[i] &= ~and_with->bitmap[i];
732 else
733 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
734 cl->bitmap[i] &= and_with->bitmap[i];
735 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
736 if (!(and_with->flags & ANYOF_EOS))
737 cl->flags &= ~ANYOF_EOS;
1aa99e6b 738
14ebb1a2
JH
739 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
740 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
741 cl->flags &= ~ANYOF_UNICODE_ALL;
742 cl->flags |= ANYOF_UNICODE;
743 ARG_SET(cl, ARG(and_with));
744 }
14ebb1a2
JH
745 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
746 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 747 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
748 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
749 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 750 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
751}
752
753/* 'OR' a given class with another one. Can create false positives */
754/* We assume that cl is not inverted */
755STATIC void
097eb12c 756S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 757{
653099ff
GS
758 if (or_with->flags & ANYOF_INVERT) {
759 /* We do not use
760 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
761 * <= (B1 | !B2) | (CL1 | !CL2)
762 * which is wasteful if CL2 is small, but we ignore CL2:
763 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
764 * XXXX Can we handle case-fold? Unclear:
765 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
766 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
767 */
768 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(or_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD) ) {
771 int i;
772
773 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
774 cl->bitmap[i] |= ~or_with->bitmap[i];
775 } /* XXXX: logic is complicated otherwise */
776 else {
830247a4 777 cl_anything(pRExC_state, cl);
653099ff
GS
778 }
779 } else {
780 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
781 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 782 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
783 || (cl->flags & ANYOF_FOLD)) ) {
784 int i;
785
786 /* OR char bitmap and class bitmap separately */
787 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
788 cl->bitmap[i] |= or_with->bitmap[i];
789 if (or_with->flags & ANYOF_CLASS) {
790 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
791 cl->classflags[i] |= or_with->classflags[i];
792 cl->flags |= ANYOF_CLASS;
793 }
794 }
795 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 796 cl_anything(pRExC_state, cl);
653099ff
GS
797 }
798 }
799 if (or_with->flags & ANYOF_EOS)
800 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
801
802 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
803 ARG(cl) != ARG(or_with)) {
804 cl->flags |= ANYOF_UNICODE_ALL;
805 cl->flags &= ~ANYOF_UNICODE;
806 }
807 if (or_with->flags & ANYOF_UNICODE_ALL) {
808 cl->flags |= ANYOF_UNICODE_ALL;
809 cl->flags &= ~ANYOF_UNICODE;
810 }
653099ff
GS
811}
812
a3621e74
YO
813#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
814#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
815#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
816#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
817
3dab1dad
YO
818
819#ifdef DEBUGGING
07be1b83 820/*
2b8b4781
NC
821 dump_trie(trie,widecharmap,revcharmap)
822 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
823 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
824
825 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
826 The _interim_ variants are used for debugging the interim
827 tables that are used to generate the final compressed
828 representation which is what dump_trie expects.
829
3dab1dad
YO
830 Part of the reason for their existance is to provide a form
831 of documentation as to how the different representations function.
07be1b83
YO
832
833*/
3dab1dad
YO
834
835/*
3dab1dad
YO
836 Dumps the final compressed table form of the trie to Perl_debug_log.
837 Used for debugging make_trie().
838*/
839
840STATIC void
2b8b4781
NC
841S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
842 AV *revcharmap, U32 depth)
3dab1dad
YO
843{
844 U32 state;
ab3bbdeb 845 SV *sv=sv_newmortal();
55eed653 846 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
847 GET_RE_DEBUG_FLAGS_DECL;
848
ab3bbdeb 849
3dab1dad
YO
850 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
851 (int)depth * 2 + 2,"",
852 "Match","Base","Ofs" );
853
854 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 855 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 856 if ( tmp ) {
ab3bbdeb
YO
857 PerlIO_printf( Perl_debug_log, "%*s",
858 colwidth,
ddc5bc0f 859 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
860 PL_colors[0], PL_colors[1],
861 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
862 PERL_PV_ESCAPE_FIRSTCHAR
863 )
864 );
3dab1dad
YO
865 }
866 }
867 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
868 (int)depth * 2 + 2,"");
869
870 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 871 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
872 PerlIO_printf( Perl_debug_log, "\n");
873
1e2e3d02 874 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 875 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
876
877 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
878
879 if ( trie->states[ state ].wordnum ) {
880 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
881 } else {
882 PerlIO_printf( Perl_debug_log, "%6s", "" );
883 }
884
885 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
886
887 if ( base ) {
888 U32 ofs = 0;
889
890 while( ( base + ofs < trie->uniquecharcount ) ||
891 ( base + ofs - trie->uniquecharcount < trie->lasttrans
892 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
893 ofs++;
894
895 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
896
897 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
898 if ( ( base + ofs >= trie->uniquecharcount ) &&
899 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
900 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
901 {
ab3bbdeb
YO
902 PerlIO_printf( Perl_debug_log, "%*"UVXf,
903 colwidth,
3dab1dad
YO
904 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
905 } else {
ab3bbdeb 906 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
907 }
908 }
909
910 PerlIO_printf( Perl_debug_log, "]");
911
912 }
913 PerlIO_printf( Perl_debug_log, "\n" );
914 }
915}
916/*
3dab1dad
YO
917 Dumps a fully constructed but uncompressed trie in list form.
918 List tries normally only are used for construction when the number of
919 possible chars (trie->uniquecharcount) is very high.
920 Used for debugging make_trie().
921*/
922STATIC void
55eed653 923S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
924 HV *widecharmap, AV *revcharmap, U32 next_alloc,
925 U32 depth)
3dab1dad
YO
926{
927 U32 state;
ab3bbdeb 928 SV *sv=sv_newmortal();
55eed653 929 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
930 GET_RE_DEBUG_FLAGS_DECL;
931 /* print out the table precompression. */
ab3bbdeb
YO
932 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
933 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
934 "------:-----+-----------------\n" );
3dab1dad
YO
935
936 for( state=1 ; state < next_alloc ; state ++ ) {
937 U16 charid;
938
ab3bbdeb 939 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
940 (int)depth * 2 + 2,"", (UV)state );
941 if ( ! trie->states[ state ].wordnum ) {
942 PerlIO_printf( Perl_debug_log, "%5s| ","");
943 } else {
944 PerlIO_printf( Perl_debug_log, "W%4x| ",
945 trie->states[ state ].wordnum
946 );
947 }
948 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 949 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
950 if ( tmp ) {
951 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
952 colwidth,
ddc5bc0f 953 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
954 PL_colors[0], PL_colors[1],
955 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
956 PERL_PV_ESCAPE_FIRSTCHAR
957 ) ,
1e2e3d02
YO
958 TRIE_LIST_ITEM(state,charid).forid,
959 (UV)TRIE_LIST_ITEM(state,charid).newstate
960 );
961 if (!(charid % 10))
664e119d
RGS
962 PerlIO_printf(Perl_debug_log, "\n%*s| ",
963 (int)((depth * 2) + 14), "");
1e2e3d02 964 }
ab3bbdeb
YO
965 }
966 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
967 }
968}
969
970/*
3dab1dad
YO
971 Dumps a fully constructed but uncompressed trie in table form.
972 This is the normal DFA style state transition table, with a few
973 twists to facilitate compression later.
974 Used for debugging make_trie().
975*/
976STATIC void
55eed653 977S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
978 HV *widecharmap, AV *revcharmap, U32 next_alloc,
979 U32 depth)
3dab1dad
YO
980{
981 U32 state;
982 U16 charid;
ab3bbdeb 983 SV *sv=sv_newmortal();
55eed653 984 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
985 GET_RE_DEBUG_FLAGS_DECL;
986
987 /*
988 print out the table precompression so that we can do a visual check
989 that they are identical.
990 */
991
992 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
993
994 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 995 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 996 if ( tmp ) {
ab3bbdeb
YO
997 PerlIO_printf( Perl_debug_log, "%*s",
998 colwidth,
ddc5bc0f 999 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1000 PL_colors[0], PL_colors[1],
1001 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1002 PERL_PV_ESCAPE_FIRSTCHAR
1003 )
1004 );
3dab1dad
YO
1005 }
1006 }
1007
1008 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1009
1010 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1011 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1012 }
1013
1014 PerlIO_printf( Perl_debug_log, "\n" );
1015
1016 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1017
1018 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1019 (int)depth * 2 + 2,"",
1020 (UV)TRIE_NODENUM( state ) );
1021
1022 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1023 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1024 if (v)
1025 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1026 else
1027 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1028 }
1029 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1030 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1031 } else {
1032 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1033 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1034 }
1035 }
07be1b83 1036}
3dab1dad
YO
1037
1038#endif
1039
786e8c11
YO
1040/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1041 startbranch: the first branch in the whole branch sequence
1042 first : start branch of sequence of branch-exact nodes.
1043 May be the same as startbranch
1044 last : Thing following the last branch.
1045 May be the same as tail.
1046 tail : item following the branch sequence
1047 count : words in the sequence
1048 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1049 depth : indent depth
3dab1dad 1050
786e8c11 1051Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1052
786e8c11
YO
1053A trie is an N'ary tree where the branches are determined by digital
1054decomposition of the key. IE, at the root node you look up the 1st character and
1055follow that branch repeat until you find the end of the branches. Nodes can be
1056marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1057
786e8c11 1058 /he|she|his|hers/
72f13be8 1059
786e8c11
YO
1060would convert into the following structure. Numbers represent states, letters
1061following numbers represent valid transitions on the letter from that state, if
1062the number is in square brackets it represents an accepting state, otherwise it
1063will be in parenthesis.
07be1b83 1064
786e8c11
YO
1065 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1066 | |
1067 | (2)
1068 | |
1069 (1) +-i->(6)-+-s->[7]
1070 |
1071 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1072
786e8c11
YO
1073 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1074
1075This shows that when matching against the string 'hers' we will begin at state 1
1076read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1077then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1078is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1079single traverse. We store a mapping from accepting to state to which word was
1080matched, and then when we have multiple possibilities we try to complete the
1081rest of the regex in the order in which they occured in the alternation.
1082
1083The only prior NFA like behaviour that would be changed by the TRIE support is
1084the silent ignoring of duplicate alternations which are of the form:
1085
1086 / (DUPE|DUPE) X? (?{ ... }) Y /x
1087
1088Thus EVAL blocks follwing a trie may be called a different number of times with
1089and without the optimisation. With the optimisations dupes will be silently
1090ignored. This inconsistant behaviour of EVAL type nodes is well established as
1091the following demonstrates:
1092
1093 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1094
1095which prints out 'word' three times, but
1096
1097 'words'=~/(word|word|word)(?{ print $1 })S/
1098
1099which doesnt print it out at all. This is due to other optimisations kicking in.
1100
1101Example of what happens on a structural level:
1102
1103The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1104
1105 1: CURLYM[1] {1,32767}(18)
1106 5: BRANCH(8)
1107 6: EXACT <ac>(16)
1108 8: BRANCH(11)
1109 9: EXACT <ad>(16)
1110 11: BRANCH(14)
1111 12: EXACT <ab>(16)
1112 16: SUCCEED(0)
1113 17: NOTHING(18)
1114 18: END(0)
1115
1116This would be optimizable with startbranch=5, first=5, last=16, tail=16
1117and should turn into:
1118
1119 1: CURLYM[1] {1,32767}(18)
1120 5: TRIE(16)
1121 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1122 <ac>
1123 <ad>
1124 <ab>
1125 16: SUCCEED(0)
1126 17: NOTHING(18)
1127 18: END(0)
1128
1129Cases where tail != last would be like /(?foo|bar)baz/:
1130
1131 1: BRANCH(4)
1132 2: EXACT <foo>(8)
1133 4: BRANCH(7)
1134 5: EXACT <bar>(8)
1135 7: TAIL(8)
1136 8: EXACT <baz>(10)
1137 10: END(0)
1138
1139which would be optimizable with startbranch=1, first=1, last=7, tail=8
1140and would end up looking like:
1141
1142 1: TRIE(8)
1143 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1144 <foo>
1145 <bar>
1146 7: TAIL(8)
1147 8: EXACT <baz>(10)
1148 10: END(0)
1149
1150 d = uvuni_to_utf8_flags(d, uv, 0);
1151
1152is the recommended Unicode-aware way of saying
1153
1154 *(d++) = uv;
1155*/
1156
1e2e3d02 1157#define TRIE_STORE_REVCHAR \
786e8c11 1158 STMT_START { \
1e2e3d02 1159 SV *tmp = newSVpvs(""); \
786e8c11 1160 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1161 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1162 av_push( revcharmap, tmp ); \
786e8c11
YO
1163 } STMT_END
1164
1165#define TRIE_READ_CHAR STMT_START { \
1166 wordlen++; \
1167 if ( UTF ) { \
1168 if ( folder ) { \
1169 if ( foldlen > 0 ) { \
1170 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1171 foldlen -= len; \
1172 scan += len; \
1173 len = 0; \
1174 } else { \
1175 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1176 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1177 foldlen -= UNISKIP( uvc ); \
1178 scan = foldbuf + UNISKIP( uvc ); \
1179 } \
1180 } else { \
1181 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1182 } \
1183 } else { \
1184 uvc = (U32)*uc; \
1185 len = 1; \
1186 } \
1187} STMT_END
1188
1189
1190
1191#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1192 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1193 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1194 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1195 } \
1196 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1197 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1198 TRIE_LIST_CUR( state )++; \
1199} STMT_END
07be1b83 1200
786e8c11
YO
1201#define TRIE_LIST_NEW(state) STMT_START { \
1202 Newxz( trie->states[ state ].trans.list, \
1203 4, reg_trie_trans_le ); \
1204 TRIE_LIST_CUR( state ) = 1; \
1205 TRIE_LIST_LEN( state ) = 4; \
1206} STMT_END
07be1b83 1207
786e8c11
YO
1208#define TRIE_HANDLE_WORD(state) STMT_START { \
1209 U16 dupe= trie->states[ state ].wordnum; \
1210 regnode * const noper_next = regnext( noper ); \
1211 \
1212 if (trie->wordlen) \
1213 trie->wordlen[ curword ] = wordlen; \
1214 DEBUG_r({ \
1215 /* store the word for dumping */ \
1216 SV* tmp; \
1217 if (OP(noper) != NOTHING) \
1218 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1219 else \
1220 tmp = newSVpvn( "", 0 ); \
1221 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1222 av_push( trie_words, tmp ); \
786e8c11
YO
1223 }); \
1224 \
1225 curword++; \
1226 \
1227 if ( noper_next < tail ) { \
1228 if (!trie->jump) \
c944940b 1229 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1230 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1231 if (!jumper) \
1232 jumper = noper_next; \
1233 if (!nextbranch) \
1234 nextbranch= regnext(cur); \
1235 } \
1236 \
1237 if ( dupe ) { \
1238 /* So it's a dupe. This means we need to maintain a */\
1239 /* linked-list from the first to the next. */\
1240 /* we only allocate the nextword buffer when there */\
1241 /* a dupe, so first time we have to do the allocation */\
1242 if (!trie->nextword) \
c944940b 1243 trie->nextword = (U16 *) \
446bd890 1244 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1245 while ( trie->nextword[dupe] ) \
1246 dupe= trie->nextword[dupe]; \
1247 trie->nextword[dupe]= curword; \
1248 } else { \
1249 /* we haven't inserted this word yet. */ \
1250 trie->states[ state ].wordnum = curword; \
1251 } \
1252} STMT_END
07be1b83 1253
3dab1dad 1254
786e8c11
YO
1255#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1256 ( ( base + charid >= ucharcount \
1257 && base + charid < ubound \
1258 && state == trie->trans[ base - ucharcount + charid ].check \
1259 && trie->trans[ base - ucharcount + charid ].next ) \
1260 ? trie->trans[ base - ucharcount + charid ].next \
1261 : ( state==1 ? special : 0 ) \
1262 )
3dab1dad 1263
786e8c11
YO
1264#define MADE_TRIE 1
1265#define MADE_JUMP_TRIE 2
1266#define MADE_EXACT_TRIE 4
3dab1dad 1267
a3621e74 1268STATIC I32
786e8c11 1269S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1270{
27da23d5 1271 dVAR;
a3621e74
YO
1272 /* first pass, loop through and scan words */
1273 reg_trie_data *trie;
55eed653 1274 HV *widecharmap = NULL;
2b8b4781 1275 AV *revcharmap = newAV();
a3621e74 1276 regnode *cur;
9f7f3913 1277 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1278 STRLEN len = 0;
1279 UV uvc = 0;
1280 U16 curword = 0;
1281 U32 next_alloc = 0;
786e8c11
YO
1282 regnode *jumper = NULL;
1283 regnode *nextbranch = NULL;
7f69552c 1284 regnode *convert = NULL;
a3621e74 1285 /* we just use folder as a flag in utf8 */
e1ec3a88 1286 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1287 ? PL_fold
1288 : ( flags == EXACTFL
1289 ? PL_fold_locale
1290 : NULL
1291 )
1292 );
1293
2b8b4781
NC
1294#ifdef DEBUGGING
1295 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1296 AV *trie_words = NULL;
1297 /* along with revcharmap, this only used during construction but both are
1298 * useful during debugging so we store them in the struct when debugging.
8e11feef 1299 */
2b8b4781
NC
1300#else
1301 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1302 STRLEN trie_charcount=0;
3dab1dad 1303#endif
2b8b4781 1304 SV *re_trie_maxbuff;
a3621e74 1305 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1306#ifndef DEBUGGING
1307 PERL_UNUSED_ARG(depth);
1308#endif
a3621e74 1309
c944940b 1310 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1311 trie->refcount = 1;
3dab1dad 1312 trie->startstate = 1;
786e8c11 1313 trie->wordcount = word_count;
f8fc2ecf 1314 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1315 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1316 if (!(UTF && folder))
c944940b 1317 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1318 DEBUG_r({
2b8b4781 1319 trie_words = newAV();
a3621e74 1320 });
a3621e74 1321
0111c4fd 1322 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1323 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1324 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1325 }
3dab1dad
YO
1326 DEBUG_OPTIMISE_r({
1327 PerlIO_printf( Perl_debug_log,
786e8c11 1328 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1329 (int)depth * 2 + 2, "",
1330 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1331 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1332 (int)depth);
3dab1dad 1333 });
7f69552c
YO
1334
1335 /* Find the node we are going to overwrite */
1336 if ( first == startbranch && OP( last ) != BRANCH ) {
1337 /* whole branch chain */
1338 convert = first;
1339 } else {
1340 /* branch sub-chain */
1341 convert = NEXTOPER( first );
1342 }
1343
a3621e74
YO
1344 /* -- First loop and Setup --
1345
1346 We first traverse the branches and scan each word to determine if it
1347 contains widechars, and how many unique chars there are, this is
1348 important as we have to build a table with at least as many columns as we
1349 have unique chars.
1350
1351 We use an array of integers to represent the character codes 0..255
1352 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1353 native representation of the character value as the key and IV's for the
1354 coded index.
1355
1356 *TODO* If we keep track of how many times each character is used we can
1357 remap the columns so that the table compression later on is more
1358 efficient in terms of memory by ensuring most common value is in the
1359 middle and the least common are on the outside. IMO this would be better
1360 than a most to least common mapping as theres a decent chance the most
1361 common letter will share a node with the least common, meaning the node
1362 will not be compressable. With a middle is most common approach the worst
1363 case is when we have the least common nodes twice.
1364
1365 */
1366
a3621e74 1367 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1368 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1369 const U8 *uc = (U8*)STRING( noper );
a28509cc 1370 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1371 STRLEN foldlen = 0;
1372 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1373 const U8 *scan = (U8*)NULL;
07be1b83 1374 U32 wordlen = 0; /* required init */
3dab1dad 1375 STRLEN chars=0;
a3621e74 1376
3dab1dad
YO
1377 if (OP(noper) == NOTHING) {
1378 trie->minlen= 0;
1379 continue;
1380 }
1381 if (trie->bitmap) {
1382 TRIE_BITMAP_SET(trie,*uc);
1383 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1384 }
a3621e74 1385 for ( ; uc < e ; uc += len ) {
3dab1dad 1386 TRIE_CHARCOUNT(trie)++;
a3621e74 1387 TRIE_READ_CHAR;
3dab1dad 1388 chars++;
a3621e74
YO
1389 if ( uvc < 256 ) {
1390 if ( !trie->charmap[ uvc ] ) {
1391 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1392 if ( folder )
1393 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1394 TRIE_STORE_REVCHAR;
a3621e74
YO
1395 }
1396 } else {
1397 SV** svpp;
55eed653
NC
1398 if ( !widecharmap )
1399 widecharmap = newHV();
a3621e74 1400
55eed653 1401 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1402
1403 if ( !svpp )
e4584336 1404 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1405
1406 if ( !SvTRUE( *svpp ) ) {
1407 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1408 TRIE_STORE_REVCHAR;
a3621e74
YO
1409 }
1410 }
1411 }
3dab1dad
YO
1412 if( cur == first ) {
1413 trie->minlen=chars;
1414 trie->maxlen=chars;
1415 } else if (chars < trie->minlen) {
1416 trie->minlen=chars;
1417 } else if (chars > trie->maxlen) {
1418 trie->maxlen=chars;
1419 }
1420
a3621e74
YO
1421 } /* end first pass */
1422 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1423 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1424 (int)depth * 2 + 2,"",
55eed653 1425 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1426 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1427 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1428 );
c944940b 1429 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1430
1431 /*
1432 We now know what we are dealing with in terms of unique chars and
1433 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1434 representation using a flat table will take. If it's over a reasonable
1435 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1436 conservative but potentially much slower representation using an array
1437 of lists.
1438
1439 At the end we convert both representations into the same compressed
1440 form that will be used in regexec.c for matching with. The latter
1441 is a form that cannot be used to construct with but has memory
1442 properties similar to the list form and access properties similar
1443 to the table form making it both suitable for fast searches and
1444 small enough that its feasable to store for the duration of a program.
1445
1446 See the comment in the code where the compressed table is produced
1447 inplace from the flat tabe representation for an explanation of how
1448 the compression works.
1449
1450 */
1451
1452
3dab1dad 1453 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1454 /*
1455 Second Pass -- Array Of Lists Representation
1456
1457 Each state will be represented by a list of charid:state records
1458 (reg_trie_trans_le) the first such element holds the CUR and LEN
1459 points of the allocated array. (See defines above).
1460
1461 We build the initial structure using the lists, and then convert
1462 it into the compressed table form which allows faster lookups
1463 (but cant be modified once converted).
a3621e74
YO
1464 */
1465
a3621e74
YO
1466 STRLEN transcount = 1;
1467
1e2e3d02
YO
1468 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1469 "%*sCompiling trie using list compiler\n",
1470 (int)depth * 2 + 2, ""));
446bd890 1471
c944940b
JH
1472 trie->states = (reg_trie_state *)
1473 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1474 sizeof(reg_trie_state) );
a3621e74
YO
1475 TRIE_LIST_NEW(1);
1476 next_alloc = 2;
1477
1478 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1479
c445ea15
AL
1480 regnode * const noper = NEXTOPER( cur );
1481 U8 *uc = (U8*)STRING( noper );
1482 const U8 * const e = uc + STR_LEN( noper );
1483 U32 state = 1; /* required init */
1484 U16 charid = 0; /* sanity init */
1485 U8 *scan = (U8*)NULL; /* sanity init */
1486 STRLEN foldlen = 0; /* required init */
07be1b83 1487 U32 wordlen = 0; /* required init */
c445ea15
AL
1488 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1489
3dab1dad 1490 if (OP(noper) != NOTHING) {
786e8c11 1491 for ( ; uc < e ; uc += len ) {
c445ea15 1492
786e8c11 1493 TRIE_READ_CHAR;
c445ea15 1494
786e8c11
YO
1495 if ( uvc < 256 ) {
1496 charid = trie->charmap[ uvc ];
c445ea15 1497 } else {
55eed653 1498 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1499 if ( !svpp ) {
1500 charid = 0;
1501 } else {
1502 charid=(U16)SvIV( *svpp );
1503 }
c445ea15 1504 }
786e8c11
YO
1505 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1506 if ( charid ) {
a3621e74 1507
786e8c11
YO
1508 U16 check;
1509 U32 newstate = 0;
a3621e74 1510
786e8c11
YO
1511 charid--;
1512 if ( !trie->states[ state ].trans.list ) {
1513 TRIE_LIST_NEW( state );
c445ea15 1514 }
786e8c11
YO
1515 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1516 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1517 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1518 break;
1519 }
1520 }
1521 if ( ! newstate ) {
1522 newstate = next_alloc++;
1523 TRIE_LIST_PUSH( state, charid, newstate );
1524 transcount++;
1525 }
1526 state = newstate;
1527 } else {
1528 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1529 }
a28509cc 1530 }
c445ea15 1531 }
3dab1dad 1532 TRIE_HANDLE_WORD(state);
a3621e74
YO
1533
1534 } /* end second pass */
1535
1e2e3d02
YO
1536 /* next alloc is the NEXT state to be allocated */
1537 trie->statecount = next_alloc;
c944940b
JH
1538 trie->states = (reg_trie_state *)
1539 PerlMemShared_realloc( trie->states,
1540 next_alloc
1541 * sizeof(reg_trie_state) );
a3621e74 1542
3dab1dad 1543 /* and now dump it out before we compress it */
2b8b4781
NC
1544 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1545 revcharmap, next_alloc,
1546 depth+1)
1e2e3d02 1547 );
a3621e74 1548
c944940b
JH
1549 trie->trans = (reg_trie_trans *)
1550 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1551 {
1552 U32 state;
a3621e74
YO
1553 U32 tp = 0;
1554 U32 zp = 0;
1555
1556
1557 for( state=1 ; state < next_alloc ; state ++ ) {
1558 U32 base=0;
1559
1560 /*
1561 DEBUG_TRIE_COMPILE_MORE_r(
1562 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1563 );
1564 */
1565
1566 if (trie->states[state].trans.list) {
1567 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1568 U16 maxid=minid;
a28509cc 1569 U16 idx;
a3621e74
YO
1570
1571 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1572 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1573 if ( forid < minid ) {
1574 minid=forid;
1575 } else if ( forid > maxid ) {
1576 maxid=forid;
1577 }
a3621e74
YO
1578 }
1579 if ( transcount < tp + maxid - minid + 1) {
1580 transcount *= 2;
c944940b
JH
1581 trie->trans = (reg_trie_trans *)
1582 PerlMemShared_realloc( trie->trans,
446bd890
NC
1583 transcount
1584 * sizeof(reg_trie_trans) );
a3621e74
YO
1585 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1586 }
1587 base = trie->uniquecharcount + tp - minid;
1588 if ( maxid == minid ) {
1589 U32 set = 0;
1590 for ( ; zp < tp ; zp++ ) {
1591 if ( ! trie->trans[ zp ].next ) {
1592 base = trie->uniquecharcount + zp - minid;
1593 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1594 trie->trans[ zp ].check = state;
1595 set = 1;
1596 break;
1597 }
1598 }
1599 if ( !set ) {
1600 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1601 trie->trans[ tp ].check = state;
1602 tp++;
1603 zp = tp;
1604 }
1605 } else {
1606 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1607 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1608 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1609 trie->trans[ tid ].check = state;
1610 }
1611 tp += ( maxid - minid + 1 );
1612 }
1613 Safefree(trie->states[ state ].trans.list);
1614 }
1615 /*
1616 DEBUG_TRIE_COMPILE_MORE_r(
1617 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1618 );
1619 */
1620 trie->states[ state ].trans.base=base;
1621 }
cc601c31 1622 trie->lasttrans = tp + 1;
a3621e74
YO
1623 }
1624 } else {
1625 /*
1626 Second Pass -- Flat Table Representation.
1627
1628 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1629 We know that we will need Charcount+1 trans at most to store the data
1630 (one row per char at worst case) So we preallocate both structures
1631 assuming worst case.
1632
1633 We then construct the trie using only the .next slots of the entry
1634 structs.
1635
1636 We use the .check field of the first entry of the node temporarily to
1637 make compression both faster and easier by keeping track of how many non
1638 zero fields are in the node.
1639
1640 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1641 transition.
1642
1643 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1644 number representing the first entry of the node, and state as a
1645 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1646 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1647 are 2 entrys per node. eg:
1648
1649 A B A B
1650 1. 2 4 1. 3 7
1651 2. 0 3 3. 0 5
1652 3. 0 0 5. 0 0
1653 4. 0 0 7. 0 0
1654
1655 The table is internally in the right hand, idx form. However as we also
1656 have to deal with the states array which is indexed by nodenum we have to
1657 use TRIE_NODENUM() to convert.
1658
1659 */
1e2e3d02
YO
1660 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1661 "%*sCompiling trie using table compiler\n",
1662 (int)depth * 2 + 2, ""));
3dab1dad 1663
c944940b
JH
1664 trie->trans = (reg_trie_trans *)
1665 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1666 * trie->uniquecharcount + 1,
1667 sizeof(reg_trie_trans) );
1668 trie->states = (reg_trie_state *)
1669 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1670 sizeof(reg_trie_state) );
a3621e74
YO
1671 next_alloc = trie->uniquecharcount + 1;
1672
3dab1dad 1673
a3621e74
YO
1674 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1675
c445ea15 1676 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1677 const U8 *uc = (U8*)STRING( noper );
1678 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1679
1680 U32 state = 1; /* required init */
1681
1682 U16 charid = 0; /* sanity init */
1683 U32 accept_state = 0; /* sanity init */
1684 U8 *scan = (U8*)NULL; /* sanity init */
1685
1686 STRLEN foldlen = 0; /* required init */
07be1b83 1687 U32 wordlen = 0; /* required init */
a3621e74
YO
1688 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1689
3dab1dad 1690 if ( OP(noper) != NOTHING ) {
786e8c11 1691 for ( ; uc < e ; uc += len ) {
a3621e74 1692
786e8c11 1693 TRIE_READ_CHAR;
a3621e74 1694
786e8c11
YO
1695 if ( uvc < 256 ) {
1696 charid = trie->charmap[ uvc ];
1697 } else {
55eed653 1698 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1699 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1700 }
786e8c11
YO
1701 if ( charid ) {
1702 charid--;
1703 if ( !trie->trans[ state + charid ].next ) {
1704 trie->trans[ state + charid ].next = next_alloc;
1705 trie->trans[ state ].check++;
1706 next_alloc += trie->uniquecharcount;
1707 }
1708 state = trie->trans[ state + charid ].next;
1709 } else {
1710 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1711 }
1712 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1713 }
a3621e74 1714 }
3dab1dad
YO
1715 accept_state = TRIE_NODENUM( state );
1716 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1717
1718 } /* end second pass */
1719
3dab1dad 1720 /* and now dump it out before we compress it */
2b8b4781
NC
1721 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1722 revcharmap,
1723 next_alloc, depth+1));
a3621e74 1724
a3621e74
YO
1725 {
1726 /*
1727 * Inplace compress the table.*
1728
1729 For sparse data sets the table constructed by the trie algorithm will
1730 be mostly 0/FAIL transitions or to put it another way mostly empty.
1731 (Note that leaf nodes will not contain any transitions.)
1732
1733 This algorithm compresses the tables by eliminating most such
1734 transitions, at the cost of a modest bit of extra work during lookup:
1735
1736 - Each states[] entry contains a .base field which indicates the
1737 index in the state[] array wheres its transition data is stored.
1738
1739 - If .base is 0 there are no valid transitions from that node.
1740
1741 - If .base is nonzero then charid is added to it to find an entry in
1742 the trans array.
1743
1744 -If trans[states[state].base+charid].check!=state then the
1745 transition is taken to be a 0/Fail transition. Thus if there are fail
1746 transitions at the front of the node then the .base offset will point
1747 somewhere inside the previous nodes data (or maybe even into a node
1748 even earlier), but the .check field determines if the transition is
1749 valid.
1750
786e8c11 1751 XXX - wrong maybe?
a3621e74
YO
1752 The following process inplace converts the table to the compressed
1753 table: We first do not compress the root node 1,and mark its all its
1754 .check pointers as 1 and set its .base pointer as 1 as well. This
1755 allows to do a DFA construction from the compressed table later, and
1756 ensures that any .base pointers we calculate later are greater than
1757 0.
1758
1759 - We set 'pos' to indicate the first entry of the second node.
1760
1761 - We then iterate over the columns of the node, finding the first and
1762 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1763 and set the .check pointers accordingly, and advance pos
1764 appropriately and repreat for the next node. Note that when we copy
1765 the next pointers we have to convert them from the original
1766 NODEIDX form to NODENUM form as the former is not valid post
1767 compression.
1768
1769 - If a node has no transitions used we mark its base as 0 and do not
1770 advance the pos pointer.
1771
1772 - If a node only has one transition we use a second pointer into the
1773 structure to fill in allocated fail transitions from other states.
1774 This pointer is independent of the main pointer and scans forward
1775 looking for null transitions that are allocated to a state. When it
1776 finds one it writes the single transition into the "hole". If the
786e8c11 1777 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1778
1779 - Once compressed we can Renew/realloc the structures to release the
1780 excess space.
1781
1782 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1783 specifically Fig 3.47 and the associated pseudocode.
1784
1785 demq
1786 */
a3b680e6 1787 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1788 U32 state, charid;
a3621e74 1789 U32 pos = 0, zp=0;
1e2e3d02 1790 trie->statecount = laststate;
a3621e74
YO
1791
1792 for ( state = 1 ; state < laststate ; state++ ) {
1793 U8 flag = 0;
a28509cc
AL
1794 const U32 stateidx = TRIE_NODEIDX( state );
1795 const U32 o_used = trie->trans[ stateidx ].check;
1796 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1797 trie->trans[ stateidx ].check = 0;
1798
1799 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1800 if ( flag || trie->trans[ stateidx + charid ].next ) {
1801 if ( trie->trans[ stateidx + charid ].next ) {
1802 if (o_used == 1) {
1803 for ( ; zp < pos ; zp++ ) {
1804 if ( ! trie->trans[ zp ].next ) {
1805 break;
1806 }
1807 }
1808 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1809 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1810 trie->trans[ zp ].check = state;
1811 if ( ++zp > pos ) pos = zp;
1812 break;
1813 }
1814 used--;
1815 }
1816 if ( !flag ) {
1817 flag = 1;
1818 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1819 }
1820 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1821 trie->trans[ pos ].check = state;
1822 pos++;
1823 }
1824 }
1825 }
cc601c31 1826 trie->lasttrans = pos + 1;
c944940b
JH
1827 trie->states = (reg_trie_state *)
1828 PerlMemShared_realloc( trie->states, laststate
1829 * sizeof(reg_trie_state) );
a3621e74 1830 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1831 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1832 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1833 (int)depth * 2 + 2,"",
1834 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1835 (IV)next_alloc,
1836 (IV)pos,
a3621e74
YO
1837 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1838 );
1839
1840 } /* end table compress */
1841 }
1e2e3d02
YO
1842 DEBUG_TRIE_COMPILE_MORE_r(
1843 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1844 (int)depth * 2 + 2, "",
1845 (UV)trie->statecount,
1846 (UV)trie->lasttrans)
1847 );
cc601c31 1848 /* resize the trans array to remove unused space */
c944940b
JH
1849 trie->trans = (reg_trie_trans *)
1850 PerlMemShared_realloc( trie->trans, trie->lasttrans
1851 * sizeof(reg_trie_trans) );
a3621e74 1852
3dab1dad 1853 /* and now dump out the compressed format */
2b8b4781 1854 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1855
3dab1dad 1856 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1857 U8 nodetype =(U8)(flags & 0xFF);
1858 char *str=NULL;
786e8c11 1859
07be1b83 1860#ifdef DEBUGGING
e62cc96a 1861 regnode *optimize = NULL;
7122b237
YO
1862#ifdef RE_TRACK_PATTERN_OFFSETS
1863
b57a0404
JH
1864 U32 mjd_offset = 0;
1865 U32 mjd_nodelen = 0;
7122b237
YO
1866#endif /* RE_TRACK_PATTERN_OFFSETS */
1867#endif /* DEBUGGING */
a3621e74 1868 /*
3dab1dad
YO
1869 This means we convert either the first branch or the first Exact,
1870 depending on whether the thing following (in 'last') is a branch
1871 or not and whther first is the startbranch (ie is it a sub part of
1872 the alternation or is it the whole thing.)
1873 Assuming its a sub part we conver the EXACT otherwise we convert
1874 the whole branch sequence, including the first.
a3621e74 1875 */
3dab1dad 1876 /* Find the node we are going to overwrite */
7f69552c 1877 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1878 /* branch sub-chain */
3dab1dad 1879 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1880#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1881 DEBUG_r({
1882 mjd_offset= Node_Offset((convert));
1883 mjd_nodelen= Node_Length((convert));
1884 });
7122b237 1885#endif
7f69552c 1886 /* whole branch chain */
7122b237
YO
1887 }
1888#ifdef RE_TRACK_PATTERN_OFFSETS
1889 else {
7f69552c
YO
1890 DEBUG_r({
1891 const regnode *nop = NEXTOPER( convert );
1892 mjd_offset= Node_Offset((nop));
1893 mjd_nodelen= Node_Length((nop));
1894 });
07be1b83
YO
1895 }
1896 DEBUG_OPTIMISE_r(
1897 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1898 (int)depth * 2 + 2, "",
786e8c11 1899 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1900 );
7122b237 1901#endif
3dab1dad
YO
1902 /* But first we check to see if there is a common prefix we can
1903 split out as an EXACT and put in front of the TRIE node. */
1904 trie->startstate= 1;
55eed653 1905 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1906 U32 state;
1e2e3d02 1907 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1908 U32 ofs = 0;
8e11feef
RGS
1909 I32 idx = -1;
1910 U32 count = 0;
1911 const U32 base = trie->states[ state ].trans.base;
a3621e74 1912
3dab1dad 1913 if ( trie->states[state].wordnum )
8e11feef 1914 count = 1;
a3621e74 1915
8e11feef 1916 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1917 if ( ( base + ofs >= trie->uniquecharcount ) &&
1918 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1919 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1920 {
3dab1dad 1921 if ( ++count > 1 ) {
2b8b4781 1922 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1923 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1924 if ( state == 1 ) break;
3dab1dad
YO
1925 if ( count == 2 ) {
1926 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1927 DEBUG_OPTIMISE_r(
8e11feef
RGS
1928 PerlIO_printf(Perl_debug_log,
1929 "%*sNew Start State=%"UVuf" Class: [",
1930 (int)depth * 2 + 2, "",
786e8c11 1931 (UV)state));
be8e71aa 1932 if (idx >= 0) {
2b8b4781 1933 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1934 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1935
3dab1dad 1936 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1937 if ( folder )
1938 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1939 DEBUG_OPTIMISE_r(
07be1b83 1940 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1941 );
8e11feef
RGS
1942 }
1943 }
1944 TRIE_BITMAP_SET(trie,*ch);
1945 if ( folder )
1946 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1947 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1948 }
1949 idx = ofs;
1950 }
3dab1dad
YO
1951 }
1952 if ( count == 1 ) {
2b8b4781 1953 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1954 char *ch = SvPV_nolen( *tmp );
1955 DEBUG_OPTIMISE_r({
1956 SV *sv=sv_newmortal();
8e11feef
RGS
1957 PerlIO_printf( Perl_debug_log,
1958 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1959 (int)depth * 2 + 2, "",
de734bd5
A
1960 (UV)state, (UV)idx,
1961 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1962 PL_colors[0], PL_colors[1],
1963 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1964 PERL_PV_ESCAPE_FIRSTCHAR
1965 )
1966 );
1967 });
3dab1dad
YO
1968 if ( state==1 ) {
1969 OP( convert ) = nodetype;
1970 str=STRING(convert);
1971 STR_LEN(convert)=0;
1972 }
de734bd5
A
1973 while (*ch) {
1974 *str++ = *ch++;
1975 STR_LEN(convert)++;
1976 }
1977
8e11feef 1978 } else {
f9049ba1 1979#ifdef DEBUGGING
8e11feef
RGS
1980 if (state>1)
1981 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1982#endif
8e11feef
RGS
1983 break;
1984 }
1985 }
3dab1dad 1986 if (str) {
8e11feef 1987 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1988 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1989 trie->startstate = state;
07be1b83
YO
1990 trie->minlen -= (state - 1);
1991 trie->maxlen -= (state - 1);
1992 DEBUG_r({
1993 regnode *fix = convert;
de734bd5 1994 U32 word = trie->wordcount;
07be1b83
YO
1995 mjd_nodelen++;
1996 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1997 while( ++fix < n ) {
1998 Set_Node_Offset_Length(fix, 0, 0);
1999 }
de734bd5 2000 while (word--) {
2b8b4781 2001 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
2002 if (tmp) {
2003 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2004 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2005 else
2006 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2007 }
2008 }
07be1b83 2009 });
8e11feef
RGS
2010 if (trie->maxlen) {
2011 convert = n;
2012 } else {
3dab1dad 2013 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2014 DEBUG_r(optimize= n);
3dab1dad
YO
2015 }
2016 }
2017 }
a5ca303d
YO
2018 if (!jumper)
2019 jumper = last;
3dab1dad 2020 if ( trie->maxlen ) {
8e11feef
RGS
2021 NEXT_OFF( convert ) = (U16)(tail - convert);
2022 ARG_SET( convert, data_slot );
786e8c11
YO
2023 /* Store the offset to the first unabsorbed branch in
2024 jump[0], which is otherwise unused by the jump logic.
2025 We use this when dumping a trie and during optimisation. */
2026 if (trie->jump)
7f69552c 2027 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2028
786e8c11
YO
2029 /* XXXX */
2030 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2031 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2032 {
2033 OP( convert ) = TRIEC;
2034 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2035 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2036 trie->bitmap= NULL;
2037 } else
2038 OP( convert ) = TRIE;
a3621e74 2039
3dab1dad
YO
2040 /* store the type in the flags */
2041 convert->flags = nodetype;
a5ca303d
YO
2042 DEBUG_r({
2043 optimize = convert
2044 + NODE_STEP_REGNODE
2045 + regarglen[ OP( convert ) ];
2046 });
2047 /* XXX We really should free up the resource in trie now,
2048 as we won't use them - (which resources?) dmq */
3dab1dad 2049 }
a3621e74 2050 /* needed for dumping*/
e62cc96a 2051 DEBUG_r(if (optimize) {
07be1b83 2052 regnode *opt = convert;
e62cc96a 2053 while ( ++opt < optimize) {
07be1b83
YO
2054 Set_Node_Offset_Length(opt,0,0);
2055 }
786e8c11
YO
2056 /*
2057 Try to clean up some of the debris left after the
2058 optimisation.
a3621e74 2059 */
786e8c11 2060 while( optimize < jumper ) {
07be1b83 2061 mjd_nodelen += Node_Length((optimize));
a3621e74 2062 OP( optimize ) = OPTIMIZED;
07be1b83 2063 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2064 optimize++;
2065 }
07be1b83 2066 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2067 });
2068 } /* end node insert */
55eed653 2069 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2070#ifdef DEBUGGING
2071 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2072 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2073#else
2074 SvREFCNT_dec(revcharmap);
07be1b83 2075#endif
786e8c11
YO
2076 return trie->jump
2077 ? MADE_JUMP_TRIE
2078 : trie->startstate>1
2079 ? MADE_EXACT_TRIE
2080 : MADE_TRIE;
2081}
2082
2083STATIC void
2084S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2085{
2086/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2087
2088 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2089 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2090 ISBN 0-201-10088-6
2091
2092 We find the fail state for each state in the trie, this state is the longest proper
2093 suffix of the current states 'word' that is also a proper prefix of another word in our
2094 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2095 the DFA not to have to restart after its tried and failed a word at a given point, it
2096 simply continues as though it had been matching the other word in the first place.
2097 Consider
2098 'abcdgu'=~/abcdefg|cdgu/
2099 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2100 fail, which would bring use to the state representing 'd' in the second word where we would
2101 try 'g' and succeed, prodceding to match 'cdgu'.
2102 */
2103 /* add a fail transition */
3251b653
NC
2104 const U32 trie_offset = ARG(source);
2105 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2106 U32 *q;
2107 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2108 const U32 numstates = trie->statecount;
786e8c11
YO
2109 const U32 ubound = trie->lasttrans + ucharcount;
2110 U32 q_read = 0;
2111 U32 q_write = 0;
2112 U32 charid;
2113 U32 base = trie->states[ 1 ].trans.base;
2114 U32 *fail;
2115 reg_ac_data *aho;
2116 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2117 GET_RE_DEBUG_FLAGS_DECL;
2118#ifndef DEBUGGING
2119 PERL_UNUSED_ARG(depth);
2120#endif
2121
2122
2123 ARG_SET( stclass, data_slot );
c944940b 2124 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2125 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2126 aho->trie=trie_offset;
446bd890
NC
2127 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2128 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2129 Newxz( q, numstates, U32);
c944940b 2130 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2131 aho->refcount = 1;
2132 fail = aho->fail;
2133 /* initialize fail[0..1] to be 1 so that we always have
2134 a valid final fail state */
2135 fail[ 0 ] = fail[ 1 ] = 1;
2136
2137 for ( charid = 0; charid < ucharcount ; charid++ ) {
2138 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2139 if ( newstate ) {
2140 q[ q_write ] = newstate;
2141 /* set to point at the root */
2142 fail[ q[ q_write++ ] ]=1;
2143 }
2144 }
2145 while ( q_read < q_write) {
2146 const U32 cur = q[ q_read++ % numstates ];
2147 base = trie->states[ cur ].trans.base;
2148
2149 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2150 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2151 if (ch_state) {
2152 U32 fail_state = cur;
2153 U32 fail_base;
2154 do {
2155 fail_state = fail[ fail_state ];
2156 fail_base = aho->states[ fail_state ].trans.base;
2157 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2158
2159 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2160 fail[ ch_state ] = fail_state;
2161 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2162 {
2163 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2164 }
2165 q[ q_write++ % numstates] = ch_state;
2166 }
2167 }
2168 }
2169 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2170 when we fail in state 1, this allows us to use the
2171 charclass scan to find a valid start char. This is based on the principle
2172 that theres a good chance the string being searched contains lots of stuff
2173 that cant be a start char.
2174 */
2175 fail[ 0 ] = fail[ 1 ] = 0;
2176 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2177 PerlIO_printf(Perl_debug_log,
2178 "%*sStclass Failtable (%"UVuf" states): 0",
2179 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2180 );
786e8c11
YO
2181 for( q_read=1; q_read<numstates; q_read++ ) {
2182 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2183 }
2184 PerlIO_printf(Perl_debug_log, "\n");
2185 });
2186 Safefree(q);
2187 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2188}
2189
786e8c11 2190
a3621e74 2191/*
5d1c421c
JH
2192 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2193 * These need to be revisited when a newer toolchain becomes available.
2194 */
2195#if defined(__sparc64__) && defined(__GNUC__)
2196# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2197# undef SPARC64_GCC_WORKAROUND
2198# define SPARC64_GCC_WORKAROUND 1
2199# endif
2200#endif
2201
07be1b83 2202#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2203 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2204 SV * const mysv=sv_newmortal(); \
2205 regnode *Next = regnext(scan); \
2206 regprop(RExC_rx, mysv, scan); \
7f69552c 2207 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2208 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2209 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2210 }});
07be1b83 2211
1de06328
YO
2212
2213
2214
2215
07be1b83
YO
2216#define JOIN_EXACT(scan,min,flags) \
2217 if (PL_regkind[OP(scan)] == EXACT) \
2218 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2219
be8e71aa 2220STATIC U32
07be1b83
YO
2221S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2222 /* Merge several consecutive EXACTish nodes into one. */
2223 regnode *n = regnext(scan);
2224 U32 stringok = 1;
2225 regnode *next = scan + NODE_SZ_STR(scan);
2226 U32 merged = 0;
2227 U32 stopnow = 0;
2228#ifdef DEBUGGING
2229 regnode *stop = scan;
72f13be8 2230 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2231#else
d47053eb
RGS
2232 PERL_UNUSED_ARG(depth);
2233#endif
2234#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2235 PERL_UNUSED_ARG(flags);
2236 PERL_UNUSED_ARG(val);
07be1b83 2237#endif
07be1b83
YO
2238 DEBUG_PEEP("join",scan,depth);
2239
2240 /* Skip NOTHING, merge EXACT*. */
2241 while (n &&
2242 ( PL_regkind[OP(n)] == NOTHING ||
2243 (stringok && (OP(n) == OP(scan))))
2244 && NEXT_OFF(n)
2245 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2246
2247 if (OP(n) == TAIL || n > next)
2248 stringok = 0;
2249 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2250 DEBUG_PEEP("skip:",n,depth);
2251 NEXT_OFF(scan) += NEXT_OFF(n);
2252 next = n + NODE_STEP_REGNODE;
2253#ifdef DEBUGGING
2254 if (stringok)
2255 stop = n;
2256#endif
2257 n = regnext(n);
2258 }
2259 else if (stringok) {
786e8c11 2260 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2261 regnode * const nnext = regnext(n);
2262
2263 DEBUG_PEEP("merg",n,depth);
2264
2265 merged++;
2266 if (oldl + STR_LEN(n) > U8_MAX)
2267 break;
2268 NEXT_OFF(scan) += NEXT_OFF(n);
2269 STR_LEN(scan) += STR_LEN(n);
2270 next = n + NODE_SZ_STR(n);
2271 /* Now we can overwrite *n : */
2272 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2273#ifdef DEBUGGING
2274 stop = next - 1;
2275#endif
2276 n = nnext;
2277 if (stopnow) break;
2278 }
2279
d47053eb
RGS
2280#ifdef EXPERIMENTAL_INPLACESCAN
2281 if (flags && !NEXT_OFF(n)) {
2282 DEBUG_PEEP("atch", val, depth);
2283 if (reg_off_by_arg[OP(n)]) {
2284 ARG_SET(n, val - n);
2285 }
2286 else {
2287 NEXT_OFF(n) = val - n;
2288 }
2289 stopnow = 1;
2290 }
07be1b83
YO
2291#endif
2292 }
2293
2294 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2295 /*
2296 Two problematic code points in Unicode casefolding of EXACT nodes:
2297
2298 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2299 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2300
2301 which casefold to
2302
2303 Unicode UTF-8
2304
2305 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2306 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2307
2308 This means that in case-insensitive matching (or "loose matching",
2309 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2310 length of the above casefolded versions) can match a target string
2311 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2312 This would rather mess up the minimum length computation.
2313
2314 What we'll do is to look for the tail four bytes, and then peek
2315 at the preceding two bytes to see whether we need to decrease
2316 the minimum length by four (six minus two).
2317
2318 Thanks to the design of UTF-8, there cannot be false matches:
2319 A sequence of valid UTF-8 bytes cannot be a subsequence of
2320 another valid sequence of UTF-8 bytes.
2321
2322 */
2323 char * const s0 = STRING(scan), *s, *t;
2324 char * const s1 = s0 + STR_LEN(scan) - 1;
2325 char * const s2 = s1 - 4;
e294cc5d
JH
2326#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2327 const char t0[] = "\xaf\x49\xaf\x42";
2328#else
07be1b83 2329 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2330#endif
07be1b83
YO
2331 const char * const t1 = t0 + 3;
2332
2333 for (s = s0 + 2;
2334 s < s2 && (t = ninstr(s, s1, t0, t1));
2335 s = t + 4) {
e294cc5d
JH
2336#ifdef EBCDIC
2337 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2338 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2339#else
07be1b83
YO
2340 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2341 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2342#endif
07be1b83
YO
2343 *min -= 4;
2344 }
2345 }
2346
2347#ifdef DEBUGGING
2348 /* Allow dumping */
2349 n = scan + NODE_SZ_STR(scan);
2350 while (n <= stop) {
2351 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2352 OP(n) = OPTIMIZED;
2353 NEXT_OFF(n) = 0;
2354 }
2355 n++;
2356 }
2357#endif
2358 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2359 return stopnow;
2360}
2361
653099ff
GS
2362/* REx optimizer. Converts nodes into quickier variants "in place".
2363 Finds fixed substrings. */
2364
a0288114 2365/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2366 to the position after last scanned or to NULL. */
2367
40d049e4
YO
2368#define INIT_AND_WITHP \
2369 assert(!and_withp); \
2370 Newx(and_withp,1,struct regnode_charclass_class); \
2371 SAVEFREEPV(and_withp)
07be1b83 2372
b515a41d
YO
2373/* this is a chain of data about sub patterns we are processing that
2374 need to be handled seperately/specially in study_chunk. Its so
2375 we can simulate recursion without losing state. */
2376struct scan_frame;
2377typedef struct scan_frame {
2378 regnode *last; /* last node to process in this frame */
2379 regnode *next; /* next node to process when last is reached */
2380 struct scan_frame *prev; /*previous frame*/
2381 I32 stop; /* what stopparen do we use */
2382} scan_frame;
2383
304ee84b
YO
2384
2385#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2386
76e3520e 2387STATIC I32
40d049e4 2388S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2389 I32 *minlenp, I32 *deltap,
40d049e4
YO
2390 regnode *last,
2391 scan_data_t *data,
2392 I32 stopparen,
2393 U8* recursed,
2394 struct regnode_charclass_class *and_withp,
2395 U32 flags, U32 depth)
c277df42
IZ
2396 /* scanp: Start here (read-write). */
2397 /* deltap: Write maxlen-minlen here. */
2398 /* last: Stop before this one. */
40d049e4
YO
2399 /* data: string data about the pattern */
2400 /* stopparen: treat close N as END */
2401 /* recursed: which subroutines have we recursed into */
2402 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2403{
97aff369 2404 dVAR;
c277df42
IZ
2405 I32 min = 0, pars = 0, code;
2406 regnode *scan = *scanp, *next;
2407 I32 delta = 0;
2408 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2409 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2410 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2411 scan_data_t data_fake;
a3621e74 2412 SV *re_trie_maxbuff = NULL;
786e8c11 2413 regnode *first_non_open = scan;
e2e6a0f1 2414 I32 stopmin = I32_MAX;
8aa23a47
YO
2415 scan_frame *frame = NULL;
2416
a3621e74 2417 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2418
13a24bad 2419#ifdef DEBUGGING
40d049e4 2420 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2421#endif
40d049e4 2422
786e8c11 2423 if ( depth == 0 ) {
40d049e4 2424 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2425 first_non_open=regnext(first_non_open);
2426 }
2427
b81d288d 2428
8aa23a47
YO
2429 fake_study_recurse:
2430 while ( scan && OP(scan) != END && scan < last ){
2431 /* Peephole optimizer: */
304ee84b 2432 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2433 DEBUG_PEEP("Peep",scan,depth);
2434 JOIN_EXACT(scan,&min,0);
2435
2436 /* Follow the next-chain of the current node and optimize
2437 away all the NOTHINGs from it. */
2438 if (OP(scan) != CURLYX) {
2439 const int max = (reg_off_by_arg[OP(scan)]
2440 ? I32_MAX
2441 /* I32 may be smaller than U16 on CRAYs! */
2442 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2443 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2444 int noff;
2445 regnode *n = scan;
2446
2447 /* Skip NOTHING and LONGJMP. */
2448 while ((n = regnext(n))
2449 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2450 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2451 && off + noff < max)
2452 off += noff;
2453 if (reg_off_by_arg[OP(scan)])
2454 ARG(scan) = off;
2455 else
2456 NEXT_OFF(scan) = off;
2457 }
a3621e74 2458
c277df42 2459
8aa23a47
YO
2460
2461 /* The principal pseudo-switch. Cannot be a switch, since we
2462 look into several different things. */
2463 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2464 || OP(scan) == IFTHEN) {
2465 next = regnext(scan);
2466 code = OP(scan);
2467 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2468
2469 if (OP(next) == code || code == IFTHEN) {
2470 /* NOTE - There is similar code to this block below for handling
2471 TRIE nodes on a re-study. If you change stuff here check there
2472 too. */
2473 I32 max1 = 0, min1 = I32_MAX, num = 0;
2474 struct regnode_charclass_class accum;
2475 regnode * const startbranch=scan;
2476
2477 if (flags & SCF_DO_SUBSTR)
304ee84b 2478 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2479 if (flags & SCF_DO_STCLASS)
2480 cl_init_zero(pRExC_state, &accum);
2481
2482 while (OP(scan) == code) {
2483 I32 deltanext, minnext, f = 0, fake;
2484 struct regnode_charclass_class this_class;
2485
2486 num++;
2487 data_fake.flags = 0;
2488 if (data) {
2489 data_fake.whilem_c = data->whilem_c;
2490 data_fake.last_closep = data->last_closep;
2491 }
2492 else
2493 data_fake.last_closep = &fake;
58e23c8d
YO
2494
2495 data_fake.pos_delta = delta;
8aa23a47
YO
2496 next = regnext(scan);
2497 scan = NEXTOPER(scan);
2498 if (code != BRANCH)
c277df42 2499 scan = NEXTOPER(scan);
8aa23a47
YO
2500 if (flags & SCF_DO_STCLASS) {
2501 cl_init(pRExC_state, &this_class);
2502 data_fake.start_class = &this_class;
2503 f = SCF_DO_STCLASS_AND;
58e23c8d 2504 }
8aa23a47
YO
2505 if (flags & SCF_WHILEM_VISITED_POS)
2506 f |= SCF_WHILEM_VISITED_POS;
2507
2508 /* we suppose the run is continuous, last=next...*/
2509 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2510 next, &data_fake,
2511 stopparen, recursed, NULL, f,depth+1);
2512 if (min1 > minnext)
2513 min1 = minnext;
2514 if (max1 < minnext + deltanext)
2515 max1 = minnext + deltanext;
2516 if (deltanext == I32_MAX)
2517 is_inf = is_inf_internal = 1;
2518 scan = next;
2519 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2520 pars++;
2521 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2522 if ( stopmin > minnext)
2523 stopmin = min + min1;
2524 flags &= ~SCF_DO_SUBSTR;
2525 if (data)
2526 data->flags |= SCF_SEEN_ACCEPT;
2527 }
2528 if (data) {
2529 if (data_fake.flags & SF_HAS_EVAL)
2530 data->flags |= SF_HAS_EVAL;
2531 data->whilem_c = data_fake.whilem_c;
3dab1dad 2532 }
8aa23a47
YO
2533 if (flags & SCF_DO_STCLASS)
2534 cl_or(pRExC_state, &accum, &this_class);
2535 }
2536 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2537 min1 = 0;
2538 if (flags & SCF_DO_SUBSTR) {
2539 data->pos_min += min1;
2540 data->pos_delta += max1 - min1;
2541 if (max1 != min1 || is_inf)
2542 data->longest = &(data->longest_float);
2543 }
2544 min += min1;
2545 delta += max1 - min1;
2546 if (flags & SCF_DO_STCLASS_OR) {
2547 cl_or(pRExC_state, data->start_class, &accum);
2548 if (min1) {
2549 cl_and(data->start_class, and_withp);
2550 flags &= ~SCF_DO_STCLASS;
653099ff 2551 }
8aa23a47
YO
2552 }
2553 else if (flags & SCF_DO_STCLASS_AND) {
2554 if (min1) {
2555 cl_and(data->start_class, &accum);
2556 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2557 }
8aa23a47
YO
2558 else {
2559 /* Switch to OR mode: cache the old value of
2560 * data->start_class */
2561 INIT_AND_WITHP;
2562 StructCopy(data->start_class, and_withp,
2563 struct regnode_charclass_class);
2564 flags &= ~SCF_DO_STCLASS_AND;
2565 StructCopy(&accum, data->start_class,
2566 struct regnode_charclass_class);
2567 flags |= SCF_DO_STCLASS_OR;
2568 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2569 }
8aa23a47 2570 }
a3621e74 2571
8aa23a47
YO
2572 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2573 /* demq.
a3621e74 2574
8aa23a47
YO
2575 Assuming this was/is a branch we are dealing with: 'scan' now
2576 points at the item that follows the branch sequence, whatever
2577 it is. We now start at the beginning of the sequence and look
2578 for subsequences of
a3621e74 2579
8aa23a47
YO
2580 BRANCH->EXACT=>x1
2581 BRANCH->EXACT=>x2
2582 tail
a3621e74 2583
8aa23a47 2584 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2585
8aa23a47
YO
2586 If we can find such a subseqence we need to turn the first
2587 element into a trie and then add the subsequent branch exact
2588 strings to the trie.
a3621e74 2589
8aa23a47 2590 We have two cases
a3621e74 2591
8aa23a47 2592 1. patterns where the whole set of branch can be converted.
a3621e74 2593
8aa23a47 2594 2. patterns where only a subset can be converted.
a3621e74 2595
8aa23a47
YO
2596 In case 1 we can replace the whole set with a single regop
2597 for the trie. In case 2 we need to keep the start and end
2598 branchs so
a3621e74 2599
8aa23a47
YO
2600 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2601 becomes BRANCH TRIE; BRANCH X;
786e8c11 2602
8aa23a47
YO
2603 There is an additional case, that being where there is a
2604 common prefix, which gets split out into an EXACT like node
2605 preceding the TRIE node.
a3621e74 2606
8aa23a47
YO
2607 If x(1..n)==tail then we can do a simple trie, if not we make
2608 a "jump" trie, such that when we match the appropriate word
2609 we "jump" to the appopriate tail node. Essentailly we turn
2610 a nested if into a case structure of sorts.
b515a41d 2611
8aa23a47
YO
2612 */
2613
2614 int made=0;
2615 if (!re_trie_maxbuff) {
2616 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2617 if (!SvIOK(re_trie_maxbuff))
2618 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2619 }
2620 if ( SvIV(re_trie_maxbuff)>=0 ) {
2621 regnode *cur;
2622 regnode *first = (regnode *)NULL;
2623 regnode *last = (regnode *)NULL;
2624 regnode *tail = scan;
2625 U8 optype = 0;
2626 U32 count=0;
a3621e74
YO
2627
2628#ifdef DEBUGGING
8aa23a47 2629 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2630#endif
8aa23a47
YO
2631 /* var tail is used because there may be a TAIL
2632 regop in the way. Ie, the exacts will point to the
2633 thing following the TAIL, but the last branch will
2634 point at the TAIL. So we advance tail. If we
2635 have nested (?:) we may have to move through several
2636 tails.
2637 */
2638
2639 while ( OP( tail ) == TAIL ) {
2640 /* this is the TAIL generated by (?:) */
2641 tail = regnext( tail );
2642 }
a3621e74 2643
8aa23a47
YO
2644
2645 DEBUG_OPTIMISE_r({
2646 regprop(RExC_rx, mysv, tail );
2647 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2648 (int)depth * 2 + 2, "",
2649 "Looking for TRIE'able sequences. Tail node is: ",
2650 SvPV_nolen_const( mysv )
2651 );
2652 });
2653
2654 /*
2655
2656 step through the branches, cur represents each
2657 branch, noper is the first thing to be matched
2658 as part of that branch and noper_next is the
2659 regnext() of that node. if noper is an EXACT
2660 and noper_next is the same as scan (our current
2661 position in the regex) then the EXACT branch is
2662 a possible optimization target. Once we have
2663 two or more consequetive such branches we can
2664 create a trie of the EXACT's contents and stich
2665 it in place. If the sequence represents all of
2666 the branches we eliminate the whole thing and
2667 replace it with a single TRIE. If it is a
2668 subsequence then we need to stitch it in. This
2669 means the first branch has to remain, and needs
2670 to be repointed at the item on the branch chain
2671 following the last branch optimized. This could
2672 be either a BRANCH, in which case the
2673 subsequence is internal, or it could be the
2674 item following the branch sequence in which
2675 case the subsequence is at the end.
2676
2677 */
2678
2679 /* dont use tail as the end marker for this traverse */
2680 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2681 regnode * const noper = NEXTOPER( cur );
b515a41d 2682#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2683 regnode * const noper_next = regnext( noper );
b515a41d
YO
2684#endif
2685
8aa23a47
YO
2686 DEBUG_OPTIMISE_r({
2687 regprop(RExC_rx, mysv, cur);
2688 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2689 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2690
2691 regprop(RExC_rx, mysv, noper);
2692 PerlIO_printf( Perl_debug_log, " -> %s",
2693 SvPV_nolen_const(mysv));
2694
2695 if ( noper_next ) {
2696 regprop(RExC_rx, mysv, noper_next );
2697 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2698 SvPV_nolen_const(mysv));
2699 }
2700 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2701 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2702 });
2703 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2704 : PL_regkind[ OP( noper ) ] == EXACT )
2705 || OP(noper) == NOTHING )
786e8c11 2706#ifdef NOJUMPTRIE
8aa23a47 2707 && noper_next == tail
786e8c11 2708#endif
8aa23a47
YO
2709 && count < U16_MAX)
2710 {
2711 count++;
2712 if ( !first || optype == NOTHING ) {
2713 if (!first) first = cur;
2714 optype = OP( noper );
2715 } else {
2716 last = cur;
2717 }
2718 } else {
2719 if ( last ) {
2720 make_trie( pRExC_state,
2721 startbranch, first, cur, tail, count,
2722 optype, depth+1 );
2723 }
2724 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2725#ifdef NOJUMPTRIE
8aa23a47 2726 && noper_next == tail
786e8c11 2727#endif
8aa23a47
YO
2728 ){
2729 count = 1;
2730 first = cur;
2731 optype = OP( noper );
2732 } else {
2733 count = 0;
2734 first = NULL;
2735 optype = 0;
2736 }
2737 last = NULL;
2738 }
2739 }
2740 DEBUG_OPTIMISE_r({
2741 regprop(RExC_rx, mysv, cur);
2742 PerlIO_printf( Perl_debug_log,
2743 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2744 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2745
2746 });
2747 if ( last ) {
2748 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2749#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2750 if ( ((made == MADE_EXACT_TRIE &&
2751 startbranch == first)
2752 || ( first_non_open == first )) &&
2753 depth==0 ) {
2754 flags |= SCF_TRIE_RESTUDY;
2755 if ( startbranch == first
2756 && scan == tail )
2757 {
2758 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2759 }
2760 }
3dab1dad 2761#endif
8aa23a47
YO
2762 }
2763 }
2764
2765 } /* do trie */
2766
653099ff 2767 }
8aa23a47
YO
2768 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2769 scan = NEXTOPER(NEXTOPER(scan));
2770 } else /* single branch is optimized. */
2771 scan = NEXTOPER(scan);
2772 continue;
2773 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2774 scan_frame *newframe = NULL;
2775 I32 paren;
2776 regnode *start;
2777 regnode *end;
2778
2779 if (OP(scan) != SUSPEND) {
2780 /* set the pointer */
2781 if (OP(scan) == GOSUB) {
2782 paren = ARG(scan);
2783 RExC_recurse[ARG2L(scan)] = scan;
2784 start = RExC_open_parens[paren-1];
2785 end = RExC_close_parens[paren-1];
2786 } else {
2787 paren = 0;
f8fc2ecf 2788 start = RExC_rxi->program + 1;
8aa23a47
YO
2789 end = RExC_opend;
2790 }
2791 if (!recursed) {
2792 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2793 SAVEFREEPV(recursed);
2794 }
2795 if (!PAREN_TEST(recursed,paren+1)) {
2796 PAREN_SET(recursed,paren+1);
2797 Newx(newframe,1,scan_frame);
2798 } else {
2799 if (flags & SCF_DO_SUBSTR) {
304ee84b 2800 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2801 data->longest = &(data->longest_float);
2802 }
2803 is_inf = is_inf_internal = 1;
2804 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2805 cl_anything(pRExC_state, data->start_class);
2806 flags &= ~SCF_DO_STCLASS;
2807 }
2808 } else {
2809 Newx(newframe,1,scan_frame);
2810 paren = stopparen;
2811 start = scan+2;
2812 end = regnext(scan);
2813 }
2814 if (newframe) {
2815 assert(start);
2816 assert(end);
2817 SAVEFREEPV(newframe);
2818 newframe->next = regnext(scan);
2819 newframe->last = last;
2820 newframe->stop = stopparen;
2821 newframe->prev = frame;
2822
2823 frame = newframe;
2824 scan = start;
2825 stopparen = paren;
2826 last = end;
2827
2828 continue;
2829 }
2830 }
2831 else if (OP(scan) == EXACT) {
2832 I32 l = STR_LEN(scan);
2833 UV uc;
2834 if (UTF) {
2835 const U8 * const s = (U8*)STRING(scan);
2836 l = utf8_length(s, s + l);
2837 uc = utf8_to_uvchr(s, NULL);
2838 } else {
2839 uc = *((U8*)STRING(scan));
2840 }
2841 min += l;
2842 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2843 /* The code below prefers earlier match for fixed
2844 offset, later match for variable offset. */
2845 if (data->last_end == -1) { /* Update the start info. */
2846 data->last_start_min = data->pos_min;
2847 data->last_start_max = is_inf
2848 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2849 }
8aa23a47
YO
2850 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2851 if (UTF)
2852 SvUTF8_on(data->last_found);
2853 {
2854 SV * const sv = data->last_found;
2855 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2856 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2857 if (mg && mg->mg_len >= 0)
2858 mg->mg_len += utf8_length((U8*)STRING(scan),
2859 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2860 }
8aa23a47
YO
2861 data->last_end = data->pos_min + l;
2862 data->pos_min += l; /* As in the first entry. */
2863 data->flags &= ~SF_BEFORE_EOL;
2864 }
2865 if (flags & SCF_DO_STCLASS_AND) {
2866 /* Check whether it is compatible with what we know already! */
2867 int compat = 1;
2868
2869 if (uc >= 0x100 ||
2870 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2871 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2872 && (!(data->start_class->flags & ANYOF_FOLD)
2873 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2874 )
2875 compat = 0;
2876 ANYOF_CLASS_ZERO(data->start_class);
2877 ANYOF_BITMAP_ZERO(data->start_class);
2878 if (compat)
2879 ANYOF_BITMAP_SET(data->start_class, uc);
2880 data->start_class->flags &= ~ANYOF_EOS;
2881 if (uc < 0x100)
2882 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2883 }
2884 else if (flags & SCF_DO_STCLASS_OR) {
2885 /* false positive possible if the class is case-folded */
2886 if (uc < 0x100)
2887 ANYOF_BITMAP_SET(data->start_class, uc);
2888 else
2889 data->start_class->flags |= ANYOF_UNICODE_ALL;
2890 data->start_class->flags &= ~ANYOF_EOS;
2891 cl_and(data->start_class, and_withp);
2892 }
2893 flags &= ~SCF_DO_STCLASS;
2894 }
2895 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2896 I32 l = STR_LEN(scan);
2897 UV uc = *((U8*)STRING(scan));
2898
2899 /* Search for fixed substrings supports EXACT only. */
2900 if (flags & SCF_DO_SUBSTR) {
2901 assert(data);
304ee84b 2902 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2903 }
2904 if (UTF) {
2905 const U8 * const s = (U8 *)STRING(scan);
2906 l = utf8_length(s, s + l);
2907 uc = utf8_to_uvchr(s, NULL);
2908 }
2909 min += l;
2910 if (flags & SCF_DO_SUBSTR)
2911 data->pos_min += l;
2912 if (flags & SCF_DO_STCLASS_AND) {
2913 /* Check whether it is compatible with what we know already! */
2914 int compat = 1;
2915
2916 if (uc >= 0x100 ||
2917 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2918 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2919 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2920 compat = 0;
2921 ANYOF_CLASS_ZERO(data->start_class);
2922 ANYOF_BITMAP_ZERO(data->start_class);
2923 if (compat) {
2924 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2925 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2926 data->start_class->flags |= ANYOF_FOLD;
2927 if (OP(scan) == EXACTFL)
2928 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2929 }
8aa23a47
YO
2930 }
2931 else if (flags & SCF_DO_STCLASS_OR) {
2932 if (data->start_class->flags & ANYOF_FOLD) {
2933 /* false positive possible if the class is case-folded.
2934 Assume that the locale settings are the same... */
1aa99e6b
IH
2935 if (uc < 0x100)
2936 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2937 data->start_class->flags &= ~ANYOF_EOS;
2938 }
8aa23a47 2939 cl_and(data->start_class, and_withp);
653099ff 2940 }
8aa23a47
YO
2941 flags &= ~SCF_DO_STCLASS;
2942 }
2943 else if (strchr((const char*)PL_varies,OP(scan))) {
2944 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2945 I32 f = flags, pos_before = 0;
2946 regnode * const oscan = scan;
2947 struct regnode_charclass_class this_class;
2948 struct regnode_charclass_class *oclass = NULL;
2949 I32 next_is_eval = 0;
2950
2951 switch (PL_regkind[OP(scan)]) {
2952 case WHILEM: /* End of (?:...)* . */
2953 scan = NEXTOPER(scan);
2954 goto finish;
2955 case PLUS:
2956 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2957 next = NEXTOPER(scan);
2958 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2959 mincount = 1;
2960 maxcount = REG_INFTY;
2961 next = regnext(scan);
2962 scan = NEXTOPER(scan);
2963 goto do_curly;
2964 }
2965 }
2966 if (flags & SCF_DO_SUBSTR)
2967 data->pos_min++;
2968 min++;
2969 /* Fall through. */
2970 case STAR:
2971 if (flags & SCF_DO_STCLASS) {
2972 mincount = 0;
2973 maxcount = REG_INFTY;
2974 next = regnext(scan);
2975 scan = NEXTOPER(scan);
2976 goto do_curly;
2977 }
2978 is_inf = is_inf_internal = 1;
2979 scan = regnext(scan);
c277df42 2980 if (flags & SCF_DO_SUBSTR) {
304ee84b 2981 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2982 data->longest = &(data->longest_float);
c277df42 2983 }
8aa23a47
YO
2984 goto optimize_curly_tail;
2985 case CURLY:
2986 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2987 && (scan->flags == stopparen))
2988 {
2989 mincount = 1;
2990 maxcount = 1;
2991 } else {
2992 mincount = ARG1(scan);
2993 maxcount = ARG2(scan);
653099ff 2994 }
8aa23a47
YO
2995 next = regnext(scan);
2996 if (OP(scan) == CURLYX) {
2997 I32 lp = (data ? *(data->last_closep) : 0);
2998 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 2999 }
8aa23a47
YO
3000 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3001 next_is_eval = (OP(scan) == EVAL);
3002 do_curly:
3003 if (flags & SCF_DO_SUBSTR) {
304ee84b 3004 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3005 pos_before = data->pos_min;
b45f050a 3006 }
8aa23a47
YO
3007 if (data) {
3008 fl = data->flags;
3009 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3010 if (is_inf)
3011 data->flags |= SF_IS_INF;
3012 }
3013 if (flags & SCF_DO_STCLASS) {
3014 cl_init(pRExC_state, &this_class);
3015 oclass = data->start_class;
3016 data->start_class = &this_class;
3017 f |= SCF_DO_STCLASS_AND;
3018 f &= ~SCF_DO_STCLASS_OR;
3019 }
3020 /* These are the cases when once a subexpression
3021 fails at a particular position, it cannot succeed
3022 even after backtracking at the enclosing scope.
3023
3024 XXXX what if minimal match and we are at the
3025 initial run of {n,m}? */
3026 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3027 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3028
8aa23a47
YO
3029 /* This will finish on WHILEM, setting scan, or on NULL: */
3030 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3031 last, data, stopparen, recursed, NULL,
3032 (mincount == 0
3033 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3034
8aa23a47
YO
3035 if (flags & SCF_DO_STCLASS)
3036 data->start_class = oclass;
3037 if (mincount == 0 || minnext == 0) {
3038 if (flags & SCF_DO_STCLASS_OR) {
3039 cl_or(pRExC_state, data->start_class, &this_class);
3040 }
3041 else if (flags & SCF_DO_STCLASS_AND) {
3042 /* Switch to OR mode: cache the old value of
3043 * data->start_class */
3044 INIT_AND_WITHP;
3045 StructCopy(data->start_class, and_withp,
3046 struct regnode_charclass_class);
3047 flags &= ~SCF_DO_STCLASS_AND;
3048 StructCopy(&this_class, data->start_class,
3049 struct regnode_charclass_class);
3050 flags |= SCF_DO_STCLASS_OR;
3051 data->start_class->flags |= ANYOF_EOS;
3052 }
3053 } else { /* Non-zero len */
3054 if (flags & SCF_DO_STCLASS_OR) {
3055 cl_or(pRExC_state, data->start_class, &this_class);
3056 cl_and(data->start_class, and_withp);
3057 }
3058 else if (flags & SCF_DO_STCLASS_AND)
3059 cl_and(data->start_class, &this_class);
3060 flags &= ~SCF_DO_STCLASS;
3061 }
3062 if (!scan) /* It was not CURLYX, but CURLY. */
3063 scan = next;
3064 if ( /* ? quantifier ok, except for (?{ ... }) */
3065 (next_is_eval || !(mincount == 0 && maxcount == 1))
3066 && (minnext == 0) && (deltanext == 0)
3067 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3068 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3069 && ckWARN(WARN_REGEXP))
3070 {
3071 vWARN(RExC_parse,
3072 "Quantifier unexpected on zero-length expression");
3073 }
3074
3075 min += minnext * mincount;
3076 is_inf_internal |= ((maxcount == REG_INFTY
3077 && (minnext + deltanext) > 0)
3078 || deltanext == I32_MAX);
3079 is_inf |= is_inf_internal;
3080 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3081
3082 /* Try powerful optimization CURLYX => CURLYN. */
3083 if ( OP(oscan) == CURLYX && data
3084 && data->flags & SF_IN_PAR
3085 && !(data->flags & SF_HAS_EVAL)
3086 && !deltanext && minnext == 1 ) {
3087 /* Try to optimize to CURLYN. */
3088 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3089 regnode * const nxt1 = nxt;
497b47a8 3090#ifdef DEBUGGING
8aa23a47 3091 regnode *nxt2;
497b47a8 3092#endif
c277df42 3093
8aa23a47
YO
3094 /* Skip open. */
3095 nxt = regnext(nxt);
3096 if (!strchr((const char*)PL_simple,OP(nxt))
3097 && !(PL_regkind[OP(nxt)] == EXACT
3098 && STR_LEN(nxt) == 1))
3099 goto nogo;
497b47a8 3100#ifdef DEBUGGING
8aa23a47 3101 nxt2 = nxt;
497b47a8 3102#endif
8aa23a47
YO
3103 nxt = regnext(nxt);
3104 if (OP(nxt) != CLOSE)
3105 goto nogo;
3106 if (RExC_open_parens) {
3107 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3108 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3109 }
3110 /* Now we know that nxt2 is the only contents: */
3111 oscan->flags = (U8)ARG(nxt);
3112 OP(oscan) = CURLYN;
3113 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3114
c277df42 3115#ifdef DEBUGGING
8aa23a47
YO
3116 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3117 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3118 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3119 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3120 OP(nxt + 1) = OPTIMIZED; /* was count. */
3121 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3122#endif
8aa23a47
YO
3123 }
3124 nogo:
3125
3126 /* Try optimization CURLYX => CURLYM. */
3127 if ( OP(oscan) == CURLYX && data
3128 && !(data->flags & SF_HAS_PAR)
3129 && !(data->flags & SF_HAS_EVAL)
3130 && !deltanext /* atom is fixed width */
3131 && minnext != 0 /* CURLYM can't handle zero width */
3132 ) {
3133 /* XXXX How to optimize if data == 0? */
3134 /* Optimize to a simpler form. */
3135 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3136 regnode *nxt2;
3137
3138 OP(oscan) = CURLYM;
3139 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3140 && (OP(nxt2) != WHILEM))
3141 nxt = nxt2;
3142 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3143 /* Need to optimize away parenths. */
3144 if (data->flags & SF_IN_PAR) {
3145 /* Set the parenth number. */
3146 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3147
3148 if (OP(nxt) != CLOSE)
3149 FAIL("Panic opt close");
3150 oscan->flags = (U8)ARG(nxt);
3151 if (RExC_open_parens) {
3152 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3153 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3154 }
8aa23a47
YO
3155 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3156 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3157
c277df42 3158#ifdef DEBUGGING
8aa23a47
YO
3159 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3160 OP(nxt + 1) = OPTIMIZED; /* was count. */
3161 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3162 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3163#endif
c277df42 3164#if 0
8aa23a47
YO
3165 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3166 regnode *nnxt = regnext(nxt1);
3167
3168 if (nnxt == nxt) {
3169 if (reg_off_by_arg[OP(nxt1)])
3170 ARG_SET(nxt1, nxt2 - nxt1);
3171 else if (nxt2 - nxt1 < U16_MAX)
3172 NEXT_OFF(nxt1) = nxt2 - nxt1;
3173 else
3174 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3175 }
8aa23a47 3176 nxt1 = nnxt;
c277df42 3177 }
5d1c421c 3178#endif
8aa23a47
YO
3179 /* Optimize again: */
3180 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3181 NULL, stopparen, recursed, NULL, 0,depth+1);
3182 }
3183 else
3184 oscan->flags = 0;
3185 }
3186 else if ((OP(oscan) == CURLYX)
3187 && (flags & SCF_WHILEM_VISITED_POS)
3188 /* See the comment on a similar expression above.
3189 However, this time it not a subexpression
3190 we care about, but the expression itself. */
3191 && (maxcount == REG_INFTY)
3192 && data && ++data->whilem_c < 16) {
3193 /* This stays as CURLYX, we can put the count/of pair. */
3194 /* Find WHILEM (as in regexec.c) */
3195 regnode *nxt = oscan + NEXT_OFF(oscan);
3196
3197 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3198 nxt += ARG(nxt);
3199 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3200 | (RExC_whilem_seen << 4)); /* On WHILEM */
3201 }
3202 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3203 pars++;
3204 if (flags & SCF_DO_SUBSTR) {
3205 SV *last_str = NULL;
3206 int counted = mincount != 0;
a0ed51b3 3207
8aa23a47
YO
3208 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3209#if defined(SPARC64_GCC_WORKAROUND)
3210 I32 b = 0;
3211 STRLEN l = 0;
3212 const char *s = NULL;
3213 I32 old = 0;
b515a41d 3214
8aa23a47
YO
3215 if (pos_before >= data->last_start_min)
3216 b = pos_before;
3217 else
3218 b = data->last_start_min;
b515a41d 3219
8aa23a47
YO
3220 l = 0;
3221 s = SvPV_const(data->last_found, l);
3222 old = b - data->last_start_min;
3223
3224#else
3225 I32 b = pos_before >= data->last_start_min
3226 ? pos_before : data->last_start_min;
3227 STRLEN l;
3228 const char * const s = SvPV_const(data->last_found, l);
3229 I32 old = b - data->last_start_min;
3230#endif
3231
3232 if (UTF)
3233 old = utf8_hop((U8*)s, old) - (U8*)s;
3234
3235 l -= old;
3236 /* Get the added string: */
3237 last_str = newSVpvn(s + old, l);
3238 if (UTF)
3239 SvUTF8_on(last_str);
3240 if (deltanext == 0 && pos_before == b) {
3241 /* What was added is a constant string */
3242 if (mincount > 1) {
3243 SvGROW(last_str, (mincount * l) + 1);
3244 repeatcpy(SvPVX(last_str) + l,
3245 SvPVX_const(last_str), l, mincount - 1);
3246 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3247 /* Add additional parts. */
3248 SvCUR_set(data->last_found,
3249 SvCUR(data->last_found) - l);
3250 sv_catsv(data->last_found, last_str);
3251 {
3252 SV * sv = data->last_found;
3253 MAGIC *mg =
3254 SvUTF8(sv) && SvMAGICAL(sv) ?
3255 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3256 if (mg && mg->mg_len >= 0)
3257 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3258 }
8aa23a47 3259 data->last_end += l * (mincount - 1);
b515a41d 3260 }
8aa23a47
YO
3261 } else {
3262 /* start offset must point into the last copy */
3263 data->last_start_min += minnext * (mincount - 1);
3264 data->last_start_max += is_inf ? I32_MAX
3265 : (maxcount - 1) * (minnext + data->pos_delta);
3266 }
c277df42 3267 }
8aa23a47
YO
3268 /* It is counted once already... */
3269 data->pos_min += minnext * (mincount - counted);
3270 data->pos_delta += - counted * deltanext +
3271 (minnext + deltanext) * maxcount - minnext * mincount;
3272 if (mincount != maxcount) {
3273 /* Cannot extend fixed substrings found inside
3274 the group. */
304ee84b 3275 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3276 if (mincount && last_str) {
3277 SV * const sv = data->last_found;
3278 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3279 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3280
3281 if (mg)
3282 mg->mg_len = -1;
3283 sv_setsv(sv, last_str);
3284 data->last_end = data->pos_min;
3285 data->last_start_min =
3286 data->pos_min - CHR_SVLEN(last_str);
3287 data->last_start_max = is_inf
3288 ? I32_MAX
3289 : data->pos_min + data->pos_delta
3290 - CHR_SVLEN(last_str);
3291 }
3292 data->longest = &(data->longest_float);
3293 }
3294 SvREFCNT_dec(last_str);
c277df42 3295 }
8aa23a47
YO
3296 if (data && (fl & SF_HAS_EVAL))
3297 data->flags |= SF_HAS_EVAL;
3298 optimize_curly_tail:
3299 if (OP(oscan) != CURLYX) {
3300 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3301 && NEXT_OFF(next))
3302 NEXT_OFF(oscan) += NEXT_OFF(next);
3303 }
3304 continue;
3305 default: /* REF and CLUMP only? */
3306 if (flags & SCF_DO_SUBSTR) {
304ee84b 3307 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3308 data->longest = &(data->longest_float);
3309 }
3310 is_inf = is_inf_internal = 1;
3311 if (flags & SCF_DO_STCLASS_OR)
3312 cl_anything(pRExC_state, data->start_class);
3313 flags &= ~SCF_DO_STCLASS;
3314 break;
c277df42 3315 }
8aa23a47
YO
3316 }
3317 else if (strchr((const char*)PL_simple,OP(scan))) {
3318 int value = 0;
653099ff 3319
8aa23a47 3320 if (flags & SCF_DO_SUBSTR) {
304ee84b 3321 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3322 data->pos_min++;
3323 }
3324 min++;
3325 if (flags & SCF_DO_STCLASS) {
3326 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3327
8aa23a47
YO
3328 /* Some of the logic below assumes that switching
3329 locale on will only add false positives. */
3330 switch (PL_regkind[OP(scan)]) {
3331 case SANY:
3332 default:
3333 do_default:
3334 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3335 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3336 cl_anything(pRExC_state, data->start_class);
3337 break;
3338 case REG_ANY:
3339 if (OP(scan) == SANY)
3340 goto do_default;
3341 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3342 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3343 || (data->start_class->flags & ANYOF_CLASS));
3344 cl_anything(pRExC_state, data->start_class);
653099ff 3345 }
8aa23a47
YO
3346 if (flags & SCF_DO_STCLASS_AND || !value)
3347 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3348 break;
3349 case ANYOF:
3350 if (flags & SCF_DO_STCLASS_AND)
3351 cl_and(data->start_class,
3352 (struct regnode_charclass_class*)scan);
653099ff 3353 else
8aa23a47
YO
3354 cl_or(pRExC_state, data->start_class,
3355 (struct regnode_charclass_class*)scan);
3356 break;
3357 case ALNUM:
3358 if (flags & SCF_DO_STCLASS_AND) {
3359 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3361 for (value = 0; value < 256; value++)
3362 if (!isALNUM(value))
3363 ANYOF_BITMAP_CLEAR(data->start_class, value);
3364 }
653099ff 3365 }
8aa23a47
YO
3366 else {
3367 if (data->start_class->flags & ANYOF_LOCALE)
3368 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3369 else {
3370 for (value = 0; value < 256; value++)
3371 if (isALNUM(value))
3372 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3373 }
8aa23a47
YO
3374 }
3375 break;
3376 case ALNUML:
3377 if (flags & SCF_DO_STCLASS_AND) {
3378 if (data->start_class->flags & ANYOF_LOCALE)
3379 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3380 }
3381 else {
3382 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3383 data->start_class->flags |= ANYOF_LOCALE;
3384 }
3385 break;
3386 case NALNUM:
3387 if (flags & SCF_DO_STCLASS_AND) {
3388 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3389 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3390 for (value = 0; value < 256; value++)
3391 if (isALNUM(value))
3392 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3393 }
3394 }
8aa23a47
YO
3395 else {
3396 if (data->start_class->flags & ANYOF_LOCALE)
3397 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3398 else {
3399 for (value = 0; value < 256; value++)
3400 if (!isALNUM(value))
3401 ANYOF_BITMAP_SET(data->start_class, value);
3402 }
653099ff 3403 }
8aa23a47
YO
3404 break;
3405 case NALNUML:
3406 if (flags & SCF_DO_STCLASS_AND) {
3407 if (data->start_class->flags & ANYOF_LOCALE)
3408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3409 }
8aa23a47
YO
3410 else {
3411 data->start_class->flags |= ANYOF_LOCALE;
3412 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3413 }
3414 break;
3415 case SPACE:
3416 if (flags & SCF_DO_STCLASS_AND) {
3417 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3418 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3419 for (value = 0; value < 256; value++)
3420 if (!isSPACE(value))
3421 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3422 }
3423 }
8aa23a47
YO
3424 else {
3425 if (data->start_class->flags & ANYOF_LOCALE)
3426 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3427 else {
3428 for (value = 0; value < 256; value++)
3429 if (isSPACE(value))
3430 ANYOF_BITMAP_SET(data->start_class, value);
3431 }
653099ff 3432 }
8aa23a47
YO
3433 break;
3434 case SPACEL:
3435 if (flags & SCF_DO_STCLASS_AND) {
3436 if (data->start_class->flags & ANYOF_LOCALE)
3437 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3438 }
3439 else {
3440 data->start_class->flags |= ANYOF_LOCALE;
3441 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3442 }
3443 break;
3444 case NSPACE:
3445 if (flags & SCF_DO_STCLASS_AND) {
3446 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3447 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3448 for (value = 0; value < 256; value++)
3449 if (isSPACE(value))
3450 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3451 }
8aa23a47
YO
3452 }
3453 else {
3454 if (data->start_class->flags & ANYOF_LOCALE)
3455 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3456 else {
3457 for (value = 0; value < 256; value++)
3458 if (!isSPACE(value))
3459 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3460 }
3461 }
8aa23a47
YO
3462 break;
3463 case NSPACEL:
3464 if (flags & SCF_DO_STCLASS_AND) {
3465 if (data->start_class->flags & ANYOF_LOCALE) {
3466 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3467 for (value = 0; value < 256; value++)
3468 if (!isSPACE(value))
3469 ANYOF_BITMAP_CLEAR(data->start_class, value);
3470 }
653099ff 3471 }
8aa23a47
YO
3472 else {
3473 data->start_class->flags |= ANYOF_LOCALE;
3474 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3475 }
3476 break;
3477 case DIGIT:
3478 if (flags & SCF_DO_STCLASS_AND) {
3479 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3480 for (value = 0; value < 256; value++)
3481 if (!isDIGIT(value))
3482 ANYOF_BITMAP_CLEAR(data->start_class, value);
3483 }
3484 else {
3485 if (data->start_class->flags & ANYOF_LOCALE)
3486 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3487 else {
3488 for (value = 0; value < 256; value++)
3489 if (isDIGIT(value))
3490 ANYOF_BITMAP_SET(data->start_class, value);
3491 }
3492 }
3493 break;
3494 case NDIGIT:
3495 if (flags & SCF_DO_STCLASS_AND) {
3496 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3497 for (value = 0; value < 256; value++)
3498 if (isDIGIT(value))
3499 ANYOF_BITMAP_CLEAR(data->start_class, value);
3500 }
3501 else {
3502 if (data->start_class->flags & ANYOF_LOCALE)
3503 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3504 else {
3505 for (value = 0; value < 256; value++)
3506 if (!isDIGIT(value))
3507 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3508 }
3509 }
8aa23a47
YO
3510 break;
3511 }
3512 if (flags & SCF_DO_STCLASS_OR)
3513 cl_and(data->start_class, and_withp);
3514 flags &= ~SCF_DO_STCLASS;
3515 }
3516 }
3517 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3518 data->flags |= (OP(scan) == MEOL
3519 ? SF_BEFORE_MEOL
3520 : SF_BEFORE_SEOL);
3521 }
3522 else if ( PL_regkind[OP(scan)] == BRANCHJ
3523 /* Lookbehind, or need to calculate parens/evals/stclass: */
3524 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3525 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3526 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3527 || OP(scan) == UNLESSM )
3528 {
3529 /* Negative Lookahead/lookbehind
3530 In this case we can't do fixed string optimisation.
3531 */
1de06328 3532
8aa23a47
YO
3533 I32 deltanext, minnext, fake = 0;
3534 regnode *nscan;
3535 struct regnode_charclass_class intrnl;
3536 int f = 0;
1de06328 3537
8aa23a47
YO
3538 data_fake.flags = 0;
3539 if (data) {
3540 data_fake.whilem_c = data->whilem_c;
3541 data_fake.last_closep = data->last_closep;
c277df42 3542 }
8aa23a47
YO
3543 else
3544 data_fake.last_closep = &fake;
58e23c8d 3545 data_fake.pos_delta = delta;
8aa23a47
YO
3546 if ( flags & SCF_DO_STCLASS && !scan->flags
3547 && OP(scan) == IFMATCH ) { /* Lookahead */
3548 cl_init(pRExC_state, &intrnl);
3549 data_fake.start_class = &intrnl;
3550 f |= SCF_DO_STCLASS_AND;
3551 }
3552 if (flags & SCF_WHILEM_VISITED_POS)
3553 f |= SCF_WHILEM_VISITED_POS;
3554 next = regnext(scan);
3555 nscan = NEXTOPER(NEXTOPER(scan));
3556 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3557 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3558 if (scan->flags) {
3559 if (deltanext) {
58e23c8d 3560 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3561 }
3562 else if (minnext > (I32)U8_MAX) {
58e23c8d 3563 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3564 }
3565 scan->flags = (U8)minnext;
3566 }
3567 if (data) {
3568 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3569 pars++;
3570 if (data_fake.flags & SF_HAS_EVAL)
3571 data->flags |= SF_HAS_EVAL;
3572 data->whilem_c = data_fake.whilem_c;
3573 }
3574 if (f & SCF_DO_STCLASS_AND) {
3575 const int was = (data->start_class->flags & ANYOF_EOS);
3576
3577 cl_and(data->start_class, &intrnl);
3578 if (was)
3579 data->start_class->flags |= ANYOF_EOS;
3580 }
cb434fcc 3581 }
8aa23a47
YO
3582#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3583 else {
3584 /* Positive Lookahead/lookbehind
3585 In this case we can do fixed string optimisation,
3586 but we must be careful about it. Note in the case of
3587 lookbehind the positions will be offset by the minimum
3588 length of the pattern, something we won't know about
3589 until after the recurse.
3590 */
3591 I32 deltanext, fake = 0;
3592 regnode *nscan;
3593 struct regnode_charclass_class intrnl;
3594 int f = 0;
3595 /* We use SAVEFREEPV so that when the full compile
3596 is finished perl will clean up the allocated
3597 minlens when its all done. This was we don't
3598 have to worry about freeing them when we know
3599 they wont be used, which would be a pain.
3600 */
3601 I32 *minnextp;
3602 Newx( minnextp, 1, I32 );
3603 SAVEFREEPV(minnextp);
3604
3605 if (data) {
3606 StructCopy(data, &data_fake, scan_data_t);
3607 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3608 f |= SCF_DO_SUBSTR;
3609 if (scan->flags)
304ee84b 3610 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3611 data_fake.last_found=newSVsv(data->last_found);
3612 }
3613 }
3614 else
3615 data_fake.last_closep = &fake;
3616 data_fake.flags = 0;
58e23c8d 3617 data_fake.pos_delta = delta;
8aa23a47
YO
3618 if (is_inf)
3619 data_fake.flags |= SF_IS_INF;
3620 if ( flags & SCF_DO_STCLASS && !scan->flags
3621 && OP(scan) == IFMATCH ) { /* Lookahead */
3622 cl_init(pRExC_state, &intrnl);
3623 data_fake.start_class = &intrnl;
3624 f |= SCF_DO_STCLASS_AND;
3625 }
3626 if (flags & SCF_WHILEM_VISITED_POS)
3627 f |= SCF_WHILEM_VISITED_POS;
3628 next = regnext(scan);
3629 nscan = NEXTOPER(NEXTOPER(scan));
3630
3631 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3632 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3633 if (scan->flags) {
3634 if (deltanext) {
58e23c8d 3635 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3636 }
3637 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3638 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3639 }
3640 scan->flags = (U8)*minnextp;
3641 }
3642
3643 *minnextp += min;
3644
3645 if (f & SCF_DO_STCLASS_AND) {
3646 const int was = (data->start_class->flags & ANYOF_EOS);
3647
3648 cl_and(data->start_class, &intrnl);
3649 if (was)
3650 data->start_class->flags |= ANYOF_EOS;
3651 }
3652 if (data) {
3653 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3654 pars++;
3655 if (data_fake.flags & SF_HAS_EVAL)
3656 data->flags |= SF_HAS_EVAL;
3657 data->whilem_c = data_fake.whilem_c;
3658 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3659 if (RExC_rx->minlen<*minnextp)
3660 RExC_rx->minlen=*minnextp;
304ee84b 3661 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3662 SvREFCNT_dec(data_fake.last_found);
3663
3664 if ( data_fake.minlen_fixed != minlenp )
3665 {
3666 data->offset_fixed= data_fake.offset_fixed;
3667 data->minlen_fixed= data_fake.minlen_fixed;
3668 data->lookbehind_fixed+= scan->flags;
3669 }
3670 if ( data_fake.minlen_float != minlenp )
3671 {
3672 data->minlen_float= data_fake.minlen_float;
3673 data->offset_float_min=data_fake.offset_float_min;
3674 data->offset_float_max=data_fake.offset_float_max;
3675 data->lookbehind_float+= scan->flags;
3676 }
3677 }
3678 }
3679
3680
40d049e4 3681 }
8aa23a47
YO
3682#endif
3683 }
3684 else if (OP(scan) == OPEN) {
3685 if (stopparen != (I32)ARG(scan))
3686 pars++;
3687 }
3688 else if (OP(scan) == CLOSE) {
3689 if (stopparen == (I32)ARG(scan)) {
3690 break;
3691 }
3692 if ((I32)ARG(scan) == is_par) {
3693 next = regnext(scan);
b515a41d 3694
8aa23a47
YO
3695 if ( next && (OP(next) != WHILEM) && next < last)
3696 is_par = 0; /* Disable optimization */
40d049e4 3697 }
8aa23a47
YO
3698 if (data)
3699 *(data->last_closep) = ARG(scan);
3700 }
3701 else if (OP(scan) == EVAL) {
c277df42
IZ
3702 if (data)
3703 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3704 }
3705 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3706 if (flags & SCF_DO_SUBSTR) {
304ee84b 3707 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3708 flags &= ~SCF_DO_SUBSTR;
40d049e4 3709 }
8aa23a47
YO
3710 if (data && OP(scan)==ACCEPT) {
3711 data->flags |= SCF_SEEN_ACCEPT;
3712 if (stopmin > min)
3713 stopmin = min;
e2e6a0f1 3714 }
8aa23a47
YO
3715 }
3716 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3717 {
0f5d15d6 3718 if (flags & SCF_DO_SUBSTR) {
304ee84b 3719 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3720 data->longest = &(data->longest_float);
3721 }
3722 is_inf = is_inf_internal = 1;
653099ff 3723 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3724 cl_anything(pRExC_state, data->start_class);
96776eda 3725 flags &= ~SCF_DO_STCLASS;
8aa23a47 3726 }
58e23c8d 3727 else if (OP(scan) == GPOS) {
bbe252da 3728 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3729 !(delta || is_inf || (data && data->pos_delta)))
3730 {
bbe252da
YO
3731 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3732 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3733 if (RExC_rx->gofs < (U32)min)
3734 RExC_rx->gofs = min;
3735 } else {
bbe252da 3736 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3737 RExC_rx->gofs = 0;
3738 }
3739 }
786e8c11 3740#ifdef TRIE_STUDY_OPT
40d049e4 3741#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3742 else if (PL_regkind[OP(scan)] == TRIE) {
3743 /* NOTE - There is similar code to this block above for handling
3744 BRANCH nodes on the initial study. If you change stuff here
3745 check there too. */
3746 regnode *trie_node= scan;
3747 regnode *tail= regnext(scan);
f8fc2ecf 3748 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3749 I32 max1 = 0, min1 = I32_MAX;
3750 struct regnode_charclass_class accum;
3751
3752 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3753 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3754 if (flags & SCF_DO_STCLASS)
3755 cl_init_zero(pRExC_state, &accum);
3756
3757 if (!trie->jump) {
3758 min1= trie->minlen;
3759 max1= trie->maxlen;
3760 } else {
3761 const regnode *nextbranch= NULL;
3762 U32 word;
3763
3764 for ( word=1 ; word <= trie->wordcount ; word++)
3765 {
3766 I32 deltanext=0, minnext=0, f = 0, fake;
3767 struct regnode_charclass_class this_class;
3768
3769 data_fake.flags = 0;
3770 if (data) {
3771 data_fake.whilem_c = data->whilem_c;
3772 data_fake.last_closep = data->last_closep;
3773 }
3774 else
3775 data_fake.last_closep = &fake;
58e23c8d 3776 data_fake.pos_delta = delta;
8aa23a47
YO
3777 if (flags & SCF_DO_STCLASS) {
3778 cl_init(pRExC_state, &this_class);
3779 data_fake.start_class = &this_class;
3780 f = SCF_DO_STCLASS_AND;
3781 }
3782 if (flags & SCF_WHILEM_VISITED_POS)
3783 f |= SCF_WHILEM_VISITED_POS;
3784
3785 if (trie->jump[word]) {
3786 if (!nextbranch)
3787 nextbranch = trie_node + trie->jump[0];
3788 scan= trie_node + trie->jump[word];
3789 /* We go from the jump point to the branch that follows
3790 it. Note this means we need the vestigal unused branches
3791 even though they arent otherwise used.
3792 */
3793 minnext = study_chunk(pRExC_state, &scan, minlenp,
3794 &deltanext, (regnode *)nextbranch, &data_fake,
3795 stopparen, recursed, NULL, f,depth+1);
3796 }
3797 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3798 nextbranch= regnext((regnode*)nextbranch);
3799
3800 if (min1 > (I32)(minnext + trie->minlen))
3801 min1 = minnext + trie->minlen;
3802 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3803 max1 = minnext + deltanext + trie->maxlen;
3804 if (deltanext == I32_MAX)
3805 is_inf = is_inf_internal = 1;
3806
3807 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3808 pars++;
3809 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3810 if ( stopmin > min + min1)
3811 stopmin = min + min1;
3812 flags &= ~SCF_DO_SUBSTR;
3813 if (data)
3814 data->flags |= SCF_SEEN_ACCEPT;
3815 }
3816 if (data) {
3817 if (data_fake.flags & SF_HAS_EVAL)
3818 data->flags |= SF_HAS_EVAL;
3819 data->whilem_c = data_fake.whilem_c;
3820 }
3821 if (flags & SCF_DO_STCLASS)
3822 cl_or(pRExC_state, &accum, &this_class);
3823 }
3824 }
3825 if (flags & SCF_DO_SUBSTR) {
3826 data->pos_min += min1;
3827 data->pos_delta += max1 - min1;
3828 if (max1 != min1 || is_inf)
3829 data->longest = &(data->longest_float);
3830 }
3831 min += min1;
3832 delta += max1 - min1;
3833 if (flags & SCF_DO_STCLASS_OR) {
3834 cl_or(pRExC_state, data->start_class, &accum);
3835 if (min1) {
3836 cl_and(data->start_class, and_withp);
3837 flags &= ~SCF_DO_STCLASS;
3838 }
3839 }
3840 else if (flags & SCF_DO_STCLASS_AND) {
3841 if (min1) {
3842 cl_and(data->start_class, &accum);
3843 flags &= ~SCF_DO_STCLASS;
3844 }
3845 else {
3846 /* Switch to OR mode: cache the old value of
3847 * data->start_class */
3848 INIT_AND_WITHP;
3849 StructCopy(data->start_class, and_withp,
3850 struct regnode_charclass_class);
3851 flags &= ~SCF_DO_STCLASS_AND;
3852 StructCopy(&accum, data->start_class,
3853 struct regnode_charclass_class);
3854 flags |= SCF_DO_STCLASS_OR;
3855 data->start_class->flags |= ANYOF_EOS;
3856 }
3857 }
3858 scan= tail;
3859 continue;
3860 }
786e8c11 3861#else
8aa23a47 3862 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3863 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3864 U8*bang=NULL;
3865
3866 min += trie->minlen;
3867 delta += (trie->maxlen - trie->minlen);
3868 flags &= ~SCF_DO_STCLASS; /* xxx */
3869 if (flags & SCF_DO_SUBSTR) {
304ee84b 3870 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3871 data->pos_min += trie->minlen;
3872 data->pos_delta += (trie->maxlen - trie->minlen);
3873 if (trie->maxlen != trie->minlen)
3874 data->longest = &(data->longest_float);
3875 }
3876 if (trie->jump) /* no more substrings -- for now /grr*/
3877 flags &= ~SCF_DO_SUBSTR;
b515a41d 3878 }
8aa23a47
YO
3879#endif /* old or new */
3880#endif /* TRIE_STUDY_OPT */
3881 /* Else: zero-length, ignore. */
3882 scan = regnext(scan);
3883 }
3884 if (frame) {
3885 last = frame->last;
3886 scan = frame->next;
3887 stopparen = frame->stop;
3888 frame = frame->prev;
3889 goto fake_study_recurse;
c277df42
IZ
3890 }
3891
3892 finish:
8aa23a47 3893 assert(!frame);
304ee84b 3894 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3895
c277df42 3896 *scanp = scan;
aca2d497 3897 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3898 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3899 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3900 if (is_par > (I32)U8_MAX)
c277df42
IZ
3901 is_par = 0;
3902 if (is_par && pars==1 && data) {
3903 data->flags |= SF_IN_PAR;
3904 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3905 }
3906 else if (pars && data) {
c277df42
IZ
3907 data->flags |= SF_HAS_PAR;
3908 data->flags &= ~SF_IN_PAR;
3909 }
653099ff 3910 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3911 cl_and(data->start_class, and_withp);
786e8c11
YO
3912 if (flags & SCF_TRIE_RESTUDY)
3913 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3914
304ee84b 3915 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3916
e2e6a0f1 3917 return min < stopmin ? min : stopmin;
c277df42
IZ
3918}
3919
2eccd3b2
NC
3920STATIC U32
3921S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3922{
4a4e7719
NC
3923 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3924
3925 Renewc(RExC_rxi->data,
3926 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3927 char, struct reg_data);
3928 if(count)
f8fc2ecf 3929 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3930 else
f8fc2ecf 3931 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3932 RExC_rxi->data->count = count + n;
3933 Copy(s, RExC_rxi->data->what + count, n, U8);
3934 return count;
c277df42
IZ
3935}
3936
f8149455 3937/*XXX: todo make this not included in a non debugging perl */
76234dfb 3938#ifndef PERL_IN_XSUB_RE
d88dccdf 3939void
864dbfa3 3940Perl_reginitcolors(pTHX)
d88dccdf 3941{
97aff369 3942 dVAR;
1df70142 3943 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3944 if (s) {
1df70142
AL
3945 char *t = savepv(s);
3946 int i = 0;
3947 PL_colors[0] = t;
d88dccdf 3948 while (++i < 6) {
1df70142
AL
3949 t = strchr(t, '\t');
3950 if (t) {
3951 *t = '\0';
3952 PL_colors[i] = ++t;
d88dccdf
IZ
3953 }
3954 else
1df70142 3955 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3956 }
3957 } else {
1df70142 3958 int i = 0;
b81d288d 3959 while (i < 6)
06b5626a 3960 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3961 }
3962 PL_colorset = 1;
3963}
76234dfb 3964#endif
8615cb43 3965
07be1b83 3966
786e8c11
YO
3967#ifdef TRIE_STUDY_OPT
3968#define CHECK_RESTUDY_GOTO \
3969 if ( \
3970 (data.flags & SCF_TRIE_RESTUDY) \
3971 && ! restudied++ \
3972 ) goto reStudy
3973#else
3974#define CHECK_RESTUDY_GOTO
3975#endif
f9f4320a 3976
a687059c 3977/*
e50aee73 3978 - pregcomp - compile a regular expression into internal code
a687059c
LW
3979 *
3980 * We can't allocate space until we know how big the compiled form will be,
3981 * but we can't compile it (and thus know how big it is) until we've got a
3982 * place to put the code. So we cheat: we compile it twice, once with code
3983 * generation turned off and size counting turned on, and once "for real".
3984 * This also means that we don't allocate space until we are sure that the
3985 * thing really will compile successfully, and we never have to move the
3986 * code and thus invalidate pointers into it. (Note that it has to be in
3987 * one piece because free() must be able to free it all.) [NB: not true in perl]
3988 *
3989 * Beware that the optimization-preparation code in here knows about some
3990 * of the structure of the compiled regexp. [I'll say.]
3991 */
b9b4dddf
YO
3992
3993
3994
f9f4320a 3995#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3996#define RE_ENGINE_PTR &PL_core_reg_engine
3997#else
f9f4320a
YO
3998extern const struct regexp_engine my_reg_engine;
3999#define RE_ENGINE_PTR &my_reg_engine
4000#endif
6d5c990f
RGS
4001
4002#ifndef PERL_IN_XSUB_RE
a687059c 4003regexp *
864dbfa3 4004Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 4005{
97aff369 4006 dVAR;
6d5c990f 4007 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
4008 /* Dispatch a request to compile a regexp to correct
4009 regexp engine. */
f9f4320a
YO
4010 if (table) {
4011 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4012 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4013 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4014 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4015 DEBUG_COMPILE_r({
8d8756e7 4016 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4017 SvIV(*ptr));
4018 });
f2f78491 4019 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 4020 }
b9b4dddf 4021 }
2a5d9b1d
RGS
4022 return Perl_re_compile(aTHX_ exp, xend, pm);
4023}
6d5c990f 4024#endif
2a5d9b1d
RGS
4025
4026regexp *
4027Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4028{
4029 dVAR;
a0d0e21e 4030 register regexp *r;
f8fc2ecf 4031 register regexp_internal *ri;
c277df42 4032 regnode *scan;
c277df42 4033 regnode *first;
a0d0e21e 4034 I32 flags;
a0d0e21e
LW
4035 I32 minlen = 0;
4036 I32 sawplus = 0;
4037 I32 sawopen = 0;
2c2d71f5 4038 scan_data_t data;
830247a4 4039 RExC_state_t RExC_state;
be8e71aa 4040 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4041#ifdef TRIE_STUDY_OPT
4042 int restudied= 0;
4043 RExC_state_t copyRExC_state;
4044#endif
2a5d9b1d 4045 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4046 DEBUG_r(if (!PL_colorset) reginitcolors());
4047
a0d0e21e 4048 if (exp == NULL)
c277df42 4049 FAIL("NULL regexp argument");
a0d0e21e 4050
a5961de5 4051 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4052
5cfc7842 4053 RExC_precomp = exp;
a3621e74 4054 DEBUG_COMPILE_r({
ab3bbdeb
YO
4055 SV *dsv= sv_newmortal();
4056 RE_PV_QUOTED_DECL(s, RExC_utf8,
4057 dsv, RExC_precomp, (xend - exp), 60);
4058 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4059 PL_colors[4],PL_colors[5],s);
a5961de5 4060 });
e2509266 4061 RExC_flags = pm->op_pmflags;
830247a4 4062 RExC_sawback = 0;
bbce6d69 4063
830247a4
IZ
4064 RExC_seen = 0;
4065 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4066 RExC_seen_evals = 0;
4067 RExC_extralen = 0;
c277df42 4068
bbce6d69 4069 /* First pass: determine size, legality. */
830247a4 4070 RExC_parse = exp;
fac92740 4071 RExC_start = exp;
830247a4
IZ
4072 RExC_end = xend;
4073 RExC_naughty = 0;
4074 RExC_npar = 1;
e2e6a0f1 4075 RExC_nestroot = 0;
830247a4
IZ
4076 RExC_size = 0L;
4077 RExC_emit = &PL_regdummy;
4078 RExC_whilem_seen = 0;
fc8cd66c 4079 RExC_charnames = NULL;
40d049e4
YO
4080 RExC_open_parens = NULL;
4081 RExC_close_parens = NULL;
4082 RExC_opend = NULL;
81714fb9 4083 RExC_paren_names = NULL;
1f1031fe
YO
4084#ifdef DEBUGGING
4085 RExC_paren_name_list = NULL;
4086#endif
40d049e4
YO
4087 RExC_recurse = NULL;
4088 RExC_recurse_count = 0;
81714fb9 4089
85ddcde9
JH
4090#if 0 /* REGC() is (currently) a NOP at the first pass.
4091 * Clever compilers notice this and complain. --jhi */
830247a4 4092 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4093#endif
3dab1dad
YO
4094 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4095 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4096 RExC_precomp = NULL;
a0d0e21e
LW
4097 return(NULL);
4098 }
07be1b83 4099 DEBUG_PARSE_r({
81714fb9
YO
4100 PerlIO_printf(Perl_debug_log,
4101 "Required size %"IVdf" nodes\n"
4102 "Starting second pass (creation)\n",
4103 (IV)RExC_size);
07be1b83
YO
4104 RExC_lastnum=0;
4105 RExC_lastparse=NULL;
4106 });
c277df42
IZ
4107 /* Small enough for pointer-storage convention?
4108 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4109 if (RExC_size >= 0x10000L && RExC_extralen)
4110 RExC_size += RExC_extralen;
c277df42 4111 else
830247a4
IZ
4112 RExC_extralen = 0;
4113 if (RExC_whilem_seen > 15)
4114 RExC_whilem_seen = 15;
a0d0e21e 4115
e2e6a0f1
YO
4116#ifdef DEBUGGING
4117 /* Make room for a sentinel value at the end of the program */
4118 RExC_size++;
4119#endif
4120
f9f4320a
YO
4121 /* Allocate space and zero-initialize. Note, the two step process
4122 of zeroing when in debug mode, thus anything assigned has to
4123 happen after that */
f8fc2ecf
YO
4124 Newxz(r, 1, regexp);
4125 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4126 char, regexp_internal);
4127 if ( r == NULL || ri == NULL )
b45f050a 4128 FAIL("Regexp out of space");
0f79a09d
GS
4129#ifdef DEBUGGING
4130 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4131 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4132#else
f8fc2ecf
YO
4133 /* bulk initialize base fields with 0. */
4134 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4135#endif
58e23c8d
YO
4136
4137 /* non-zero initialization begins here */
f8fc2ecf 4138 RXi_SET( r, ri );
f9f4320a 4139 r->engine= RE_ENGINE_PTR;
c277df42 4140 r->refcnt = 1;
bbce6d69 4141 r->prelen = xend - exp;
5cfc7842 4142 r->precomp = savepvn(RExC_precomp, r->prelen);
bbe252da
YO
4143 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
4144 r->intflags = 0;
830247a4 4145 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4146
6bda09f9 4147 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4148 Newxz(RExC_open_parens, RExC_npar,regnode *);
4149 SAVEFREEPV(RExC_open_parens);
4150 Newxz(RExC_close_parens,RExC_npar,regnode *);
4151 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4152 }
4153
4154 /* Useful during FAIL. */
7122b237
YO
4155#ifdef RE_TRACK_PATTERN_OFFSETS
4156 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4157 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4158 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4159 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4160 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4161#endif
4162 SetProgLen(ri,RExC_size);
830247a4 4163 RExC_rx = r;
f8fc2ecf 4164 RExC_rxi = ri;
bbce6d69 4165
4166 /* Second pass: emit code. */
e2509266 4167 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4168 RExC_parse = exp;
4169 RExC_end = xend;
4170 RExC_naughty = 0;
4171 RExC_npar = 1;
f8fc2ecf
YO
4172 RExC_emit_start = ri->program;
4173 RExC_emit = ri->program;
e2e6a0f1
YO
4174#ifdef DEBUGGING
4175 /* put a sentinal on the end of the program so we can check for
4176 overwrites */
f8fc2ecf 4177 ri->program[RExC_size].type = 255;
e2e6a0f1 4178#endif
2cd61cdb 4179 /* Store the count of eval-groups for security checks: */
f8149455 4180 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4181 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4182 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4183 return(NULL);
6bda09f9 4184
07be1b83
YO
4185 /* XXXX To minimize changes to RE engine we always allocate
4186 3-units-long substrs field. */
4187 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4188 if (RExC_recurse_count) {
4189 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4190 SAVEFREEPV(RExC_recurse);
4191 }
a0d0e21e 4192
07be1b83 4193reStudy:
1de06328 4194 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4195 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4196
07be1b83
YO
4197#ifdef TRIE_STUDY_OPT
4198 if ( restudied ) {
5d458dd8 4199 U32 seen=RExC_seen;
07be1b83 4200 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4201
4202 RExC_state = copyRExC_state;
4203 if (seen & REG_TOP_LEVEL_BRANCHES)
4204 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4205 else
4206 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4207 if (data.last_found) {
07be1b83 4208 SvREFCNT_dec(data.longest_fixed);
07be1b83 4209 SvREFCNT_dec(data.longest_float);
07be1b83 4210 SvREFCNT_dec(data.last_found);
1de06328 4211 }
40d049e4 4212 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4213 } else {
40d049e4 4214 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4215 copyRExC_state = RExC_state;
07be1b83 4216 }
40d049e4
YO
4217#else
4218 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4219#endif
fc8cd66c 4220
a0d0e21e 4221 /* Dig out information for optimizations. */
bbe252da 4222 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4223 pm->op_pmflags = RExC_flags;
a0ed51b3 4224 if (UTF)
bbe252da 4225 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4226 ri->regstclass = NULL;
830247a4 4227 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4228 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4229 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4230
1de06328
YO
4231 /* testing for BRANCH here tells us whether there is "must appear"
4232 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4233 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4234 I32 fake;
c5254dd6 4235 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4236 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4237 int stclass_flag;
07be1b83 4238 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4239
4240 first = scan;
c277df42 4241 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4242 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4243 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4244 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4245 /* for now we can't handle lookbehind IFMATCH*/
4246 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4247 (OP(first) == PLUS) ||
4248 (OP(first) == MINMOD) ||
653099ff 4249 /* An {n,m} with n>0 */
07be1b83
YO
4250 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4251 {
786e8c11 4252
a0d0e21e
LW
4253 if (OP(first) == PLUS)
4254 sawplus = 1;
4255 else
3dab1dad 4256 first += regarglen[OP(first)];
07be1b83
YO
4257 if (OP(first) == IFMATCH) {
4258 first = NEXTOPER(first);
4259 first += EXTRA_STEP_2ARGS;
7c167cea 4260 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4261 first = NEXTOPER(first);
a687059c
LW
4262 }
4263
a0d0e21e
LW
4264 /* Starting-point info. */
4265 again:
786e8c11 4266 DEBUG_PEEP("first:",first,0);
07be1b83 4267 /* Ignore EXACT as we deal with it later. */
3dab1dad 4268 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4269 if (OP(first) == EXACT)
6f207bd3 4270 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4271 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4272 ri->regstclass = first;
b3c9acc1 4273 }
07be1b83 4274#ifdef TRIE_STCLASS
786e8c11 4275 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4276 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4277 {
786e8c11 4278 regnode *trie_op;
07be1b83 4279 /* this can happen only on restudy */
786e8c11 4280 if ( OP(first) == TRIE ) {
c944940b 4281 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4282 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4283 StructCopy(first,trieop,struct regnode_1);
4284 trie_op=(regnode *)trieop;
4285 } else {
c944940b 4286 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4287 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4288 StructCopy(first,trieop,struct regnode_charclass);
4289 trie_op=(regnode *)trieop;
4290 }
1de06328 4291 OP(trie_op)+=2;
786e8c11 4292 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4293 ri->regstclass = trie_op;
07be1b83
YO
4294 }
4295#endif
bfed75c6 4296 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4297 ri->regstclass = first;
3dab1dad
YO
4298 else if (PL_regkind[OP(first)] == BOUND ||
4299 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4300 ri->regstclass = first;
3dab1dad 4301 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4302 r->extflags |= (OP(first) == MBOL
4303 ? RXf_ANCH_MBOL
cad2e5aa 4304 : (OP(first) == SBOL
bbe252da
YO
4305 ? RXf_ANCH_SBOL
4306 : RXf_ANCH_BOL));
a0d0e21e 4307 first = NEXTOPER(first);
774d564b 4308 goto again;
4309 }
4310 else if (OP(first) == GPOS) {
bbe252da 4311 r->extflags |= RXf_ANCH_GPOS;
774d564b 4312 first = NEXTOPER(first);
4313 goto again;
a0d0e21e 4314 }
cf2a2b69
YO
4315 else if ((!sawopen || !RExC_sawback) &&
4316 (OP(first) == STAR &&
3dab1dad 4317 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4318 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4319 {
4320 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4321 const int type =
4322 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4323 ? RXf_ANCH_MBOL
4324 : RXf_ANCH_SBOL;
4325 r->extflags |= type;
4326 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4327 first = NEXTOPER(first);
774d564b 4328 goto again;
a0d0e21e 4329 }
b81d288d 4330 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4331 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4332 /* x+ must match at the 1st pos of run of x's */
bbe252da 4333 r->intflags |= PREGf_SKIP;
a0d0e21e 4334
c277df42 4335 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4336#ifdef TRIE_STUDY_OPT
81714fb9 4337 DEBUG_PARSE_r(
be8e71aa
YO
4338 if (!restudied)
4339 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4340 (IV)(first - scan + 1))
4341 );
4342#else
81714fb9 4343 DEBUG_PARSE_r(
be8e71aa
YO
4344 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4345 (IV)(first - scan + 1))
4346 );
4347#endif
4348
4349
a0d0e21e
LW
4350 /*
4351 * If there's something expensive in the r.e., find the
4352 * longest literal string that must appear and make it the
4353 * regmust. Resolve ties in favor of later strings, since
4354 * the regstart check works with the beginning of the r.e.
4355 * and avoiding duplication strengthens checking. Not a
4356 * strong reason, but sufficient in the absence of others.
4357 * [Now we resolve ties in favor of the earlier string if
c277df42 4358 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4359 * earlier string may buy us something the later one won't.]
4360 */
de8c5301 4361
396482e1
GA
4362 data.longest_fixed = newSVpvs("");
4363 data.longest_float = newSVpvs("");
4364 data.last_found = newSVpvs("");
c277df42
IZ
4365 data.longest = &(data.longest_fixed);
4366 first = scan;
f8fc2ecf 4367 if (!ri->regstclass) {
830247a4 4368 cl_init(pRExC_state, &ch_class);
653099ff
GS
4369 data.start_class = &ch_class;
4370 stclass_flag = SCF_DO_STCLASS_AND;
4371 } else /* XXXX Check for BOUND? */
4372 stclass_flag = 0;
cb434fcc 4373 data.last_closep = &last_close;
de8c5301 4374
1de06328 4375 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4376 &data, -1, NULL, NULL,
4377 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4378
07be1b83 4379
786e8c11
YO
4380 CHECK_RESTUDY_GOTO;
4381
4382
830247a4 4383 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4384 && data.last_start_min == 0 && data.last_end > 0
830247a4 4385 && !RExC_seen_zerolen
2bf803e2 4386 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4387 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4388 r->extflags |= RXf_CHECK_ALL;
304ee84b 4389 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4390 SvREFCNT_dec(data.last_found);
4391
1de06328
YO
4392 /* Note that code very similar to this but for anchored string
4393 follows immediately below, changes may need to be made to both.
4394 Be careful.
4395 */
a0ed51b3 4396 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4397 if (longest_float_length
c277df42
IZ
4398 || (data.flags & SF_FL_BEFORE_EOL
4399 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4400 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4401 {
1182767e 4402 I32 t,ml;
cf93c79d 4403
1de06328 4404 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4405 && data.offset_fixed == data.offset_float_min
4406 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4407 goto remove_float; /* As in (a)+. */
4408
1de06328
YO
4409 /* copy the information about the longest float from the reg_scan_data
4410 over to the program. */
33b8afdf
JH
4411 if (SvUTF8(data.longest_float)) {
4412 r->float_utf8 = data.longest_float;
c445ea15 4413 r->float_substr = NULL;
33b8afdf
JH
4414 } else {
4415 r->float_substr = data.longest_float;
c445ea15 4416 r->float_utf8 = NULL;
33b8afdf 4417 }
1de06328
YO
4418 /* float_end_shift is how many chars that must be matched that
4419 follow this item. We calculate it ahead of time as once the
4420 lookbehind offset is added in we lose the ability to correctly
4421 calculate it.*/
4422 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4423 : (I32)longest_float_length;
1de06328
YO
4424 r->float_end_shift = ml - data.offset_float_min
4425 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4426 + data.lookbehind_float;
4427 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4428 r->float_max_offset = data.offset_float_max;
1182767e 4429 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4430 r->float_max_offset -= data.lookbehind_float;
4431
cf93c79d
IZ
4432 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4433 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4434 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4435 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4436 }
4437 else {
aca2d497 4438 remove_float:
c445ea15 4439 r->float_substr = r->float_utf8 = NULL;
c277df42 4440 SvREFCNT_dec(data.longest_float);
c5254dd6 4441 longest_float_length = 0;
a0d0e21e 4442 }
c277df42 4443
1de06328
YO
4444 /* Note that code very similar to this but for floating string
4445 is immediately above, changes may need to be made to both.
4446 Be careful.
4447 */
a0ed51b3 4448 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4449 if (longest_fixed_length
c277df42
IZ
4450 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4451 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4452 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4453 {
1182767e 4454 I32 t,ml;
cf93c79d 4455
1de06328
YO
4456 /* copy the information about the longest fixed
4457 from the reg_scan_data over to the program. */
33b8afdf
JH
4458 if (SvUTF8(data.longest_fixed)) {
4459 r->anchored_utf8 = data.longest_fixed;
c445ea15 4460 r->anchored_substr = NULL;
33b8afdf
JH
4461 } else {
4462 r->anchored_substr = data.longest_fixed;
c445ea15 4463 r->anchored_utf8 = NULL;
33b8afdf 4464 }
1de06328
YO
4465 /* fixed_end_shift is how many chars that must be matched that
4466 follow this item. We calculate it ahead of time as once the
4467 lookbehind offset is added in we lose the ability to correctly
4468 calculate it.*/
4469 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4470 : (I32)longest_fixed_length;
1de06328
YO
4471 r->anchored_end_shift = ml - data.offset_fixed
4472 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4473 + data.lookbehind_fixed;
4474 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4475
cf93c79d
IZ
4476 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4477 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4478 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4479 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4480 }
4481 else {
c445ea15 4482 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4483 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4484 longest_fixed_length = 0;
a0d0e21e 4485 }
f8fc2ecf
YO
4486 if (ri->regstclass
4487 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4488 ri->regstclass = NULL;
33b8afdf
JH
4489 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4490 && stclass_flag
653099ff 4491 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4492 && !cl_is_anything(data.start_class))
4493 {
2eccd3b2 4494 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4495
f8fc2ecf 4496 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4497 struct regnode_charclass_class);
4498 StructCopy(data.start_class,
f8fc2ecf 4499 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4500 struct regnode_charclass_class);
f8fc2ecf 4501 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4502 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4503 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4504 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4505 PerlIO_printf(Perl_debug_log,
a0288114 4506 "synthetic stclass \"%s\".\n",
3f7c398e 4507 SvPVX_const(sv));});
653099ff 4508 }
c277df42
IZ
4509
4510 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4511 if (longest_fixed_length > longest_float_length) {
1de06328 4512 r->check_end_shift = r->anchored_end_shift;
c277df42 4513 r->check_substr = r->anchored_substr;
33b8afdf 4514 r->check_utf8 = r->anchored_utf8;
c277df42 4515 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4516 if (r->extflags & RXf_ANCH_SINGLE)
4517 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4518 }
4519 else {
1de06328 4520 r->check_end_shift = r->float_end_shift;
c277df42 4521 r->check_substr = r->float_substr;
33b8afdf 4522 r->check_utf8 = r->float_utf8;
1de06328
YO
4523 r->check_offset_min = r->float_min_offset;
4524 r->check_offset_max = r->float_max_offset;
a0d0e21e 4525 }
30382c73
IZ
4526 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4527 This should be changed ASAP! */
bbe252da
YO
4528 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4529 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4530 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4531 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4532 }
1de06328
YO
4533 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4534 if ( (STRLEN)minlen < longest_float_length )
4535 minlen= longest_float_length;
4536 if ( (STRLEN)minlen < longest_fixed_length )
4537 minlen= longest_fixed_length;
4538 */
a0ed51b3
LW
4539 }
4540 else {
c277df42
IZ
4541 /* Several toplevels. Best we can is to set minlen. */
4542 I32 fake;
653099ff 4543 struct regnode_charclass_class ch_class;
cb434fcc 4544 I32 last_close = 0;
c277df42 4545
5d458dd8 4546 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4547
f8fc2ecf 4548 scan = ri->program + 1;
830247a4 4549 cl_init(pRExC_state, &ch_class);
653099ff 4550 data.start_class = &ch_class;
cb434fcc 4551 data.last_closep = &last_close;
07be1b83 4552
de8c5301 4553
1de06328 4554 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4555 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4556
786e8c11 4557 CHECK_RESTUDY_GOTO;
07be1b83 4558
33b8afdf 4559 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4560 = r->float_substr = r->float_utf8 = NULL;
653099ff 4561 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4562 && !cl_is_anything(data.start_class))
4563 {
2eccd3b2 4564 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4565
f8fc2ecf 4566 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4567 struct regnode_charclass_class);
4568 StructCopy(data.start_class,
f8fc2ecf 4569 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4570 struct regnode_charclass_class);
f8fc2ecf 4571 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4572 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4573 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4574 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4575 PerlIO_printf(Perl_debug_log,
a0288114 4576 "synthetic stclass \"%s\".\n",
3f7c398e 4577 SvPVX_const(sv));});
653099ff 4578 }
a0d0e21e
LW
4579 }
4580
1de06328
YO
4581 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4582 the "real" pattern. */
cf9788e3
RGS
4583 DEBUG_OPTIMISE_r({
4584 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4585 (IV)minlen, (IV)r->minlen);
cf9788e3 4586 });
de8c5301 4587 r->minlenret = minlen;
1de06328
YO
4588 if (r->minlen < minlen)
4589 r->minlen = minlen;
4590
b81d288d 4591 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4592 r->extflags |= RXf_GPOS_SEEN;
830247a4 4593 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4594 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4595 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4596 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4597 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4598 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4599 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4600 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4601 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4602 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4603 if (RExC_paren_names)
4604 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4605 else
4606 r->paren_names = NULL;
e357fc67
YO
4607 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4608 r->extflags |= RXf_WHITE;
4609 else if (r->prelen == 1 && r->precomp[0] == '^')
4610 r->extflags |= RXf_START_ONLY;
4611
1f1031fe
YO
4612#ifdef DEBUGGING
4613 if (RExC_paren_names) {
4614 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4615 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4616 } else
1f1031fe 4617#endif
cde0cee5 4618 ri->name_list_idx = 0;
1f1031fe 4619
40d049e4
YO
4620 if (RExC_recurse_count) {
4621 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4622 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4623 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4624 }
4625 }
a02a5408
JC
4626 Newxz(r->startp, RExC_npar, I32);
4627 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4628 /* assume we don't need to swap parens around before we match */
4629
be8e71aa
YO
4630 DEBUG_DUMP_r({
4631 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4632 regdump(r);
4633 });
7122b237
YO
4634#ifdef RE_TRACK_PATTERN_OFFSETS
4635 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4636 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4637 U32 i;
4638 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4639 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4640 for (i = 1; i <= len; i++) {
7122b237 4641 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4642 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4643 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4644 }
4645 PerlIO_printf(Perl_debug_log, "\n");
4646 });
7122b237 4647#endif
a0d0e21e 4648 return(r);
a687059c
LW
4649}
4650
f9f4320a 4651#undef CORE_ONLY_BLOCK
f9f4320a 4652#undef RE_ENGINE_PTR
3dab1dad 4653
9af228c6 4654#ifndef PERL_IN_XSUB_RE
81714fb9 4655SV*
44a2ac75 4656Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags)
81714fb9 4657{
44a2ac75
YO
4658 AV *retarray = NULL;
4659 SV *ret;
4660 if (flags & 1)
4661 retarray=newAV();
4662
4663 if (from_re || PL_curpm) {
4664 const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm);
81714fb9
YO
4665 if (rx && rx->paren_names) {
4666 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4667 if (he_str) {
4668 IV i;
4669 SV* sv_dat=HeVAL(he_str);
4670 I32 *nums=(I32*)SvPVX(sv_dat);
4671 for ( i=0; i<SvIVX(sv_dat); i++ ) {
0a4db386 4672 if ((I32)(rx->lastparen) >= nums[i] &&
81714fb9
YO
4673 rx->endp[nums[i]] != -1)
4674 {
44a2ac75
YO
4675 ret = reg_numbered_buff_get(nums[i],rx,NULL,0);
4676 if (!retarray)
4677 return ret;
4678 } else {
4679 ret = newSVsv(&PL_sv_undef);
4680 }
4681 if (retarray) {
4682 SvREFCNT_inc(ret);
4683 av_push(retarray, ret);
81714fb9
YO
4684 }
4685 }
44a2ac75
YO
4686 if (retarray)
4687 return (SV*)retarray;
81714fb9
YO
4688 }
4689 }
4690 }
44a2ac75
YO
4691 return NULL;
4692}
4693
4694SV*
4695Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags)
4696{
4697 char *s = NULL;
a9d504c3 4698 I32 i = 0;
44a2ac75
YO
4699 I32 s1, t1;
4700 SV *sv = usesv ? usesv : newSVpvs("");
cb5c6874 4701 PERL_UNUSED_ARG(flags);
44a2ac75 4702
cde0cee5
YO
4703 if (!rx->subbeg) {
4704 sv_setsv(sv,&PL_sv_undef);
4705 return sv;
4706 }
4707 else
4708 if (paren == -2 && rx->startp[0] != -1) {
44a2ac75
YO
4709 /* $` */
4710 i = rx->startp[0];
cde0cee5 4711 s = rx->subbeg;
44a2ac75
YO
4712 }
4713 else
cde0cee5 4714 if (paren == -1 && rx->endp[0] != -1) {
44a2ac75
YO
4715 /* $' */
4716 s = rx->subbeg + rx->endp[0];
4717 i = rx->sublen - rx->endp[0];
4718 }
4719 else
4720 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4721 (s1 = rx->startp[paren]) != -1 &&
4722 (t1 = rx->endp[paren]) != -1)
4723 {
4724 /* $& $1 ... */
4725 i = t1 - s1;
4726 s = rx->subbeg + s1;
cde0cee5
YO
4727 } else {
4728 sv_setsv(sv,&PL_sv_undef);
4729 return sv;
4730 }
4731 assert(rx->sublen >= (s - rx->subbeg) + i );
4732 if (i >= 0) {
4733 const int oldtainted = PL_tainted;
4734 TAINT_NOT;
4735 sv_setpvn(sv, s, i);
4736 PL_tainted = oldtainted;
4737 if ( (rx->extflags & RXf_CANY_SEEN)
4738 ? (RX_MATCH_UTF8(rx)
4739 && (!i || is_utf8_string((U8*)s, i)))
4740 : (RX_MATCH_UTF8(rx)) )
4741 {
4742 SvUTF8_on(sv);
4743 }
4744 else
4745 SvUTF8_off(sv);
4746 if (PL_tainting) {
4747 if (RX_MATCH_TAINTED(rx)) {
4748 if (SvTYPE(sv) >= SVt_PVMG) {
4749 MAGIC* const mg = SvMAGIC(sv);
4750 MAGIC* mgt;
4751 PL_tainted = 1;
4752 SvMAGIC_set(sv, mg->mg_moremagic);
4753 SvTAINT(sv);
4754 if ((mgt = SvMAGIC(sv))) {
4755 mg->mg_moremagic = mgt;
4756 SvMAGIC_set(sv, mg);
44a2ac75 4757 }
cde0cee5
YO
4758 } else {
4759 PL_tainted = 1;
4760 SvTAINT(sv);
4761 }
4762 } else
4763 SvTAINTED_off(sv);
44a2ac75 4764 }
81714fb9 4765 } else {
44a2ac75 4766 sv_setsv(sv,&PL_sv_undef);
81714fb9 4767 }
44a2ac75 4768 return sv;
81714fb9 4769}
9af228c6 4770#endif
0a4db386 4771
894be9b7 4772/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4773 * If flags is REG_RSN_RETURN_NULL returns null.
4774 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4775 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4776 * to the parsed name as looked up in the RExC_paren_names hash.
4777 * If there is an error throws a vFAIL().. type exception.
894be9b7 4778 */
0a4db386
YO
4779
4780#define REG_RSN_RETURN_NULL 0
4781#define REG_RSN_RETURN_NAME 1
4782#define REG_RSN_RETURN_DATA 2
4783
894be9b7
YO
4784STATIC SV*
4785S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4786 char *name_start = RExC_parse;
1f1031fe
YO
4787
4788 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4789 /* skip IDFIRST by using do...while */
4790 if (UTF)
4791 do {
4792 RExC_parse += UTF8SKIP(RExC_parse);
4793 } while (isALNUM_utf8((U8*)RExC_parse));
4794 else
4795 do {
4796 RExC_parse++;
4797 } while (isALNUM(*RExC_parse));
894be9b7 4798 }
1f1031fe 4799
0a4db386
YO
4800 if ( flags ) {
4801 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4802 (int)(RExC_parse - name_start)));
894be9b7 4803 if (UTF)
0a4db386
YO
4804 SvUTF8_on(sv_name);
4805 if ( flags == REG_RSN_RETURN_NAME)
4806 return sv_name;
4807 else if (flags==REG_RSN_RETURN_DATA) {
4808 HE *he_str = NULL;
4809 SV *sv_dat = NULL;
4810 if ( ! sv_name ) /* should not happen*/
4811 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4812 if (RExC_paren_names)
4813 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4814 if ( he_str )
4815 sv_dat = HeVAL(he_str);
4816 if ( ! sv_dat )
4817 vFAIL("Reference to nonexistent named group");
4818 return sv_dat;
4819 }
4820 else {
4821 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4822 }
4823 /* NOT REACHED */
894be9b7 4824 }
0a4db386 4825 return NULL;
894be9b7
YO
4826}
4827
3dab1dad
YO
4828#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4829 int rem=(int)(RExC_end - RExC_parse); \
4830 int cut; \
4831 int num; \
4832 int iscut=0; \
4833 if (rem>10) { \
4834 rem=10; \
4835 iscut=1; \
4836 } \
4837 cut=10-rem; \
4838 if (RExC_lastparse!=RExC_parse) \
4839 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4840 rem, RExC_parse, \
4841 cut + 4, \
4842 iscut ? "..." : "<" \
4843 ); \
4844 else \
4845 PerlIO_printf(Perl_debug_log,"%16s",""); \
4846 \
4847 if (SIZE_ONLY) \
4848 num=RExC_size; \
4849 else \
4850 num=REG_NODE_NUM(RExC_emit); \
4851 if (RExC_lastnum!=num) \
0a4db386 4852 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4853 else \
0a4db386 4854 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4855 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4856 (int)((depth*2)), "", \
3dab1dad
YO
4857 (funcname) \
4858 ); \
4859 RExC_lastnum=num; \
4860 RExC_lastparse=RExC_parse; \
4861})
4862
07be1b83
YO
4863
4864
3dab1dad
YO
4865#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4866 DEBUG_PARSE_MSG((funcname)); \
4867 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4868})
6bda09f9
YO
4869#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4870 DEBUG_PARSE_MSG((funcname)); \
4871 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4872})
a687059c
LW
4873/*
4874 - reg - regular expression, i.e. main body or parenthesized thing
4875 *
4876 * Caller must absorb opening parenthesis.
4877 *
4878 * Combining parenthesis handling with the base level of regular expression
4879 * is a trifle forced, but the need to tie the tails of the branches to what
4880 * follows makes it hard to avoid.
4881 */
07be1b83
YO
4882#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4883#ifdef DEBUGGING
4884#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4885#else
4886#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4887#endif
3dab1dad 4888
e2e6a0f1
YO
4889/* this idea is borrowed from STR_WITH_LEN in handy.h */
4890#define CHECK_WORD(s,v,l) \
4891 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4892
76e3520e 4893STATIC regnode *
3dab1dad 4894S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4895 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4896{
27da23d5 4897 dVAR;
c277df42
IZ
4898 register regnode *ret; /* Will be the head of the group. */
4899 register regnode *br;
4900 register regnode *lastbr;
cbbf8932 4901 register regnode *ender = NULL;
a0d0e21e 4902 register I32 parno = 0;
cbbf8932
AL
4903 I32 flags;
4904 const I32 oregflags = RExC_flags;
6136c704
AL
4905 bool have_branch = 0;
4906 bool is_open = 0;
9d1d55b5
JP
4907
4908 /* for (?g), (?gc), and (?o) warnings; warning
4909 about (?c) will warn about (?g) -- japhy */
4910
6136c704
AL
4911#define WASTED_O 0x01
4912#define WASTED_G 0x02
4913#define WASTED_C 0x04
4914#define WASTED_GC (0x02|0x04)
cbbf8932 4915 I32 wastedflags = 0x00;
9d1d55b5 4916
fac92740 4917 char * parse_start = RExC_parse; /* MJD */
a28509cc 4918 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4919
3dab1dad
YO
4920 GET_RE_DEBUG_FLAGS_DECL;
4921 DEBUG_PARSE("reg ");
4922
4923
821b33a5 4924 *flagp = 0; /* Tentatively. */
a0d0e21e 4925
9d1d55b5 4926
a0d0e21e
LW
4927 /* Make an OPEN node, if parenthesized. */
4928 if (paren) {
e2e6a0f1
YO
4929 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4930 char *start_verb = RExC_parse;
4931 STRLEN verb_len = 0;
4932 char *start_arg = NULL;
4933 unsigned char op = 0;
4934 int argok = 1;
4935 int internal_argval = 0; /* internal_argval is only useful if !argok */
4936 while ( *RExC_parse && *RExC_parse != ')' ) {
4937 if ( *RExC_parse == ':' ) {
4938 start_arg = RExC_parse + 1;
4939 break;
4940 }
4941 RExC_parse++;
4942 }
4943 ++start_verb;
4944 verb_len = RExC_parse - start_verb;
4945 if ( start_arg ) {
4946 RExC_parse++;
4947 while ( *RExC_parse && *RExC_parse != ')' )
4948 RExC_parse++;
4949 if ( *RExC_parse != ')' )
4950 vFAIL("Unterminated verb pattern argument");
4951 if ( RExC_parse == start_arg )
4952 start_arg = NULL;
4953 } else {
4954 if ( *RExC_parse != ')' )
4955 vFAIL("Unterminated verb pattern");
4956 }
5d458dd8 4957
e2e6a0f1
YO
4958 switch ( *start_verb ) {
4959 case 'A': /* (*ACCEPT) */
4960 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
4961 op = ACCEPT;
4962 internal_argval = RExC_nestroot;
4963 }
4964 break;
4965 case 'C': /* (*COMMIT) */
4966 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
4967 op = COMMIT;
e2e6a0f1
YO
4968 break;
4969 case 'F': /* (*FAIL) */
4970 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
4971 op = OPFAIL;
4972 argok = 0;
4973 }
4974 break;
5d458dd8
YO
4975 case ':': /* (*:NAME) */
4976 case 'M': /* (*MARK:NAME) */
4977 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 4978 op = MARKPOINT;
5d458dd8
YO
4979 argok = -1;
4980 }
4981 break;
4982 case 'P': /* (*PRUNE) */
4983 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
4984 op = PRUNE;
e2e6a0f1 4985 break;
5d458dd8
YO
4986 case 'S': /* (*SKIP) */
4987 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
4988 op = SKIP;
4989 break;
4990 case 'T': /* (*THEN) */
4991 /* [19:06] <TimToady> :: is then */
4992 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
4993 op = CUTGROUP;
4994 RExC_seen |= REG_SEEN_CUTGROUP;
4995 }
e2e6a0f1
YO
4996 break;
4997 }
4998 if ( ! op ) {
4999 RExC_parse++;
5000 vFAIL3("Unknown verb pattern '%.*s'",
5001 verb_len, start_verb);
5002 }
5003 if ( argok ) {
5004 if ( start_arg && internal_argval ) {
5005 vFAIL3("Verb pattern '%.*s' may not have an argument",
5006 verb_len, start_verb);
5007 } else if ( argok < 0 && !start_arg ) {
5008 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5009 verb_len, start_verb);
5010 } else {
5011 ret = reganode(pRExC_state, op, internal_argval);
5012 if ( ! internal_argval && ! SIZE_ONLY ) {
5013 if (start_arg) {
5014 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5015 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5016 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5017 ret->flags = 0;
5018 } else {
5019 ret->flags = 1;
5020 }
5021 }
5022 }
5023 if (!internal_argval)
5024 RExC_seen |= REG_SEEN_VERBARG;
5025 } else if ( start_arg ) {
5026 vFAIL3("Verb pattern '%.*s' may not have an argument",
5027 verb_len, start_verb);
5028 } else {
5029 ret = reg_node(pRExC_state, op);
5030 }
5031 nextchar(pRExC_state);
5032 return ret;
5033 } else
fac92740 5034 if (*RExC_parse == '?') { /* (?...) */
6136c704 5035 bool is_logical = 0;
a28509cc 5036 const char * const seqstart = RExC_parse;
ca9dfc88 5037
830247a4
IZ
5038 RExC_parse++;
5039 paren = *RExC_parse++;
c277df42 5040 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5041 switch (paren) {
894be9b7 5042
1f1031fe
YO
5043 case 'P': /* (?P...) variants for those used to PCRE/Python */
5044 paren = *RExC_parse++;
5045 if ( paren == '<') /* (?P<...>) named capture */
5046 goto named_capture;
5047 else if (paren == '>') { /* (?P>name) named recursion */
5048 goto named_recursion;
5049 }
5050 else if (paren == '=') { /* (?P=...) named backref */
5051 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5052 you change this make sure you change that */
5053 char* name_start = RExC_parse;
5054 U32 num = 0;
5055 SV *sv_dat = reg_scan_name(pRExC_state,
5056 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5057 if (RExC_parse == name_start || *RExC_parse != ')')
5058 vFAIL2("Sequence %.3s... not terminated",parse_start);
5059
5060 if (!SIZE_ONLY) {
5061 num = add_data( pRExC_state, 1, "S" );
5062 RExC_rxi->data->data[num]=(void*)sv_dat;
5063 SvREFCNT_inc(sv_dat);
5064 }
5065 RExC_sawback = 1;
5066 ret = reganode(pRExC_state,
5067 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5068 num);
5069 *flagp |= HASWIDTH;
5070
5071 Set_Node_Offset(ret, parse_start+1);
5072 Set_Node_Cur_Length(ret); /* MJD */
5073
5074 nextchar(pRExC_state);
5075 return ret;
5076 }
5077 goto unknown;
fac92740 5078 case '<': /* (?<...) */
b81d288d 5079 if (*RExC_parse == '!')
c277df42 5080 paren = ',';
0a4db386 5081 else if (*RExC_parse != '=')
1f1031fe 5082 named_capture:
0a4db386 5083 { /* (?<...>) */
81714fb9 5084 char *name_start;
894be9b7 5085 SV *svname;
81714fb9
YO
5086 paren= '>';
5087 case '\'': /* (?'...') */
5088 name_start= RExC_parse;
0a4db386
YO
5089 svname = reg_scan_name(pRExC_state,
5090 SIZE_ONLY ? /* reverse test from the others */
5091 REG_RSN_RETURN_NAME :
5092 REG_RSN_RETURN_NULL);
81714fb9
YO
5093 if (RExC_parse == name_start)
5094 goto unknown;
5095 if (*RExC_parse != paren)
5096 vFAIL2("Sequence (?%c... not terminated",
5097 paren=='>' ? '<' : paren);
5098 if (SIZE_ONLY) {
e62cc96a
YO
5099 HE *he_str;
5100 SV *sv_dat = NULL;
894be9b7
YO
5101 if (!svname) /* shouldnt happen */
5102 Perl_croak(aTHX_
5103 "panic: reg_scan_name returned NULL");
81714fb9
YO
5104 if (!RExC_paren_names) {
5105 RExC_paren_names= newHV();
5106 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5107#ifdef DEBUGGING
5108 RExC_paren_name_list= newAV();
5109 sv_2mortal((SV*)RExC_paren_name_list);
5110#endif
81714fb9
YO
5111 }
5112 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5113 if ( he_str )
81714fb9 5114 sv_dat = HeVAL(he_str);
e62cc96a 5115 if ( ! sv_dat ) {
81714fb9 5116 /* croak baby croak */
e62cc96a
YO
5117 Perl_croak(aTHX_
5118 "panic: paren_name hash element allocation failed");
5119 } else if ( SvPOK(sv_dat) ) {
81714fb9
YO
5120 IV count=SvIV(sv_dat);
5121 I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1);
5122 SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32));
5123 pv[count]=RExC_npar;
5124 SvIVX(sv_dat)++;
5125 } else {
5126 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5127 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5128 SvIOK_on(sv_dat);
5129 SvIVX(sv_dat)= 1;
e62cc96a 5130 }
1f1031fe
YO
5131#ifdef DEBUGGING
5132 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5133 SvREFCNT_dec(svname);
5134#endif
e62cc96a 5135
81714fb9
YO
5136 /*sv_dump(sv_dat);*/
5137 }
5138 nextchar(pRExC_state);
5139 paren = 1;
5140 goto capturing_parens;
5141 }
5142 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5143 RExC_parse++;
fac92740
MJD
5144 case '=': /* (?=...) */
5145 case '!': /* (?!...) */
830247a4 5146 RExC_seen_zerolen++;
e2e6a0f1
YO
5147 if (*RExC_parse == ')') {
5148 ret=reg_node(pRExC_state, OPFAIL);
5149 nextchar(pRExC_state);
5150 return ret;
5151 }
fac92740
MJD
5152 case ':': /* (?:...) */
5153 case '>': /* (?>...) */
a0d0e21e 5154 break;
fac92740
MJD
5155 case '$': /* (?$...) */
5156 case '@': /* (?@...) */
8615cb43 5157 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5158 break;
fac92740 5159 case '#': /* (?#...) */
830247a4
IZ
5160 while (*RExC_parse && *RExC_parse != ')')
5161 RExC_parse++;
5162 if (*RExC_parse != ')')
c277df42 5163 FAIL("Sequence (?#... not terminated");
830247a4 5164 nextchar(pRExC_state);
a0d0e21e
LW
5165 *flagp = TRYAGAIN;
5166 return NULL;
894be9b7
YO
5167 case '0' : /* (?0) */
5168 case 'R' : /* (?R) */
5169 if (*RExC_parse != ')')
6bda09f9 5170 FAIL("Sequence (?R) not terminated");
1a147d38 5171 ret = reg_node(pRExC_state, GOSTART);
7f69552c
YO
5172 nextchar(pRExC_state);
5173 return ret;
5174 /*notreached*/
894be9b7
YO
5175 { /* named and numeric backreferences */
5176 I32 num;
894be9b7
YO
5177 case '&': /* (?&NAME) */
5178 parse_start = RExC_parse - 1;
1f1031fe 5179 named_recursion:
894be9b7 5180 {
0a4db386
YO
5181 SV *sv_dat = reg_scan_name(pRExC_state,
5182 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5183 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5184 }
5185 goto gen_recurse_regop;
5186 /* NOT REACHED */
542fa716
YO
5187 case '+':
5188 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5189 RExC_parse++;
5190 vFAIL("Illegal pattern");
5191 }
5192 goto parse_recursion;
5193 /* NOT REACHED*/
5194 case '-': /* (?-1) */
5195 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5196 RExC_parse--; /* rewind to let it be handled later */
5197 goto parse_flags;
5198 }
5199 /*FALLTHROUGH */
6bda09f9
YO
5200 case '1': case '2': case '3': case '4': /* (?1) */
5201 case '5': case '6': case '7': case '8': case '9':
5202 RExC_parse--;
542fa716 5203 parse_recursion:
894be9b7
YO
5204 num = atoi(RExC_parse);
5205 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5206 if (*RExC_parse == '-')
5207 RExC_parse++;
6bda09f9
YO
5208 while (isDIGIT(*RExC_parse))
5209 RExC_parse++;
5210 if (*RExC_parse!=')')
5211 vFAIL("Expecting close bracket");
894be9b7
YO
5212
5213 gen_recurse_regop:
542fa716
YO
5214 if ( paren == '-' ) {
5215 /*
5216 Diagram of capture buffer numbering.
5217 Top line is the normal capture buffer numbers
5218 Botton line is the negative indexing as from
5219 the X (the (?-2))
5220
5221 + 1 2 3 4 5 X 6 7
5222 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5223 - 5 4 3 2 1 X x x
5224
5225 */
5226 num = RExC_npar + num;
5227 if (num < 1) {
5228 RExC_parse++;
5229 vFAIL("Reference to nonexistent group");
5230 }
5231 } else if ( paren == '+' ) {
5232 num = RExC_npar + num - 1;
5233 }
5234
1a147d38 5235 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5236 if (!SIZE_ONLY) {
5237 if (num > (I32)RExC_rx->nparens) {
5238 RExC_parse++;
5239 vFAIL("Reference to nonexistent group");
5240 }
40d049e4 5241 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5242 RExC_emit++;
226de585 5243 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5244 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5245 } else {
6bda09f9 5246 RExC_size++;
6bda09f9 5247 }
0a4db386 5248 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5249 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5250 Set_Node_Offset(ret, parse_start); /* MJD */
5251
6bda09f9
YO
5252 nextchar(pRExC_state);
5253 return ret;
894be9b7
YO
5254 } /* named and numeric backreferences */
5255 /* NOT REACHED */
5256
fac92740 5257 case 'p': /* (?p...) */
9014280d 5258 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5259 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5260 /* FALL THROUGH*/
fac92740 5261 case '?': /* (??...) */
6136c704 5262 is_logical = 1;
438a3801
YST
5263 if (*RExC_parse != '{')
5264 goto unknown;
830247a4 5265 paren = *RExC_parse++;
0f5d15d6 5266 /* FALL THROUGH */
fac92740 5267 case '{': /* (?{...}) */
c277df42 5268 {
2eccd3b2
NC
5269 I32 count = 1;
5270 U32 n = 0;
c277df42 5271 char c;
830247a4 5272 char *s = RExC_parse;
c277df42 5273
830247a4
IZ
5274 RExC_seen_zerolen++;
5275 RExC_seen |= REG_SEEN_EVAL;
5276 while (count && (c = *RExC_parse)) {
6136c704
AL
5277 if (c == '\\') {
5278 if (RExC_parse[1])
5279 RExC_parse++;
5280 }
b81d288d 5281 else if (c == '{')
c277df42 5282 count++;
b81d288d 5283 else if (c == '}')
c277df42 5284 count--;
830247a4 5285 RExC_parse++;
c277df42 5286 }
6136c704 5287 if (*RExC_parse != ')') {
b81d288d 5288 RExC_parse = s;
b45f050a
JF
5289 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5290 }
c277df42 5291 if (!SIZE_ONLY) {
f3548bdc 5292 PAD *pad;
6136c704
AL
5293 OP_4tree *sop, *rop;
5294 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5295
569233ed
SB
5296 ENTER;
5297 Perl_save_re_context(aTHX);
f3548bdc 5298 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5299 sop->op_private |= OPpREFCOUNTED;
5300 /* re_dup will OpREFCNT_inc */
5301 OpREFCNT_set(sop, 1);
569233ed 5302 LEAVE;
c277df42 5303
830247a4 5304 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5305 RExC_rxi->data->data[n] = (void*)rop;
5306 RExC_rxi->data->data[n+1] = (void*)sop;
5307 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5308 SvREFCNT_dec(sv);
a0ed51b3 5309 }
e24b16f9 5310 else { /* First pass */
830247a4 5311 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5312 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5313 /* No compiled RE interpolated, has runtime
5314 components ===> unsafe. */
5315 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5316 if (PL_tainting && PL_tainted)
cc6b7395 5317 FAIL("Eval-group in insecure regular expression");
54df2634 5318#if PERL_VERSION > 8
923e4eb5 5319 if (IN_PERL_COMPILETIME)
b5c19bd7 5320 PL_cv_has_eval = 1;
54df2634 5321#endif
c277df42 5322 }
b5c19bd7 5323
830247a4 5324 nextchar(pRExC_state);
6136c704 5325 if (is_logical) {
830247a4 5326 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5327 if (!SIZE_ONLY)
5328 ret->flags = 2;
3dab1dad 5329 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5330 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5331 return ret;
5332 }
ccb2c380
MP
5333 ret = reganode(pRExC_state, EVAL, n);
5334 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5335 Set_Node_Offset(ret, parse_start);
5336 return ret;
c277df42 5337 }
fac92740 5338 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5339 {
0a4db386 5340 int is_define= 0;
fac92740 5341 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5342 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5343 || RExC_parse[1] == '<'
830247a4 5344 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5345 I32 flag;
5346
830247a4 5347 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5348 if (!SIZE_ONLY)
5349 ret->flags = 1;
3dab1dad 5350 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5351 goto insert_if;
b81d288d 5352 }
a0ed51b3 5353 }
0a4db386
YO
5354 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5355 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5356 {
5357 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5358 char *name_start= RExC_parse++;
2eccd3b2 5359 U32 num = 0;
0a4db386
YO
5360 SV *sv_dat=reg_scan_name(pRExC_state,
5361 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5362 if (RExC_parse == name_start || *RExC_parse != ch)
5363 vFAIL2("Sequence (?(%c... not terminated",
5364 (ch == '>' ? '<' : ch));
5365 RExC_parse++;
5366 if (!SIZE_ONLY) {
5367 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5368 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5369 SvREFCNT_inc(sv_dat);
5370 }
5371 ret = reganode(pRExC_state,NGROUPP,num);
5372 goto insert_if_check_paren;
5373 }
5374 else if (RExC_parse[0] == 'D' &&
5375 RExC_parse[1] == 'E' &&
5376 RExC_parse[2] == 'F' &&
5377 RExC_parse[3] == 'I' &&
5378 RExC_parse[4] == 'N' &&
5379 RExC_parse[5] == 'E')
5380 {
5381 ret = reganode(pRExC_state,DEFINEP,0);
5382 RExC_parse +=6 ;
5383 is_define = 1;
5384 goto insert_if_check_paren;
5385 }
5386 else if (RExC_parse[0] == 'R') {
5387 RExC_parse++;
5388 parno = 0;
5389 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5390 parno = atoi(RExC_parse++);
5391 while (isDIGIT(*RExC_parse))
5392 RExC_parse++;
5393 } else if (RExC_parse[0] == '&') {
5394 SV *sv_dat;
5395 RExC_parse++;
5396 sv_dat = reg_scan_name(pRExC_state,
5397 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5398 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5399 }
1a147d38 5400 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5401 goto insert_if_check_paren;
5402 }
830247a4 5403 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5404 /* (?(1)...) */
6136c704 5405 char c;
830247a4 5406 parno = atoi(RExC_parse++);
c277df42 5407
830247a4
IZ
5408 while (isDIGIT(*RExC_parse))
5409 RExC_parse++;
fac92740 5410 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5411
0a4db386 5412 insert_if_check_paren:
830247a4 5413 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5414 vFAIL("Switch condition not recognized");
c277df42 5415 insert_if:
3dab1dad
YO
5416 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5417 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5418 if (br == NULL)
830247a4 5419 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5420 else
3dab1dad 5421 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5422 c = *nextchar(pRExC_state);
d1b80229
IZ
5423 if (flags&HASWIDTH)
5424 *flagp |= HASWIDTH;
c277df42 5425 if (c == '|') {
0a4db386
YO
5426 if (is_define)
5427 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5428 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5429 regbranch(pRExC_state, &flags, 1,depth+1);
5430 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5431 if (flags&HASWIDTH)
5432 *flagp |= HASWIDTH;
830247a4 5433 c = *nextchar(pRExC_state);
a0ed51b3
LW
5434 }
5435 else
c277df42
IZ
5436 lastbr = NULL;
5437 if (c != ')')
8615cb43 5438 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5439 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5440 REGTAIL(pRExC_state, br, ender);
c277df42 5441 if (lastbr) {
3dab1dad
YO
5442 REGTAIL(pRExC_state, lastbr, ender);
5443 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5444 }
5445 else
3dab1dad 5446 REGTAIL(pRExC_state, ret, ender);
c277df42 5447 return ret;
a0ed51b3
LW
5448 }
5449 else {
830247a4 5450 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5451 }
5452 }
1b1626e4 5453 case 0:
830247a4 5454 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5455 vFAIL("Sequence (? incomplete");
1b1626e4 5456 break;
a0d0e21e 5457 default:
cde0cee5
YO
5458 --RExC_parse;
5459 parse_flags: /* (?i) */
5460 {
5461 U32 posflags = 0, negflags = 0;
5462 U32 *flagsp = &posflags;
5463
5464 while (*RExC_parse) {
5465 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
5466 /* (?g), (?gc) and (?o) are useless here
5467 and must be globally applied -- japhy */
cde0cee5
YO
5468 switch (*RExC_parse) {
5469 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5470 case 'o':
5471 case 'g':
9d1d55b5 5472 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5473 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5474 if (! (wastedflags & wflagbit) ) {
5475 wastedflags |= wflagbit;
5476 vWARN5(
5477 RExC_parse + 1,
5478 "Useless (%s%c) - %suse /%c modifier",
5479 flagsp == &negflags ? "?-" : "?",
5480 *RExC_parse,
5481 flagsp == &negflags ? "don't " : "",
5482 *RExC_parse
5483 );
5484 }
5485 }
cde0cee5
YO
5486 break;
5487
5488 case 'c':
9d1d55b5 5489 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5490 if (! (wastedflags & WASTED_C) ) {
5491 wastedflags |= WASTED_GC;
9d1d55b5
JP
5492 vWARN3(
5493 RExC_parse + 1,
5494 "Useless (%sc) - %suse /gc modifier",
5495 flagsp == &negflags ? "?-" : "?",
5496 flagsp == &negflags ? "don't " : ""
5497 );
5498 }
5499 }
cde0cee5
YO
5500 break;
5501 case 'k':
5502 if (flagsp == &negflags) {
5503 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5504 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5505 } else {
5506 *flagsp |= RXf_PMf_KEEPCOPY;
5507 }
5508 break;
5509 case '-':
5510 if (flagsp == &negflags)
5511 goto unknown;
5512 flagsp = &negflags;
5513 wastedflags = 0; /* reset so (?g-c) warns twice */
5514 break;
5515 case ':':
5516 paren = ':';
5517 /*FALLTHROUGH*/
5518 case ')':
5519 RExC_flags |= posflags;
5520 RExC_flags &= ~negflags;
5521 nextchar(pRExC_state);
5522 if (paren != ':') {
5523 *flagp = TRYAGAIN;
5524 return NULL;
5525 } else {
5526 ret = NULL;
5527 goto parse_rest;
5528 }
5529 /*NOTREACHED*/
5530 default:
5531 unknown:
5532 RExC_parse++;
5533 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5534 /*NOTREACHED*/
5535 }
830247a4 5536 ++RExC_parse;
48c036b1 5537 }
cde0cee5 5538 }} /* one for the default block, one for the switch */
a0d0e21e 5539 }
fac92740 5540 else { /* (...) */
81714fb9 5541 capturing_parens:
830247a4
IZ
5542 parno = RExC_npar;
5543 RExC_npar++;
e2e6a0f1 5544
830247a4 5545 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5546 if (!SIZE_ONLY ){
5547 if (!RExC_nestroot)
5548 RExC_nestroot = parno;
5549 if (RExC_seen & REG_SEEN_RECURSE) {
5550 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5551 "Setting open paren #%"IVdf" to %d\n",
5552 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5553 RExC_open_parens[parno-1]= ret;
5554 }
6bda09f9 5555 }
fac92740
MJD
5556 Set_Node_Length(ret, 1); /* MJD */
5557 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5558 is_open = 1;
a0d0e21e 5559 }
a0ed51b3 5560 }
fac92740 5561 else /* ! paren */
a0d0e21e 5562 ret = NULL;
cde0cee5
YO
5563
5564 parse_rest:
a0d0e21e 5565 /* Pick up the branches, linking them together. */
fac92740 5566 parse_start = RExC_parse; /* MJD */
3dab1dad 5567 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5568 /* branch_len = (paren != 0); */
2af232bd 5569
a0d0e21e
LW
5570 if (br == NULL)
5571 return(NULL);
830247a4
IZ
5572 if (*RExC_parse == '|') {
5573 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5574 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5575 }
fac92740 5576 else { /* MJD */
6bda09f9 5577 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5578 Set_Node_Length(br, paren != 0);
5579 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5580 }
c277df42
IZ
5581 have_branch = 1;
5582 if (SIZE_ONLY)
830247a4 5583 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5584 }
5585 else if (paren == ':') {
c277df42
IZ
5586 *flagp |= flags&SIMPLE;
5587 }
6136c704 5588 if (is_open) { /* Starts with OPEN. */
3dab1dad 5589 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5590 }
5591 else if (paren != '?') /* Not Conditional */
a0d0e21e 5592 ret = br;
32a0ca98 5593 *flagp |= flags & (SPSTART | HASWIDTH);
c277df42 5594 lastbr = br;
830247a4
IZ
5595 while (*RExC_parse == '|') {
5596 if (!SIZE_ONLY && RExC_extralen) {
5597 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5598 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5599 }
5600 if (SIZE_ONLY)
830247a4
IZ
5601 RExC_extralen += 2; /* Account for LONGJMP. */
5602 nextchar(pRExC_state);
3dab1dad 5603 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5604
a687059c 5605 if (br == NULL)
a0d0e21e 5606 return(NULL);
3dab1dad 5607 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5608 lastbr = br;
821b33a5
IZ
5609 if (flags&HASWIDTH)
5610 *flagp |= HASWIDTH;
a687059c 5611 *flagp |= flags&SPSTART;
a0d0e21e
LW
5612 }
5613
c277df42
IZ
5614 if (have_branch || paren != ':') {
5615 /* Make a closing node, and hook it on the end. */
5616 switch (paren) {
5617 case ':':
830247a4 5618 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5619 break;
5620 case 1:
830247a4 5621 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5622 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5623 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5624 "Setting close paren #%"IVdf" to %d\n",
5625 (IV)parno, REG_NODE_NUM(ender)));
5626 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5627 if (RExC_nestroot == parno)
5628 RExC_nestroot = 0;
40d049e4 5629 }
fac92740
MJD
5630 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5631 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5632 break;
5633 case '<':
c277df42
IZ
5634 case ',':
5635 case '=':
5636 case '!':
c277df42 5637 *flagp &= ~HASWIDTH;
821b33a5
IZ
5638 /* FALL THROUGH */
5639 case '>':
830247a4 5640 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5641 break;
5642 case 0:
830247a4 5643 ender = reg_node(pRExC_state, END);
40d049e4
YO
5644 if (!SIZE_ONLY) {
5645 assert(!RExC_opend); /* there can only be one! */
5646 RExC_opend = ender;
5647 }
c277df42
IZ
5648 break;
5649 }
eaf3ca90 5650 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5651
9674d46a 5652 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5653 if (depth==1)
5654 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5655
c277df42 5656 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5657 for (br = ret; br; br = regnext(br)) {
5658 const U8 op = PL_regkind[OP(br)];
5659 if (op == BRANCH) {
07be1b83 5660 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5661 }
5662 else if (op == BRANCHJ) {
07be1b83 5663 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5664 }
c277df42
IZ
5665 }
5666 }
a0d0e21e 5667 }
c277df42
IZ
5668
5669 {
e1ec3a88
AL
5670 const char *p;
5671 static const char parens[] = "=!<,>";
c277df42
IZ
5672
5673 if (paren && (p = strchr(parens, paren))) {
eb160463 5674 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5675 int flag = (p - parens) > 1;
5676
5677 if (paren == '>')
5678 node = SUSPEND, flag = 0;
6bda09f9 5679 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5680 Set_Node_Cur_Length(ret);
5681 Set_Node_Offset(ret, parse_start + 1);
c277df42 5682 ret->flags = flag;
07be1b83 5683 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5684 }
a0d0e21e
LW
5685 }
5686
5687 /* Check for proper termination. */
ce3e6498 5688 if (paren) {
e2509266 5689 RExC_flags = oregflags;
830247a4
IZ
5690 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5691 RExC_parse = oregcomp_parse;
380a0633 5692 vFAIL("Unmatched (");
ce3e6498 5693 }
a0ed51b3 5694 }
830247a4
IZ
5695 else if (!paren && RExC_parse < RExC_end) {
5696 if (*RExC_parse == ')') {
5697 RExC_parse++;
380a0633 5698 vFAIL("Unmatched )");
a0ed51b3
LW
5699 }
5700 else
b45f050a 5701 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5702 /* NOTREACHED */
5703 }
a687059c 5704
a0d0e21e 5705 return(ret);
a687059c
LW
5706}
5707
5708/*
5709 - regbranch - one alternative of an | operator
5710 *
5711 * Implements the concatenation operator.
5712 */
76e3520e 5713STATIC regnode *
3dab1dad 5714S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5715{
97aff369 5716 dVAR;
c277df42
IZ
5717 register regnode *ret;
5718 register regnode *chain = NULL;
5719 register regnode *latest;
5720 I32 flags = 0, c = 0;
3dab1dad
YO
5721 GET_RE_DEBUG_FLAGS_DECL;
5722 DEBUG_PARSE("brnc");
b81d288d 5723 if (first)
c277df42
IZ
5724 ret = NULL;
5725 else {
b81d288d 5726 if (!SIZE_ONLY && RExC_extralen)
830247a4 5727 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5728 else {
830247a4 5729 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5730 Set_Node_Length(ret, 1);
5731 }
c277df42
IZ
5732 }
5733
b81d288d 5734 if (!first && SIZE_ONLY)
830247a4 5735 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5736
c277df42 5737 *flagp = WORST; /* Tentatively. */
a0d0e21e 5738
830247a4
IZ
5739 RExC_parse--;
5740 nextchar(pRExC_state);
5741 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5742 flags &= ~TRYAGAIN;
3dab1dad 5743 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5744 if (latest == NULL) {
5745 if (flags & TRYAGAIN)
5746 continue;
5747 return(NULL);
a0ed51b3
LW
5748 }
5749 else if (ret == NULL)
c277df42 5750 ret = latest;
a0d0e21e 5751 *flagp |= flags&HASWIDTH;
c277df42 5752 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5753 *flagp |= flags&SPSTART;
5754 else {
830247a4 5755 RExC_naughty++;
3dab1dad 5756 REGTAIL(pRExC_state, chain, latest);
a687059c 5757 }
a0d0e21e 5758 chain = latest;
c277df42
IZ
5759 c++;
5760 }
5761 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5762 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5763 if (ret == NULL)
5764 ret = chain;
5765 }
5766 if (c == 1) {
5767 *flagp |= flags&SIMPLE;
a0d0e21e 5768 }
a687059c 5769
d4c19fe8 5770 return ret;
a687059c
LW
5771}
5772
5773/*
5774 - regpiece - something followed by possible [*+?]
5775 *
5776 * Note that the branching code sequences used for ? and the general cases
5777 * of * and + are somewhat optimized: they use the same NOTHING node as
5778 * both the endmarker for their branch list and the body of the last branch.
5779 * It might seem that this node could be dispensed with entirely, but the
5780 * endmarker role is not redundant.
5781 */
76e3520e 5782STATIC regnode *
3dab1dad 5783S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5784{
97aff369 5785 dVAR;
c277df42 5786 register regnode *ret;
a0d0e21e
LW
5787 register char op;
5788 register char *next;
5789 I32 flags;
1df70142 5790 const char * const origparse = RExC_parse;
a0d0e21e 5791 I32 min;
c277df42 5792 I32 max = REG_INFTY;
fac92740 5793 char *parse_start;
10edeb5d 5794 const char *maxpos = NULL;
3dab1dad
YO
5795 GET_RE_DEBUG_FLAGS_DECL;
5796 DEBUG_PARSE("piec");
a0d0e21e 5797
3dab1dad 5798 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5799 if (ret == NULL) {
5800 if (flags & TRYAGAIN)
5801 *flagp |= TRYAGAIN;
5802 return(NULL);
5803 }
5804
830247a4 5805 op = *RExC_parse;
a0d0e21e 5806
830247a4 5807 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5808 maxpos = NULL;
fac92740 5809 parse_start = RExC_parse; /* MJD */
830247a4 5810 next = RExC_parse + 1;
a0d0e21e
LW
5811 while (isDIGIT(*next) || *next == ',') {
5812 if (*next == ',') {
5813 if (maxpos)
5814 break;
5815 else
5816 maxpos = next;
a687059c 5817 }
a0d0e21e
LW
5818 next++;
5819 }
5820 if (*next == '}') { /* got one */
5821 if (!maxpos)
5822 maxpos = next;
830247a4
IZ
5823 RExC_parse++;
5824 min = atoi(RExC_parse);
a0d0e21e
LW
5825 if (*maxpos == ',')
5826 maxpos++;
5827 else
830247a4 5828 maxpos = RExC_parse;
a0d0e21e
LW
5829 max = atoi(maxpos);
5830 if (!max && *maxpos != '0')
c277df42
IZ
5831 max = REG_INFTY; /* meaning "infinity" */
5832 else if (max >= REG_INFTY)
8615cb43 5833 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5834 RExC_parse = next;
5835 nextchar(pRExC_state);
a0d0e21e
LW
5836
5837 do_curly:
5838 if ((flags&SIMPLE)) {
830247a4 5839 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5840 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5841 Set_Node_Offset(ret, parse_start+1); /* MJD */
5842 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5843 }
5844 else {
3dab1dad 5845 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5846
5847 w->flags = 0;
3dab1dad 5848 REGTAIL(pRExC_state, ret, w);
830247a4 5849 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5850 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5851 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5852 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5853 }
6bda09f9 5854 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5855 /* MJD hk */
5856 Set_Node_Offset(ret, parse_start+1);
2af232bd 5857 Set_Node_Length(ret,
fac92740 5858 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5859
830247a4 5860 if (!SIZE_ONLY && RExC_extralen)
c277df42 5861 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5862 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5863 if (SIZE_ONLY)
830247a4
IZ
5864 RExC_whilem_seen++, RExC_extralen += 3;
5865 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5866 }
c277df42 5867 ret->flags = 0;
a0d0e21e
LW
5868
5869 if (min > 0)
821b33a5
IZ
5870 *flagp = WORST;
5871 if (max > 0)
5872 *flagp |= HASWIDTH;
a0d0e21e 5873 if (max && max < min)
8615cb43 5874 vFAIL("Can't do {n,m} with n > m");
c277df42 5875 if (!SIZE_ONLY) {
eb160463
GS
5876 ARG1_SET(ret, (U16)min);
5877 ARG2_SET(ret, (U16)max);
a687059c 5878 }
a687059c 5879
a0d0e21e 5880 goto nest_check;
a687059c 5881 }
a0d0e21e 5882 }
a687059c 5883
a0d0e21e
LW
5884 if (!ISMULT1(op)) {
5885 *flagp = flags;
a687059c 5886 return(ret);
a0d0e21e 5887 }
bb20fd44 5888
c277df42 5889#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5890
5891 /* if this is reinstated, don't forget to put this back into perldiag:
5892
5893 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5894
5895 (F) The part of the regexp subject to either the * or + quantifier
5896 could match an empty string. The {#} shows in the regular
5897 expression about where the problem was discovered.
5898
5899 */
5900
bb20fd44 5901 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5902 vFAIL("Regexp *+ operand could be empty");
b81d288d 5903#endif
bb20fd44 5904
fac92740 5905 parse_start = RExC_parse;
830247a4 5906 nextchar(pRExC_state);
a0d0e21e 5907
821b33a5 5908 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5909
5910 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5911 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5912 ret->flags = 0;
830247a4 5913 RExC_naughty += 4;
a0d0e21e
LW
5914 }
5915 else if (op == '*') {
5916 min = 0;
5917 goto do_curly;
a0ed51b3
LW
5918 }
5919 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 5920 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 5921 ret->flags = 0;
830247a4 5922 RExC_naughty += 3;
a0d0e21e
LW
5923 }
5924 else if (op == '+') {
5925 min = 1;
5926 goto do_curly;
a0ed51b3
LW
5927 }
5928 else if (op == '?') {
a0d0e21e
LW
5929 min = 0; max = 1;
5930 goto do_curly;
5931 }
5932 nest_check:
041457d9 5933 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 5934 vWARN3(RExC_parse,
b45f050a 5935 "%.*s matches null string many times",
afd78fd5 5936 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 5937 origparse);
a0d0e21e
LW
5938 }
5939
b9b4dddf 5940 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 5941 nextchar(pRExC_state);
6bda09f9 5942 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 5943 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 5944 }
b9b4dddf
YO
5945#ifndef REG_ALLOW_MINMOD_SUSPEND
5946 else
5947#endif
5948 if (RExC_parse < RExC_end && *RExC_parse == '+') {
5949 regnode *ender;
5950 nextchar(pRExC_state);
5951 ender = reg_node(pRExC_state, SUCCEED);
5952 REGTAIL(pRExC_state, ret, ender);
5953 reginsert(pRExC_state, SUSPEND, ret, depth+1);
5954 ret->flags = 0;
5955 ender = reg_node(pRExC_state, TAIL);
5956 REGTAIL(pRExC_state, ret, ender);
5957 /*ret= ender;*/
5958 }
5959
5960 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 5961 RExC_parse++;
b45f050a
JF
5962 vFAIL("Nested quantifiers");
5963 }
a0d0e21e
LW
5964
5965 return(ret);
a687059c
LW
5966}
5967
fc8cd66c
YO
5968
5969/* reg_namedseq(pRExC_state,UVp)
5970
5971 This is expected to be called by a parser routine that has
5972 recognized'\N' and needs to handle the rest. RExC_parse is
5973 expected to point at the first char following the N at the time
5974 of the call.
5975
5976 If valuep is non-null then it is assumed that we are parsing inside
5977 of a charclass definition and the first codepoint in the resolved
5978 string is returned via *valuep and the routine will return NULL.
5979 In this mode if a multichar string is returned from the charnames
5980 handler a warning will be issued, and only the first char in the
5981 sequence will be examined. If the string returned is zero length
5982 then the value of *valuep is undefined and NON-NULL will
5983 be returned to indicate failure. (This will NOT be a valid pointer
5984 to a regnode.)
5985
5986 If value is null then it is assumed that we are parsing normal text
5987 and inserts a new EXACT node into the program containing the resolved
5988 string and returns a pointer to the new node. If the string is
5989 zerolength a NOTHING node is emitted.
5990
5991 On success RExC_parse is set to the char following the endbrace.
5992 Parsing failures will generate a fatal errorvia vFAIL(...)
5993
5994 NOTE: We cache all results from the charnames handler locally in
5995 the RExC_charnames hash (created on first use) to prevent a charnames
5996 handler from playing silly-buggers and returning a short string and
5997 then a long string for a given pattern. Since the regexp program
5998 size is calculated during an initial parse this would result
5999 in a buffer overrun so we cache to prevent the charname result from
6000 changing during the course of the parse.
6001
6002 */
6003STATIC regnode *
6004S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6005{
6006 char * name; /* start of the content of the name */
6007 char * endbrace; /* endbrace following the name */
6008 SV *sv_str = NULL;
6009 SV *sv_name = NULL;
6010 STRLEN len; /* this has various purposes throughout the code */
6011 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6012 regnode *ret = NULL;
6013
6014 if (*RExC_parse != '{') {
6015 vFAIL("Missing braces on \\N{}");
6016 }
6017 name = RExC_parse+1;
6018 endbrace = strchr(RExC_parse, '}');
6019 if ( ! endbrace ) {
6020 RExC_parse++;
6021 vFAIL("Missing right brace on \\N{}");
6022 }
6023 RExC_parse = endbrace + 1;
6024
6025
6026 /* RExC_parse points at the beginning brace,
6027 endbrace points at the last */
6028 if ( name[0]=='U' && name[1]=='+' ) {
6029 /* its a "unicode hex" notation {U+89AB} */
6030 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6031 | PERL_SCAN_DISALLOW_PREFIX
6032 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6033 UV cp;
196f1508 6034 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6035 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6036 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6037 cp = 0xFFFD;
6038 }
6039 if (cp > 0xff)
6040 RExC_utf8 = 1;
6041 if ( valuep ) {
6042 *valuep = cp;
6043 return NULL;
6044 }
6045 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6046 } else {
6047 /* fetch the charnames handler for this scope */
6048 HV * const table = GvHV(PL_hintgv);
6049 SV **cvp= table ?
6050 hv_fetchs(table, "charnames", FALSE) :
6051 NULL;
6052 SV *cv= cvp ? *cvp : NULL;
6053 HE *he_str;
6054 int count;
6055 /* create an SV with the name as argument */
6056 sv_name = newSVpvn(name, endbrace - name);
6057
6058 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6059 vFAIL2("Constant(\\N{%s}) unknown: "
6060 "(possibly a missing \"use charnames ...\")",
6061 SvPVX(sv_name));
6062 }
6063 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6064 vFAIL2("Constant(\\N{%s}): "
6065 "$^H{charnames} is not defined",SvPVX(sv_name));
6066 }
6067
6068
6069
6070 if (!RExC_charnames) {
6071 /* make sure our cache is allocated */
6072 RExC_charnames = newHV();
6bda09f9 6073 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6074 }
6075 /* see if we have looked this one up before */
6076 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6077 if ( he_str ) {
6078 sv_str = HeVAL(he_str);
6079 cached = 1;
6080 } else {
6081 dSP ;
6082
6083 ENTER ;
6084 SAVETMPS ;
6085 PUSHMARK(SP) ;
6086
6087 XPUSHs(sv_name);
6088
6089 PUTBACK ;
6090
6091 count= call_sv(cv, G_SCALAR);
6092
6093 if (count == 1) { /* XXXX is this right? dmq */
6094 sv_str = POPs;
6095 SvREFCNT_inc_simple_void(sv_str);
6096 }
6097
6098 SPAGAIN ;
6099 PUTBACK ;
6100 FREETMPS ;
6101 LEAVE ;
6102
6103 if ( !sv_str || !SvOK(sv_str) ) {
6104 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6105 "did not return a defined value",SvPVX(sv_name));
6106 }
6107 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6108 cached = 1;
6109 }
6110 }
6111 if (valuep) {
6112 char *p = SvPV(sv_str, len);
6113 if (len) {
6114 STRLEN numlen = 1;
6115 if ( SvUTF8(sv_str) ) {
196f1508 6116 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6117 if (*valuep > 0x7F)
6118 RExC_utf8 = 1;
6119 /* XXXX
6120 We have to turn on utf8 for high bit chars otherwise
6121 we get failures with
6122
6123 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6124 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6125
6126 This is different from what \x{} would do with the same
6127 codepoint, where the condition is > 0xFF.
6128 - dmq
6129 */
6130
6131
6132 } else {
6133 *valuep = (UV)*p;
6134 /* warn if we havent used the whole string? */
6135 }
6136 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6137 vWARN2(RExC_parse,
6138 "Ignoring excess chars from \\N{%s} in character class",
6139 SvPVX(sv_name)
6140 );
6141 }
6142 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6143 vWARN2(RExC_parse,
6144 "Ignoring zero length \\N{%s} in character class",
6145 SvPVX(sv_name)
6146 );
6147 }
6148 if (sv_name)
6149 SvREFCNT_dec(sv_name);
6150 if (!cached)
6151 SvREFCNT_dec(sv_str);
6152 return len ? NULL : (regnode *)&len;
6153 } else if(SvCUR(sv_str)) {
6154
6155 char *s;
6156 char *p, *pend;
6157 STRLEN charlen = 1;
d008bc60 6158#ifdef DEBUGGING
fc8cd66c 6159 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6160#endif
fc8cd66c
YO
6161 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6162
6163 ret = reg_node(pRExC_state,
6164 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6165 s= STRING(ret);
6166
6167 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6168 sv_utf8_upgrade(sv_str);
6169 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6170 RExC_utf8= 1;
6171 }
6172
6173 p = SvPV(sv_str, len);
6174 pend = p + len;
6175 /* len is the length written, charlen is the size the char read */
6176 for ( len = 0; p < pend; p += charlen ) {
6177 if (UTF) {
196f1508 6178 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6179 if (FOLD) {
6180 STRLEN foldlen,numlen;
6181 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6182 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6183 /* Emit all the Unicode characters. */
6184
6185 for (foldbuf = tmpbuf;
6186 foldlen;
6187 foldlen -= numlen)
6188 {
6189 uvc = utf8_to_uvchr(foldbuf, &numlen);
6190 if (numlen > 0) {
6191 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6192 s += unilen;
6193 len += unilen;
6194 /* In EBCDIC the numlen
6195 * and unilen can differ. */
6196 foldbuf += numlen;
6197 if (numlen >= foldlen)
6198 break;
6199 }
6200 else
6201 break; /* "Can't happen." */
6202 }
6203 } else {
6204 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6205 if (unilen > 0) {
6206 s += unilen;
6207 len += unilen;
6208 }
6209 }
6210 } else {
6211 len++;
6212 REGC(*p, s++);
6213 }
6214 }
6215 if (SIZE_ONLY) {
6216 RExC_size += STR_SZ(len);
6217 } else {
6218 STR_LEN(ret) = len;
6219 RExC_emit += STR_SZ(len);
6220 }
6221 Set_Node_Cur_Length(ret); /* MJD */
6222 RExC_parse--;
6223 nextchar(pRExC_state);
6224 } else {
6225 ret = reg_node(pRExC_state,NOTHING);
6226 }
6227 if (!cached) {
6228 SvREFCNT_dec(sv_str);
6229 }
6230 if (sv_name) {
6231 SvREFCNT_dec(sv_name);
6232 }
6233 return ret;
6234
6235}
6236
6237
9e08bc66
TS
6238/*
6239 * reg_recode
6240 *
6241 * It returns the code point in utf8 for the value in *encp.
6242 * value: a code value in the source encoding
6243 * encp: a pointer to an Encode object
6244 *
6245 * If the result from Encode is not a single character,
6246 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6247 */
6248STATIC UV
6249S_reg_recode(pTHX_ const char value, SV **encp)
6250{
6251 STRLEN numlen = 1;
6252 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6253 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6254 : SvPVX(sv);
6255 const STRLEN newlen = SvCUR(sv);
6256 UV uv = UNICODE_REPLACEMENT;
6257
6258 if (newlen)
6259 uv = SvUTF8(sv)
6260 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6261 : *(U8*)s;
6262
6263 if (!newlen || numlen != newlen) {
6264 uv = UNICODE_REPLACEMENT;
6265 if (encp)
6266 *encp = NULL;
6267 }
6268 return uv;
6269}
6270
fc8cd66c 6271
a687059c
LW
6272/*
6273 - regatom - the lowest level
ee9b8eae
YO
6274
6275 Try to identify anything special at the start of the pattern. If there
6276 is, then handle it as required. This may involve generating a single regop,
6277 such as for an assertion; or it may involve recursing, such as to
6278 handle a () structure.
6279
6280 If the string doesn't start with something special then we gobble up
6281 as much literal text as we can.
6282
6283 Once we have been able to handle whatever type of thing started the
6284 sequence, we return.
6285
6286 Note: we have to be careful with escapes, as they can be both literal
6287 and special, and in the case of \10 and friends can either, depending
6288 on context. Specifically there are two seperate switches for handling
6289 escape sequences, with the one for handling literal escapes requiring
6290 a dummy entry for all of the special escapes that are actually handled
6291 by the other.
6292*/
6293
76e3520e 6294STATIC regnode *
3dab1dad 6295S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6296{
97aff369 6297 dVAR;
cbbf8932 6298 register regnode *ret = NULL;
a0d0e21e 6299 I32 flags;
45948336 6300 char *parse_start = RExC_parse;
3dab1dad
YO
6301 GET_RE_DEBUG_FLAGS_DECL;
6302 DEBUG_PARSE("atom");
a0d0e21e
LW
6303 *flagp = WORST; /* Tentatively. */
6304
ee9b8eae 6305
a0d0e21e 6306tryagain:
830247a4 6307 switch (*RExC_parse) {
a0d0e21e 6308 case '^':
830247a4
IZ
6309 RExC_seen_zerolen++;
6310 nextchar(pRExC_state);
bbe252da 6311 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6312 ret = reg_node(pRExC_state, MBOL);
bbe252da 6313 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6314 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6315 else
830247a4 6316 ret = reg_node(pRExC_state, BOL);
fac92740 6317 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6318 break;
6319 case '$':
830247a4 6320 nextchar(pRExC_state);
b81d288d 6321 if (*RExC_parse)
830247a4 6322 RExC_seen_zerolen++;
bbe252da 6323 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6324 ret = reg_node(pRExC_state, MEOL);
bbe252da 6325 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6326 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6327 else
830247a4 6328 ret = reg_node(pRExC_state, EOL);
fac92740 6329 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6330 break;
6331 case '.':
830247a4 6332 nextchar(pRExC_state);
bbe252da 6333 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6334 ret = reg_node(pRExC_state, SANY);
6335 else
6336 ret = reg_node(pRExC_state, REG_ANY);
6337 *flagp |= HASWIDTH|SIMPLE;
830247a4 6338 RExC_naughty++;
fac92740 6339 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6340 break;
6341 case '[':
b45f050a 6342 {
3dab1dad
YO
6343 char * const oregcomp_parse = ++RExC_parse;
6344 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6345 if (*RExC_parse != ']') {
6346 RExC_parse = oregcomp_parse;
b45f050a
JF
6347 vFAIL("Unmatched [");
6348 }
830247a4 6349 nextchar(pRExC_state);
a0d0e21e 6350 *flagp |= HASWIDTH|SIMPLE;
fac92740 6351 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6352 break;
b45f050a 6353 }
a0d0e21e 6354 case '(':
830247a4 6355 nextchar(pRExC_state);
3dab1dad 6356 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6357 if (ret == NULL) {
bf93d4cc 6358 if (flags & TRYAGAIN) {
830247a4 6359 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6360 /* Make parent create an empty node if needed. */
6361 *flagp |= TRYAGAIN;
6362 return(NULL);
6363 }
a0d0e21e 6364 goto tryagain;
bf93d4cc 6365 }
a0d0e21e
LW
6366 return(NULL);
6367 }
c277df42 6368 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
a0d0e21e
LW
6369 break;
6370 case '|':
6371 case ')':
6372 if (flags & TRYAGAIN) {
6373 *flagp |= TRYAGAIN;
6374 return NULL;
6375 }
b45f050a 6376 vFAIL("Internal urp");
a0d0e21e
LW
6377 /* Supposed to be caught earlier. */
6378 break;
85afd4ae 6379 case '{':
830247a4
IZ
6380 if (!regcurly(RExC_parse)) {
6381 RExC_parse++;
85afd4ae
CS
6382 goto defchar;
6383 }
6384 /* FALL THROUGH */
a0d0e21e
LW
6385 case '?':
6386 case '+':
6387 case '*':
830247a4 6388 RExC_parse++;
b45f050a 6389 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6390 break;
6391 case '\\':
ee9b8eae
YO
6392 /* Special Escapes
6393
6394 This switch handles escape sequences that resolve to some kind
6395 of special regop and not to literal text. Escape sequnces that
6396 resolve to literal text are handled below in the switch marked
6397 "Literal Escapes".
6398
6399 Every entry in this switch *must* have a corresponding entry
6400 in the literal escape switch. However, the opposite is not
6401 required, as the default for this switch is to jump to the
6402 literal text handling code.
6403 */
830247a4 6404 switch (*++RExC_parse) {
ee9b8eae 6405 /* Special Escapes */
a0d0e21e 6406 case 'A':
830247a4
IZ
6407 RExC_seen_zerolen++;
6408 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6409 *flagp |= SIMPLE;
ee9b8eae 6410 goto finish_meta_pat;
a0d0e21e 6411 case 'G':
830247a4
IZ
6412 ret = reg_node(pRExC_state, GPOS);
6413 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6414 *flagp |= SIMPLE;
ee9b8eae
YO
6415 goto finish_meta_pat;
6416 case 'K':
6417 RExC_seen_zerolen++;
6418 ret = reg_node(pRExC_state, KEEPS);
6419 *flagp |= SIMPLE;
6420 goto finish_meta_pat;
a0d0e21e 6421 case 'Z':
830247a4 6422 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6423 *flagp |= SIMPLE;
a1917ab9 6424 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6425 goto finish_meta_pat;
b85d18e9 6426 case 'z':
830247a4 6427 ret = reg_node(pRExC_state, EOS);
b85d18e9 6428 *flagp |= SIMPLE;
830247a4 6429 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6430 goto finish_meta_pat;
4a2d328f 6431 case 'C':
f33976b4
DB
6432 ret = reg_node(pRExC_state, CANY);
6433 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6434 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6435 goto finish_meta_pat;
a0ed51b3 6436 case 'X':
830247a4 6437 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6438 *flagp |= HASWIDTH;
ee9b8eae 6439 goto finish_meta_pat;
a0d0e21e 6440 case 'w':
eb160463 6441 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6442 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6443 goto finish_meta_pat;
a0d0e21e 6444 case 'W':
eb160463 6445 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6446 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6447 goto finish_meta_pat;
a0d0e21e 6448 case 'b':
830247a4
IZ
6449 RExC_seen_zerolen++;
6450 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6451 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6452 *flagp |= SIMPLE;
ee9b8eae 6453 goto finish_meta_pat;
a0d0e21e 6454 case 'B':
830247a4
IZ
6455 RExC_seen_zerolen++;
6456 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6457 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6458 *flagp |= SIMPLE;
ee9b8eae 6459 goto finish_meta_pat;
a0d0e21e 6460 case 's':
eb160463 6461 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6462 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6463 goto finish_meta_pat;
a0d0e21e 6464 case 'S':
eb160463 6465 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6466 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6467 goto finish_meta_pat;
a0d0e21e 6468 case 'd':
ffc61ed2 6469 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6470 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6471 goto finish_meta_pat;
a0d0e21e 6472 case 'D':
ffc61ed2 6473 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6474 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
6475 goto finish_meta_pat;
6476 case 'v':
6477 ret = reganode(pRExC_state, PRUNE, 0);
6478 ret->flags = 1;
6479 *flagp |= SIMPLE;
6480 goto finish_meta_pat;
6481 case 'V':
6482 ret = reganode(pRExC_state, SKIP, 0);
6483 ret->flags = 1;
6484 *flagp |= SIMPLE;
6485 finish_meta_pat:
830247a4 6486 nextchar(pRExC_state);
fac92740 6487 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 6488 break;
a14b48bc
LW
6489 case 'p':
6490 case 'P':
3568d838 6491 {
3dab1dad 6492 char* const oldregxend = RExC_end;
d008bc60 6493#ifdef DEBUGGING
ccb2c380 6494 char* parse_start = RExC_parse - 2;
d008bc60 6495#endif
a14b48bc 6496
830247a4 6497 if (RExC_parse[1] == '{') {
3568d838 6498 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6499 RExC_end = strchr(RExC_parse, '}');
6500 if (!RExC_end) {
3dab1dad 6501 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6502 RExC_parse += 2;
6503 RExC_end = oldregxend;
0da60cf5 6504 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6505 }
830247a4 6506 RExC_end++;
a14b48bc 6507 }
af6f566e 6508 else {
830247a4 6509 RExC_end = RExC_parse + 2;
af6f566e
HS
6510 if (RExC_end > oldregxend)
6511 RExC_end = oldregxend;
6512 }
830247a4 6513 RExC_parse--;
a14b48bc 6514
3dab1dad 6515 ret = regclass(pRExC_state,depth+1);
a14b48bc 6516
830247a4
IZ
6517 RExC_end = oldregxend;
6518 RExC_parse--;
ccb2c380
MP
6519
6520 Set_Node_Offset(ret, parse_start + 2);
6521 Set_Node_Cur_Length(ret);
830247a4 6522 nextchar(pRExC_state);
a14b48bc
LW
6523 *flagp |= HASWIDTH|SIMPLE;
6524 }
6525 break;
fc8cd66c
YO
6526 case 'N':
6527 /* Handle \N{NAME} here and not below because it can be
6528 multicharacter. join_exact() will join them up later on.
6529 Also this makes sure that things like /\N{BLAH}+/ and
6530 \N{BLAH} being multi char Just Happen. dmq*/
6531 ++RExC_parse;
6532 ret= reg_namedseq(pRExC_state, NULL);
6533 break;
0a4db386 6534 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 6535 parse_named_seq:
81714fb9
YO
6536 {
6537 char ch= RExC_parse[1];
1f1031fe
YO
6538 if (ch != '<' && ch != '\'' && ch != '{') {
6539 RExC_parse++;
6540 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 6541 } else {
1f1031fe
YO
6542 /* this pretty much dupes the code for (?P=...) in reg(), if
6543 you change this make sure you change that */
81714fb9 6544 char* name_start = (RExC_parse += 2);
2eccd3b2 6545 U32 num = 0;
0a4db386
YO
6546 SV *sv_dat = reg_scan_name(pRExC_state,
6547 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 6548 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 6549 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
6550 vFAIL2("Sequence %.3s... not terminated",parse_start);
6551
6552 if (!SIZE_ONLY) {
6553 num = add_data( pRExC_state, 1, "S" );
6554 RExC_rxi->data->data[num]=(void*)sv_dat;
6555 SvREFCNT_inc(sv_dat);
6556 }
6557
81714fb9
YO
6558 RExC_sawback = 1;
6559 ret = reganode(pRExC_state,
6560 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6561 num);
6562 *flagp |= HASWIDTH;
1f1031fe 6563
81714fb9
YO
6564 /* override incorrect value set in reganode MJD */
6565 Set_Node_Offset(ret, parse_start+1);
6566 Set_Node_Cur_Length(ret); /* MJD */
6567 nextchar(pRExC_state);
1f1031fe 6568
81714fb9
YO
6569 }
6570 break;
1f1031fe 6571 }
2bf803e2 6572 case 'g':
a0d0e21e
LW
6573 case '1': case '2': case '3': case '4':
6574 case '5': case '6': case '7': case '8': case '9':
6575 {
c74340f9 6576 I32 num;
2bf803e2
YO
6577 bool isg = *RExC_parse == 'g';
6578 bool isrel = 0;
6579 bool hasbrace = 0;
6580 if (isg) {
c74340f9 6581 RExC_parse++;
2bf803e2
YO
6582 if (*RExC_parse == '{') {
6583 RExC_parse++;
6584 hasbrace = 1;
6585 }
6586 if (*RExC_parse == '-') {
6587 RExC_parse++;
6588 isrel = 1;
6589 }
1f1031fe
YO
6590 if (hasbrace && !isDIGIT(*RExC_parse)) {
6591 if (isrel) RExC_parse--;
6592 RExC_parse -= 2;
6593 goto parse_named_seq;
6594 } }
c74340f9
YO
6595 num = atoi(RExC_parse);
6596 if (isrel) {
5624f11d 6597 num = RExC_npar - num;
c74340f9
YO
6598 if (num < 1)
6599 vFAIL("Reference to nonexistent or unclosed group");
6600 }
2bf803e2 6601 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6602 goto defchar;
6603 else {
3dab1dad 6604 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6605 while (isDIGIT(*RExC_parse))
6606 RExC_parse++;
1f1031fe
YO
6607 if (parse_start == RExC_parse - 1)
6608 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
6609 if (hasbrace) {
6610 if (*RExC_parse != '}')
6611 vFAIL("Unterminated \\g{...} pattern");
6612 RExC_parse++;
6613 }
c74340f9
YO
6614 if (!SIZE_ONLY) {
6615 if (num > (I32)RExC_rx->nparens)
6616 vFAIL("Reference to nonexistent group");
c74340f9 6617 }
830247a4 6618 RExC_sawback = 1;
eb160463
GS
6619 ret = reganode(pRExC_state,
6620 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6621 num);
a0d0e21e 6622 *flagp |= HASWIDTH;
2af232bd 6623
fac92740 6624 /* override incorrect value set in reganode MJD */
2af232bd 6625 Set_Node_Offset(ret, parse_start+1);
fac92740 6626 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6627 RExC_parse--;
6628 nextchar(pRExC_state);
a0d0e21e
LW
6629 }
6630 }
6631 break;
6632 case '\0':
830247a4 6633 if (RExC_parse >= RExC_end)
b45f050a 6634 FAIL("Trailing \\");
a0d0e21e
LW
6635 /* FALL THROUGH */
6636 default:
a0288114 6637 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6638 back into the quick-grab loop below */
45948336 6639 parse_start--;
a0d0e21e
LW
6640 goto defchar;
6641 }
6642 break;
4633a7c4
LW
6643
6644 case '#':
bbe252da 6645 if (RExC_flags & RXf_PMf_EXTENDED) {
3dab1dad
YO
6646 while (RExC_parse < RExC_end && *RExC_parse != '\n')
6647 RExC_parse++;
830247a4 6648 if (RExC_parse < RExC_end)
4633a7c4
LW
6649 goto tryagain;
6650 }
6651 /* FALL THROUGH */
6652
a0d0e21e 6653 default: {
ba210ebe 6654 register STRLEN len;
58ae7d3f 6655 register UV ender;
a0d0e21e 6656 register char *p;
3dab1dad 6657 char *s;
80aecb99 6658 STRLEN foldlen;
89ebb4a3 6659 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6660
6661 parse_start = RExC_parse - 1;
a0d0e21e 6662
830247a4 6663 RExC_parse++;
a0d0e21e
LW
6664
6665 defchar:
58ae7d3f 6666 ender = 0;
eb160463
GS
6667 ret = reg_node(pRExC_state,
6668 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6669 s = STRING(ret);
830247a4
IZ
6670 for (len = 0, p = RExC_parse - 1;
6671 len < 127 && p < RExC_end;
a0d0e21e
LW
6672 len++)
6673 {
3dab1dad 6674 char * const oldp = p;
5b5a24f7 6675
bbe252da 6676 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6677 p = regwhite(p, RExC_end);
a0d0e21e
LW
6678 switch (*p) {
6679 case '^':
6680 case '$':
6681 case '.':
6682 case '[':
6683 case '(':
6684 case ')':
6685 case '|':
6686 goto loopdone;
6687 case '\\':
ee9b8eae
YO
6688 /* Literal Escapes Switch
6689
6690 This switch is meant to handle escape sequences that
6691 resolve to a literal character.
6692
6693 Every escape sequence that represents something
6694 else, like an assertion or a char class, is handled
6695 in the switch marked 'Special Escapes' above in this
6696 routine, but also has an entry here as anything that
6697 isn't explicitly mentioned here will be treated as
6698 an unescaped equivalent literal.
6699 */
6700
a0d0e21e 6701 switch (*++p) {
ee9b8eae
YO
6702 /* These are all the special escapes. */
6703 case 'A': /* Start assertion */
6704 case 'b': case 'B': /* Word-boundary assertion*/
6705 case 'C': /* Single char !DANGEROUS! */
6706 case 'd': case 'D': /* digit class */
6707 case 'g': case 'G': /* generic-backref, pos assertion */
6708 case 'k': case 'K': /* named backref, keep marker */
6709 case 'N': /* named char sequence */
6710 case 'p': case 'P': /* unicode property */
6711 case 's': case 'S': /* space class */
6712 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6713 case 'w': case 'W': /* word class */
6714 case 'X': /* eXtended Unicode "combining character sequence" */
6715 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
6716 --p;
6717 goto loopdone;
ee9b8eae
YO
6718
6719 /* Anything after here is an escape that resolves to a
6720 literal. (Except digits, which may or may not)
6721 */
a0d0e21e
LW
6722 case 'n':
6723 ender = '\n';
6724 p++;
a687059c 6725 break;
a0d0e21e
LW
6726 case 'r':
6727 ender = '\r';
6728 p++;
a687059c 6729 break;
a0d0e21e
LW
6730 case 't':
6731 ender = '\t';
6732 p++;
a687059c 6733 break;
a0d0e21e
LW
6734 case 'f':
6735 ender = '\f';
6736 p++;
a687059c 6737 break;
a0d0e21e 6738 case 'e':
c7f1f016 6739 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6740 p++;
a687059c 6741 break;
a0d0e21e 6742 case 'a':
c7f1f016 6743 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6744 p++;
a687059c 6745 break;
a0d0e21e 6746 case 'x':
a0ed51b3 6747 if (*++p == '{') {
1df70142 6748 char* const e = strchr(p, '}');
b81d288d 6749
b45f050a 6750 if (!e) {
830247a4 6751 RExC_parse = p + 1;
b45f050a
JF
6752 vFAIL("Missing right brace on \\x{}");
6753 }
de5f0749 6754 else {
a4c04bdc
NC
6755 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6756 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6757 STRLEN numlen = e - p - 1;
53305cf1 6758 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6759 if (ender > 0xff)
6760 RExC_utf8 = 1;
a0ed51b3
LW
6761 p = e + 1;
6762 }
a0ed51b3
LW
6763 }
6764 else {
a4c04bdc 6765 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6766 STRLEN numlen = 2;
53305cf1 6767 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6768 p += numlen;
6769 }
9e08bc66
TS
6770 if (PL_encoding && ender < 0x100)
6771 goto recode_encoding;
a687059c 6772 break;
a0d0e21e
LW
6773 case 'c':
6774 p++;
bbce6d69 6775 ender = UCHARAT(p++);
6776 ender = toCTRL(ender);
a687059c 6777 break;
a0d0e21e
LW
6778 case '0': case '1': case '2': case '3':case '4':
6779 case '5': case '6': case '7': case '8':case '9':
6780 if (*p == '0' ||
830247a4 6781 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6782 I32 flags = 0;
1df70142 6783 STRLEN numlen = 3;
53305cf1 6784 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6785 p += numlen;
6786 }
6787 else {
6788 --p;
6789 goto loopdone;
a687059c 6790 }
9e08bc66
TS
6791 if (PL_encoding && ender < 0x100)
6792 goto recode_encoding;
6793 break;
6794 recode_encoding:
6795 {
6796 SV* enc = PL_encoding;
6797 ender = reg_recode((const char)(U8)ender, &enc);
6798 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6799 vWARN(p, "Invalid escape in the specified encoding");
6800 RExC_utf8 = 1;
6801 }
a687059c 6802 break;
a0d0e21e 6803 case '\0':
830247a4 6804 if (p >= RExC_end)
b45f050a 6805 FAIL("Trailing \\");
a687059c 6806 /* FALL THROUGH */
a0d0e21e 6807 default:
041457d9 6808 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6809 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6810 goto normal_default;
a0d0e21e
LW
6811 }
6812 break;
a687059c 6813 default:
a0ed51b3 6814 normal_default:
fd400ab9 6815 if (UTF8_IS_START(*p) && UTF) {
1df70142 6816 STRLEN numlen;
5e12f4fb 6817 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6818 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6819 p += numlen;
6820 }
6821 else
6822 ender = *p++;
a0d0e21e 6823 break;
a687059c 6824 }
bbe252da 6825 if (RExC_flags & RXf_PMf_EXTENDED)
830247a4 6826 p = regwhite(p, RExC_end);
60a8b682
JH
6827 if (UTF && FOLD) {
6828 /* Prime the casefolded buffer. */
ac7e0132 6829 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6830 }
a0d0e21e
LW
6831 if (ISMULT2(p)) { /* Back off on ?+*. */
6832 if (len)
6833 p = oldp;
16ea2a2e 6834 else if (UTF) {
80aecb99 6835 if (FOLD) {
60a8b682 6836 /* Emit all the Unicode characters. */
1df70142 6837 STRLEN numlen;
80aecb99
JH
6838 for (foldbuf = tmpbuf;
6839 foldlen;
6840 foldlen -= numlen) {
6841 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6842 if (numlen > 0) {
71207a34 6843 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6844 s += unilen;
6845 len += unilen;
6846 /* In EBCDIC the numlen
6847 * and unilen can differ. */
9dc45d57 6848 foldbuf += numlen;
47654450
JH
6849 if (numlen >= foldlen)
6850 break;
9dc45d57
JH
6851 }
6852 else
6853 break; /* "Can't happen." */
80aecb99
JH
6854 }
6855 }
6856 else {
71207a34 6857 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6858 if (unilen > 0) {
0ebc6274
JH
6859 s += unilen;
6860 len += unilen;
9dc45d57 6861 }
80aecb99 6862 }
a0ed51b3 6863 }
a0d0e21e
LW
6864 else {
6865 len++;
eb160463 6866 REGC((char)ender, s++);
a0d0e21e
LW
6867 }
6868 break;
a687059c 6869 }
16ea2a2e 6870 if (UTF) {
80aecb99 6871 if (FOLD) {
60a8b682 6872 /* Emit all the Unicode characters. */
1df70142 6873 STRLEN numlen;
80aecb99
JH
6874 for (foldbuf = tmpbuf;
6875 foldlen;
6876 foldlen -= numlen) {
6877 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6878 if (numlen > 0) {
71207a34 6879 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6880 len += unilen;
6881 s += unilen;
6882 /* In EBCDIC the numlen
6883 * and unilen can differ. */
9dc45d57 6884 foldbuf += numlen;
47654450
JH
6885 if (numlen >= foldlen)
6886 break;
9dc45d57
JH
6887 }
6888 else
6889 break;
80aecb99
JH
6890 }
6891 }
6892 else {
71207a34 6893 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6894 if (unilen > 0) {
0ebc6274
JH
6895 s += unilen;
6896 len += unilen;
9dc45d57 6897 }
80aecb99
JH
6898 }
6899 len--;
a0ed51b3
LW
6900 }
6901 else
eb160463 6902 REGC((char)ender, s++);
a0d0e21e
LW
6903 }
6904 loopdone:
830247a4 6905 RExC_parse = p - 1;
fac92740 6906 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6907 nextchar(pRExC_state);
793db0cb
JH
6908 {
6909 /* len is STRLEN which is unsigned, need to copy to signed */
6910 IV iv = len;
6911 if (iv < 0)
6912 vFAIL("Internal disaster");
6913 }
a0d0e21e
LW
6914 if (len > 0)
6915 *flagp |= HASWIDTH;
090f7165 6916 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6917 *flagp |= SIMPLE;
3dab1dad 6918
cd439c50 6919 if (SIZE_ONLY)
830247a4 6920 RExC_size += STR_SZ(len);
3dab1dad
YO
6921 else {
6922 STR_LEN(ret) = len;
830247a4 6923 RExC_emit += STR_SZ(len);
07be1b83 6924 }
3dab1dad 6925 }
a0d0e21e
LW
6926 break;
6927 }
a687059c 6928
a0d0e21e 6929 return(ret);
a687059c
LW
6930}
6931
873ef191 6932STATIC char *
5f66b61c 6933S_regwhite(char *p, const char *e)
5b5a24f7
CS
6934{
6935 while (p < e) {
6936 if (isSPACE(*p))
6937 ++p;
6938 else if (*p == '#') {
6939 do {
6940 p++;
6941 } while (p < e && *p != '\n');
6942 }
6943 else
6944 break;
6945 }
6946 return p;
6947}
6948
b8c5462f
JH
6949/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
6950 Character classes ([:foo:]) can also be negated ([:^foo:]).
6951 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
6952 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 6953 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
6954
6955#define POSIXCC_DONE(c) ((c) == ':')
6956#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
6957#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
6958
b8c5462f 6959STATIC I32
830247a4 6960S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 6961{
97aff369 6962 dVAR;
936ed897 6963 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 6964
830247a4 6965 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 6966 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 6967 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 6968 const char c = UCHARAT(RExC_parse);
097eb12c 6969 char* const s = RExC_parse++;
b81d288d 6970
9a86a77b 6971 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
6972 RExC_parse++;
6973 if (RExC_parse == RExC_end)
620e46c5 6974 /* Grandfather lone [:, [=, [. */
830247a4 6975 RExC_parse = s;
620e46c5 6976 else {
3dab1dad 6977 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
6978 assert(*t == c);
6979
9a86a77b 6980 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 6981 const char *posixcc = s + 1;
830247a4 6982 RExC_parse++; /* skip over the ending ] */
3dab1dad 6983
b8c5462f 6984 if (*s == ':') {
1df70142
AL
6985 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
6986 const I32 skip = t - posixcc;
80916619
NC
6987
6988 /* Initially switch on the length of the name. */
6989 switch (skip) {
6990 case 4:
3dab1dad
YO
6991 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
6992 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 6993 break;
80916619
NC
6994 case 5:
6995 /* Names all of length 5. */
6996 /* alnum alpha ascii blank cntrl digit graph lower
6997 print punct space upper */
6998 /* Offset 4 gives the best switch position. */
6999 switch (posixcc[4]) {
7000 case 'a':
3dab1dad
YO
7001 if (memEQ(posixcc, "alph", 4)) /* alpha */
7002 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7003 break;
7004 case 'e':
3dab1dad
YO
7005 if (memEQ(posixcc, "spac", 4)) /* space */
7006 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7007 break;
7008 case 'h':
3dab1dad
YO
7009 if (memEQ(posixcc, "grap", 4)) /* graph */
7010 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7011 break;
7012 case 'i':
3dab1dad
YO
7013 if (memEQ(posixcc, "asci", 4)) /* ascii */
7014 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7015 break;
7016 case 'k':
3dab1dad
YO
7017 if (memEQ(posixcc, "blan", 4)) /* blank */
7018 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7019 break;
7020 case 'l':
3dab1dad
YO
7021 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7022 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7023 break;
7024 case 'm':
3dab1dad
YO
7025 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7026 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7027 break;
7028 case 'r':
3dab1dad
YO
7029 if (memEQ(posixcc, "lowe", 4)) /* lower */
7030 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7031 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7032 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7033 break;
7034 case 't':
3dab1dad
YO
7035 if (memEQ(posixcc, "digi", 4)) /* digit */
7036 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7037 else if (memEQ(posixcc, "prin", 4)) /* print */
7038 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7039 else if (memEQ(posixcc, "punc", 4)) /* punct */
7040 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7041 break;
b8c5462f
JH
7042 }
7043 break;
80916619 7044 case 6:
3dab1dad
YO
7045 if (memEQ(posixcc, "xdigit", 6))
7046 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7047 break;
7048 }
80916619
NC
7049
7050 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7051 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7052 t - s - 1, s + 1);
80916619
NC
7053 assert (posixcc[skip] == ':');
7054 assert (posixcc[skip+1] == ']');
b45f050a 7055 } else if (!SIZE_ONLY) {
b8c5462f 7056 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7057
830247a4 7058 /* adjust RExC_parse so the warning shows after
b45f050a 7059 the class closes */
9a86a77b 7060 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7061 RExC_parse++;
b45f050a
JF
7062 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7063 }
b8c5462f
JH
7064 } else {
7065 /* Maternal grandfather:
7066 * "[:" ending in ":" but not in ":]" */
830247a4 7067 RExC_parse = s;
767d463e 7068 }
620e46c5
JH
7069 }
7070 }
7071
b8c5462f
JH
7072 return namedclass;
7073}
7074
7075STATIC void
830247a4 7076S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7077{
97aff369 7078 dVAR;
3dab1dad 7079 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7080 const char *s = RExC_parse;
7081 const char c = *s++;
b8c5462f 7082
3dab1dad 7083 while (isALNUM(*s))
b8c5462f
JH
7084 s++;
7085 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7086 if (ckWARN(WARN_REGEXP))
7087 vWARN3(s+2,
7088 "POSIX syntax [%c %c] belongs inside character classes",
7089 c, c);
b45f050a
JF
7090
7091 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7092 if (POSIXCC_NOTYET(c)) {
830247a4 7093 /* adjust RExC_parse so the error shows after
b45f050a 7094 the class closes */
9a86a77b 7095 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7096 NOOP;
b45f050a
JF
7097 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7098 }
b8c5462f
JH
7099 }
7100 }
620e46c5
JH
7101}
7102
7f6f358c 7103
89836f1f
YO
7104#define _C_C_T_(NAME,TEST,WORD) \
7105ANYOF_##NAME: \
7106 if (LOC) \
7107 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7108 else { \
7109 for (value = 0; value < 256; value++) \
7110 if (TEST) \
7111 ANYOF_BITMAP_SET(ret, value); \
7112 } \
7113 yesno = '+'; \
7114 what = WORD; \
7115 break; \
7116case ANYOF_N##NAME: \
7117 if (LOC) \
7118 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7119 else { \
7120 for (value = 0; value < 256; value++) \
7121 if (!TEST) \
7122 ANYOF_BITMAP_SET(ret, value); \
7123 } \
7124 yesno = '!'; \
7125 what = WORD; \
7126 break
7127
7128
7f6f358c
YO
7129/*
7130 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7131 matches the pattern or if the pattern matches a single char only and
7132 that char is < 256 and we are case insensitive then we produce an
7133 EXACT node instead.
7f6f358c 7134*/
89836f1f 7135
76e3520e 7136STATIC regnode *
3dab1dad 7137S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7138{
97aff369 7139 dVAR;
9ef43ace 7140 register UV value = 0;
9a86a77b 7141 register UV nextvalue;
3568d838 7142 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7143 register IV range = 0;
c277df42 7144 register regnode *ret;
ba210ebe 7145 STRLEN numlen;
ffc61ed2 7146 IV namedclass;
cbbf8932 7147 char *rangebegin = NULL;
936ed897 7148 bool need_class = 0;
c445ea15 7149 SV *listsv = NULL;
ffc61ed2 7150 UV n;
9e55ce06 7151 bool optimize_invert = TRUE;
cbbf8932 7152 AV* unicode_alternate = NULL;
1b2d223b
JH
7153#ifdef EBCDIC
7154 UV literal_endpoint = 0;
7155#endif
7f6f358c 7156 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7157
3dab1dad 7158 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7159 case we need to change the emitted regop to an EXACT. */
07be1b83 7160 const char * orig_parse = RExC_parse;
72f13be8 7161 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7162#ifndef DEBUGGING
7163 PERL_UNUSED_ARG(depth);
7164#endif
72f13be8 7165
3dab1dad 7166 DEBUG_PARSE("clas");
7f6f358c
YO
7167
7168 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7169 ret = reganode(pRExC_state, ANYOF, 0);
7170
7171 if (!SIZE_ONLY)
7172 ANYOF_FLAGS(ret) = 0;
7173
9a86a77b 7174 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7175 RExC_naughty++;
7176 RExC_parse++;
7177 if (!SIZE_ONLY)
7178 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7179 }
a0d0e21e 7180
73060fc4 7181 if (SIZE_ONLY) {
830247a4 7182 RExC_size += ANYOF_SKIP;
73060fc4
JH
7183 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7184 }
936ed897 7185 else {
830247a4 7186 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7187 if (FOLD)
7188 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7189 if (LOC)
7190 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7191 ANYOF_BITMAP_ZERO(ret);
396482e1 7192 listsv = newSVpvs("# comment\n");
a0d0e21e 7193 }
b8c5462f 7194
9a86a77b
JH
7195 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7196
b938889d 7197 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7198 checkposixcc(pRExC_state);
b8c5462f 7199
f064b6ad
HS
7200 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7201 if (UCHARAT(RExC_parse) == ']')
7202 goto charclassloop;
ffc61ed2 7203
fc8cd66c 7204parseit:
9a86a77b 7205 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7206
7207 charclassloop:
7208
7209 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7210
73b437c8 7211 if (!range)
830247a4 7212 rangebegin = RExC_parse;
ffc61ed2 7213 if (UTF) {
5e12f4fb 7214 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7215 RExC_end - RExC_parse,
9f7f3913 7216 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7217 RExC_parse += numlen;
7218 }
7219 else
7220 value = UCHARAT(RExC_parse++);
7f6f358c 7221
9a86a77b
JH
7222 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7223 if (value == '[' && POSIXCC(nextvalue))
830247a4 7224 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7225 else if (value == '\\') {
ffc61ed2 7226 if (UTF) {
5e12f4fb 7227 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7228 RExC_end - RExC_parse,
9f7f3913 7229 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7230 RExC_parse += numlen;
7231 }
7232 else
7233 value = UCHARAT(RExC_parse++);
470c3474 7234 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7235 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7236 * be a problem later if we want switch on Unicode.
7237 * A similar issue a little bit later when switching on
7238 * namedclass. --jhi */
ffc61ed2 7239 switch ((I32)value) {
b8c5462f
JH
7240 case 'w': namedclass = ANYOF_ALNUM; break;
7241 case 'W': namedclass = ANYOF_NALNUM; break;
7242 case 's': namedclass = ANYOF_SPACE; break;
7243 case 'S': namedclass = ANYOF_NSPACE; break;
7244 case 'd': namedclass = ANYOF_DIGIT; break;
7245 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7246 case 'N': /* Handle \N{NAME} in class */
7247 {
7248 /* We only pay attention to the first char of
7249 multichar strings being returned. I kinda wonder
7250 if this makes sense as it does change the behaviour
7251 from earlier versions, OTOH that behaviour was broken
7252 as well. */
7253 UV v; /* value is register so we cant & it /grrr */
7254 if (reg_namedseq(pRExC_state, &v)) {
7255 goto parseit;
7256 }
7257 value= v;
7258 }
7259 break;
ffc61ed2
JH
7260 case 'p':
7261 case 'P':
3dab1dad
YO
7262 {
7263 char *e;
af6f566e 7264 if (RExC_parse >= RExC_end)
2a4859cd 7265 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7266 if (*RExC_parse == '{') {
1df70142 7267 const U8 c = (U8)value;
ffc61ed2
JH
7268 e = strchr(RExC_parse++, '}');
7269 if (!e)
0da60cf5 7270 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7271 while (isSPACE(UCHARAT(RExC_parse)))
7272 RExC_parse++;
7273 if (e == RExC_parse)
0da60cf5 7274 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7275 n = e - RExC_parse;
ab13f0c7
JH
7276 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7277 n--;
ffc61ed2
JH
7278 }
7279 else {
7280 e = RExC_parse;
7281 n = 1;
7282 }
7283 if (!SIZE_ONLY) {
ab13f0c7
JH
7284 if (UCHARAT(RExC_parse) == '^') {
7285 RExC_parse++;
7286 n--;
7287 value = value == 'p' ? 'P' : 'p'; /* toggle */
7288 while (isSPACE(UCHARAT(RExC_parse))) {
7289 RExC_parse++;
7290 n--;
7291 }
7292 }
097eb12c
AL
7293 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7294 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7295 }
7296 RExC_parse = e + 1;
7297 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7298 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7299 }
f81125e2 7300 break;
b8c5462f
JH
7301 case 'n': value = '\n'; break;
7302 case 'r': value = '\r'; break;
7303 case 't': value = '\t'; break;
7304 case 'f': value = '\f'; break;
7305 case 'b': value = '\b'; break;
c7f1f016
NIS
7306 case 'e': value = ASCII_TO_NATIVE('\033');break;
7307 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7308 case 'x':
ffc61ed2 7309 if (*RExC_parse == '{') {
a4c04bdc
NC
7310 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7311 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7312 char * const e = strchr(RExC_parse++, '}');
b81d288d 7313 if (!e)
ffc61ed2 7314 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7315
7316 numlen = e - RExC_parse;
7317 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7318 RExC_parse = e + 1;
7319 }
7320 else {
a4c04bdc 7321 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7322 numlen = 2;
7323 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7324 RExC_parse += numlen;
7325 }
9e08bc66
TS
7326 if (PL_encoding && value < 0x100)
7327 goto recode_encoding;
b8c5462f
JH
7328 break;
7329 case 'c':
830247a4 7330 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7331 value = toCTRL(value);
7332 break;
7333 case '0': case '1': case '2': case '3': case '4':
7334 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7335 {
7336 I32 flags = 0;
7337 numlen = 3;
7338 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7339 RExC_parse += numlen;
7340 if (PL_encoding && value < 0x100)
7341 goto recode_encoding;
7342 break;
7343 }
7344 recode_encoding:
7345 {
7346 SV* enc = PL_encoding;
7347 value = reg_recode((const char)(U8)value, &enc);
7348 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7349 vWARN(RExC_parse,
7350 "Invalid escape in the specified encoding");
7351 break;
7352 }
1028017a 7353 default:
041457d9 7354 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7355 vWARN2(RExC_parse,
7356 "Unrecognized escape \\%c in character class passed through",
7357 (int)value);
1028017a 7358 break;
b8c5462f 7359 }
ffc61ed2 7360 } /* end of \blah */
1b2d223b
JH
7361#ifdef EBCDIC
7362 else
7363 literal_endpoint++;
7364#endif
ffc61ed2
JH
7365
7366 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7367
7368 if (!SIZE_ONLY && !need_class)
936ed897 7369 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7370
936ed897 7371 need_class = 1;
ffc61ed2
JH
7372
7373 /* a bad range like a-\d, a-[:digit:] ? */
7374 if (range) {
73b437c8 7375 if (!SIZE_ONLY) {
afd78fd5 7376 if (ckWARN(WARN_REGEXP)) {
097eb12c 7377 const int w =
afd78fd5
JH
7378 RExC_parse >= rangebegin ?
7379 RExC_parse - rangebegin : 0;
830247a4 7380 vWARN4(RExC_parse,
b45f050a 7381 "False [] range \"%*.*s\"",
097eb12c 7382 w, w, rangebegin);
afd78fd5 7383 }
3568d838
JH
7384 if (prevvalue < 256) {
7385 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7386 ANYOF_BITMAP_SET(ret, '-');
7387 }
7388 else {
7389 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7390 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7391 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7392 }
b8c5462f 7393 }
ffc61ed2
JH
7394
7395 range = 0; /* this was not a true range */
73b437c8 7396 }
ffc61ed2 7397
89836f1f
YO
7398
7399
73b437c8 7400 if (!SIZE_ONLY) {
c49a72a9
NC
7401 const char *what = NULL;
7402 char yesno = 0;
7403
3568d838
JH
7404 if (namedclass > OOB_NAMEDCLASS)
7405 optimize_invert = FALSE;
e2962f66
JH
7406 /* Possible truncation here but in some 64-bit environments
7407 * the compiler gets heartburn about switch on 64-bit values.
7408 * A similar issue a little earlier when switching on value.
98f323fa 7409 * --jhi */
e2962f66 7410 switch ((I32)namedclass) {
89836f1f
YO
7411 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7412 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7413 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7414 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7415 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7416 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7417 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7418 case _C_C_T_(PRINT, isPRINT(value), "Print");
7419 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7420 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7421 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7422 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7423 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
73b437c8
JH
7424 case ANYOF_ASCII:
7425 if (LOC)
936ed897 7426 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7427 else {
c7f1f016 7428#ifndef EBCDIC
1ba5c669
JH
7429 for (value = 0; value < 128; value++)
7430 ANYOF_BITMAP_SET(ret, value);
7431#else /* EBCDIC */
ffbc6a93 7432 for (value = 0; value < 256; value++) {
3a3c4447
JH
7433 if (isASCII(value))
7434 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7435 }
1ba5c669 7436#endif /* EBCDIC */
73b437c8 7437 }
c49a72a9
NC
7438 yesno = '+';
7439 what = "ASCII";
73b437c8
JH
7440 break;
7441 case ANYOF_NASCII:
7442 if (LOC)
936ed897 7443 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7444 else {
c7f1f016 7445#ifndef EBCDIC
1ba5c669
JH
7446 for (value = 128; value < 256; value++)
7447 ANYOF_BITMAP_SET(ret, value);
7448#else /* EBCDIC */
ffbc6a93 7449 for (value = 0; value < 256; value++) {
3a3c4447
JH
7450 if (!isASCII(value))
7451 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7452 }
1ba5c669 7453#endif /* EBCDIC */
73b437c8 7454 }
c49a72a9
NC
7455 yesno = '!';
7456 what = "ASCII";
89836f1f 7457 break;
ffc61ed2
JH
7458 case ANYOF_DIGIT:
7459 if (LOC)
7460 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7461 else {
7462 /* consecutive digits assumed */
7463 for (value = '0'; value <= '9'; value++)
7464 ANYOF_BITMAP_SET(ret, value);
7465 }
c49a72a9
NC
7466 yesno = '+';
7467 what = "Digit";
ffc61ed2
JH
7468 break;
7469 case ANYOF_NDIGIT:
7470 if (LOC)
7471 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7472 else {
7473 /* consecutive digits assumed */
7474 for (value = 0; value < '0'; value++)
7475 ANYOF_BITMAP_SET(ret, value);
7476 for (value = '9' + 1; value < 256; value++)
7477 ANYOF_BITMAP_SET(ret, value);
7478 }
c49a72a9
NC
7479 yesno = '!';
7480 what = "Digit";
89836f1f 7481 break;
f81125e2
JP
7482 case ANYOF_MAX:
7483 /* this is to handle \p and \P */
7484 break;
73b437c8 7485 default:
b45f050a 7486 vFAIL("Invalid [::] class");
73b437c8 7487 break;
b8c5462f 7488 }
c49a72a9
NC
7489 if (what) {
7490 /* Strings such as "+utf8::isWord\n" */
7491 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7492 }
b8c5462f 7493 if (LOC)
936ed897 7494 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7495 continue;
a0d0e21e 7496 }
ffc61ed2
JH
7497 } /* end of namedclass \blah */
7498
a0d0e21e 7499 if (range) {
eb160463 7500 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7501 const int w = RExC_parse - rangebegin;
7502 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7503 range = 0; /* not a valid range */
73b437c8 7504 }
a0d0e21e
LW
7505 }
7506 else {
3568d838 7507 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7508 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7509 RExC_parse[1] != ']') {
7510 RExC_parse++;
ffc61ed2
JH
7511
7512 /* a bad range like \w-, [:word:]- ? */
7513 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7514 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7515 const int w =
afd78fd5
JH
7516 RExC_parse >= rangebegin ?
7517 RExC_parse - rangebegin : 0;
830247a4 7518 vWARN4(RExC_parse,
b45f050a 7519 "False [] range \"%*.*s\"",
097eb12c 7520 w, w, rangebegin);
afd78fd5 7521 }
73b437c8 7522 if (!SIZE_ONLY)
936ed897 7523 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7524 } else
ffc61ed2
JH
7525 range = 1; /* yeah, it's a range! */
7526 continue; /* but do it the next time */
a0d0e21e 7527 }
a687059c 7528 }
ffc61ed2 7529
93733859 7530 /* now is the next time */
07be1b83 7531 /*stored += (value - prevvalue + 1);*/
ae5c130c 7532 if (!SIZE_ONLY) {
3568d838 7533 if (prevvalue < 256) {
1df70142 7534 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7535 IV i;
3568d838 7536#ifdef EBCDIC
1b2d223b
JH
7537 /* In EBCDIC [\x89-\x91] should include
7538 * the \x8e but [i-j] should not. */
7539 if (literal_endpoint == 2 &&
7540 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7541 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7542 {
3568d838
JH
7543 if (isLOWER(prevvalue)) {
7544 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7545 if (isLOWER(i))
7546 ANYOF_BITMAP_SET(ret, i);
7547 } else {
3568d838 7548 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7549 if (isUPPER(i))
7550 ANYOF_BITMAP_SET(ret, i);
7551 }
8ada0baa 7552 }
ffc61ed2 7553 else
8ada0baa 7554#endif
07be1b83
YO
7555 for (i = prevvalue; i <= ceilvalue; i++) {
7556 if (!ANYOF_BITMAP_TEST(ret,i)) {
7557 stored++;
7558 ANYOF_BITMAP_SET(ret, i);
7559 }
7560 }
3568d838 7561 }
a5961de5 7562 if (value > 255 || UTF) {
1df70142
AL
7563 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7564 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7565 stored+=2; /* can't optimize this class */
ffc61ed2 7566 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7567 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7568 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7569 prevnatvalue, natvalue);
7570 }
7571 else if (prevnatvalue == natvalue) {
7572 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7573 if (FOLD) {
89ebb4a3 7574 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7575 STRLEN foldlen;
1df70142 7576 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7577
e294cc5d
JH
7578#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7579 if (RExC_precomp[0] == ':' &&
7580 RExC_precomp[1] == '[' &&
7581 (f == 0xDF || f == 0x92)) {
7582 f = NATIVE_TO_UNI(f);
7583 }
7584#endif
c840d2a2
JH
7585 /* If folding and foldable and a single
7586 * character, insert also the folded version
7587 * to the charclass. */
9e55ce06 7588 if (f != value) {
e294cc5d
JH
7589#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7590 if ((RExC_precomp[0] == ':' &&
7591 RExC_precomp[1] == '[' &&
7592 (f == 0xA2 &&
7593 (value == 0xFB05 || value == 0xFB06))) ?
7594 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7595 foldlen == (STRLEN)UNISKIP(f) )
7596#else
eb160463 7597 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7598#endif
9e55ce06
JH
7599 Perl_sv_catpvf(aTHX_ listsv,
7600 "%04"UVxf"\n", f);
7601 else {
7602 /* Any multicharacter foldings
7603 * require the following transform:
7604 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7605 * where E folds into "pq" and F folds
7606 * into "rst", all other characters
7607 * fold to single characters. We save
7608 * away these multicharacter foldings,
7609 * to be later saved as part of the
7610 * additional "s" data. */
7611 SV *sv;
7612
7613 if (!unicode_alternate)
7614 unicode_alternate = newAV();
7615 sv = newSVpvn((char*)foldbuf, foldlen);
7616 SvUTF8_on(sv);
7617 av_push(unicode_alternate, sv);
7618 }
7619 }
254ba52a 7620
60a8b682
JH
7621 /* If folding and the value is one of the Greek
7622 * sigmas insert a few more sigmas to make the
7623 * folding rules of the sigmas to work right.
7624 * Note that not all the possible combinations
7625 * are handled here: some of them are handled
9e55ce06
JH
7626 * by the standard folding rules, and some of
7627 * them (literal or EXACTF cases) are handled
7628 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7629 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7630 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7631 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7632 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7633 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7634 }
7635 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7636 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7637 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7638 }
7639 }
ffc61ed2 7640 }
1b2d223b
JH
7641#ifdef EBCDIC
7642 literal_endpoint = 0;
7643#endif
8ada0baa 7644 }
ffc61ed2
JH
7645
7646 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7647 }
ffc61ed2 7648
936ed897 7649 if (need_class) {
4f66b38d 7650 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7651 if (SIZE_ONLY)
830247a4 7652 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7653 else
830247a4 7654 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7655 }
ffc61ed2 7656
7f6f358c
YO
7657
7658 if (SIZE_ONLY)
7659 return ret;
7660 /****** !SIZE_ONLY AFTER HERE *********/
7661
7662 if( stored == 1 && value < 256
7663 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7664 ) {
7665 /* optimize single char class to an EXACT node
7666 but *only* when its not a UTF/high char */
07be1b83
YO
7667 const char * cur_parse= RExC_parse;
7668 RExC_emit = (regnode *)orig_emit;
7669 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7670 ret = reg_node(pRExC_state,
7671 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7672 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7673 *STRING(ret)= (char)value;
7674 STR_LEN(ret)= 1;
7675 RExC_emit += STR_SZ(1);
7676 return ret;
7677 }
ae5c130c 7678 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7679 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7680 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7681 ) {
a0ed51b3 7682 for (value = 0; value < 256; ++value) {
936ed897 7683 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7684 UV fold = PL_fold[value];
ffc61ed2
JH
7685
7686 if (fold != value)
7687 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7688 }
7689 }
936ed897 7690 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7691 }
ffc61ed2 7692
ae5c130c 7693 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7694 if (optimize_invert &&
ffc61ed2
JH
7695 /* If the only flag is inversion. */
7696 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7697 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7698 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7699 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7700 }
7f6f358c 7701 {
097eb12c 7702 AV * const av = newAV();
ffc61ed2 7703 SV *rv;
9e55ce06 7704 /* The 0th element stores the character class description
6a0407ee 7705 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7706 * to initialize the appropriate swash (which gets stored in
7707 * the 1st element), and also useful for dumping the regnode.
7708 * The 2nd element stores the multicharacter foldings,
6a0407ee 7709 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7710 av_store(av, 0, listsv);
7711 av_store(av, 1, NULL);
9e55ce06 7712 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7713 rv = newRV_noinc((SV*)av);
19860706 7714 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7715 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7716 ARG_SET(ret, n);
a0ed51b3 7717 }
a0ed51b3
LW
7718 return ret;
7719}
89836f1f
YO
7720#undef _C_C_T_
7721
a0ed51b3 7722
76e3520e 7723STATIC char*
830247a4 7724S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7725{
097eb12c 7726 char* const retval = RExC_parse++;
a0d0e21e 7727
4633a7c4 7728 for (;;) {
830247a4
IZ
7729 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7730 RExC_parse[2] == '#') {
e994fd66
AE
7731 while (*RExC_parse != ')') {
7732 if (RExC_parse == RExC_end)
7733 FAIL("Sequence (?#... not terminated");
830247a4 7734 RExC_parse++;
e994fd66 7735 }
830247a4 7736 RExC_parse++;
4633a7c4
LW
7737 continue;
7738 }
bbe252da 7739 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7740 if (isSPACE(*RExC_parse)) {
7741 RExC_parse++;
748a9306
LW
7742 continue;
7743 }
830247a4 7744 else if (*RExC_parse == '#') {
e994fd66
AE
7745 while (RExC_parse < RExC_end)
7746 if (*RExC_parse++ == '\n') break;
748a9306
LW
7747 continue;
7748 }
748a9306 7749 }
4633a7c4 7750 return retval;
a0d0e21e 7751 }
a687059c
LW
7752}
7753
7754/*
c277df42 7755- reg_node - emit a node
a0d0e21e 7756*/
76e3520e 7757STATIC regnode * /* Location. */
830247a4 7758S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7759{
97aff369 7760 dVAR;
c277df42 7761 register regnode *ptr;
504618e9 7762 regnode * const ret = RExC_emit;
07be1b83 7763 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7764
c277df42 7765 if (SIZE_ONLY) {
830247a4
IZ
7766 SIZE_ALIGN(RExC_size);
7767 RExC_size += 1;
a0d0e21e
LW
7768 return(ret);
7769 }
e2e6a0f1
YO
7770#ifdef DEBUGGING
7771 if (OP(RExC_emit) == 255)
7772 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7773 reg_name[op], OP(RExC_emit));
7774#endif
c277df42 7775 NODE_ALIGN_FILL(ret);
a0d0e21e 7776 ptr = ret;
c277df42 7777 FILL_ADVANCE_NODE(ptr, op);
7122b237 7778#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7779 if (RExC_offsets) { /* MJD */
07be1b83 7780 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7781 "reg_node", __LINE__,
7782 reg_name[op],
07be1b83
YO
7783 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7784 ? "Overwriting end of array!\n" : "OK",
7785 (UV)(RExC_emit - RExC_emit_start),
7786 (UV)(RExC_parse - RExC_start),
7787 (UV)RExC_offsets[0]));
ccb2c380 7788 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7789 }
7122b237 7790#endif
830247a4 7791 RExC_emit = ptr;
a0d0e21e 7792 return(ret);
a687059c
LW
7793}
7794
7795/*
a0d0e21e
LW
7796- reganode - emit a node with an argument
7797*/
76e3520e 7798STATIC regnode * /* Location. */
830247a4 7799S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7800{
97aff369 7801 dVAR;
c277df42 7802 register regnode *ptr;
504618e9 7803 regnode * const ret = RExC_emit;
07be1b83 7804 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7805
c277df42 7806 if (SIZE_ONLY) {
830247a4
IZ
7807 SIZE_ALIGN(RExC_size);
7808 RExC_size += 2;
6bda09f9
YO
7809 /*
7810 We can't do this:
7811
7812 assert(2==regarglen[op]+1);
7813
7814 Anything larger than this has to allocate the extra amount.
7815 If we changed this to be:
7816
7817 RExC_size += (1 + regarglen[op]);
7818
7819 then it wouldn't matter. Its not clear what side effect
7820 might come from that so its not done so far.
7821 -- dmq
7822 */
a0d0e21e
LW
7823 return(ret);
7824 }
e2e6a0f1
YO
7825#ifdef DEBUGGING
7826 if (OP(RExC_emit) == 255)
7827 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7828#endif
c277df42 7829 NODE_ALIGN_FILL(ret);
a0d0e21e 7830 ptr = ret;
c277df42 7831 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 7832#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7833 if (RExC_offsets) { /* MJD */
07be1b83 7834 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7835 "reganode",
ccb2c380
MP
7836 __LINE__,
7837 reg_name[op],
07be1b83 7838 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7839 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7840 (UV)(RExC_emit - RExC_emit_start),
7841 (UV)(RExC_parse - RExC_start),
7842 (UV)RExC_offsets[0]));
ccb2c380 7843 Set_Cur_Node_Offset;
fac92740 7844 }
7122b237 7845#endif
830247a4 7846 RExC_emit = ptr;
a0d0e21e 7847 return(ret);
fe14fcc3
LW
7848}
7849
7850/*
cd439c50 7851- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7852*/
71207a34
AL
7853STATIC STRLEN
7854S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7855{
97aff369 7856 dVAR;
71207a34 7857 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7858}
7859
7860/*
a0d0e21e
LW
7861- reginsert - insert an operator in front of already-emitted operand
7862*
7863* Means relocating the operand.
7864*/
76e3520e 7865STATIC void
6bda09f9 7866S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7867{
97aff369 7868 dVAR;
c277df42
IZ
7869 register regnode *src;
7870 register regnode *dst;
7871 register regnode *place;
504618e9 7872 const int offset = regarglen[(U8)op];
6bda09f9 7873 const int size = NODE_STEP_REGNODE + offset;
07be1b83 7874 GET_RE_DEBUG_FLAGS_DECL;
22c35a8c 7875/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 7876 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 7877 if (SIZE_ONLY) {
6bda09f9 7878 RExC_size += size;
a0d0e21e
LW
7879 return;
7880 }
a687059c 7881
830247a4 7882 src = RExC_emit;
6bda09f9 7883 RExC_emit += size;
830247a4 7884 dst = RExC_emit;
40d049e4 7885 if (RExC_open_parens) {
6bda09f9 7886 int paren;
6d99fb9b 7887 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 7888 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
7889 if ( RExC_open_parens[paren] >= opnd ) {
7890 DEBUG_PARSE_FMT("open"," - %d",size);
7891 RExC_open_parens[paren] += size;
7892 } else {
7893 DEBUG_PARSE_FMT("open"," - %s","ok");
7894 }
7895 if ( RExC_close_parens[paren] >= opnd ) {
7896 DEBUG_PARSE_FMT("close"," - %d",size);
7897 RExC_close_parens[paren] += size;
7898 } else {
7899 DEBUG_PARSE_FMT("close"," - %s","ok");
7900 }
7901 }
6bda09f9 7902 }
40d049e4 7903
fac92740 7904 while (src > opnd) {
c277df42 7905 StructCopy(--src, --dst, regnode);
7122b237 7906#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7907 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 7908 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 7909 "reg_insert",
ccb2c380
MP
7910 __LINE__,
7911 reg_name[op],
07be1b83
YO
7912 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
7913 ? "Overwriting end of array!\n" : "OK",
7914 (UV)(src - RExC_emit_start),
7915 (UV)(dst - RExC_emit_start),
7916 (UV)RExC_offsets[0]));
ccb2c380
MP
7917 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
7918 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 7919 }
7122b237 7920#endif
fac92740
MJD
7921 }
7922
a0d0e21e
LW
7923
7924 place = opnd; /* Op node, where operand used to be. */
7122b237 7925#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7926 if (RExC_offsets) { /* MJD */
07be1b83 7927 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7928 "reginsert",
ccb2c380
MP
7929 __LINE__,
7930 reg_name[op],
07be1b83 7931 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 7932 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
7933 (UV)(place - RExC_emit_start),
7934 (UV)(RExC_parse - RExC_start),
786e8c11 7935 (UV)RExC_offsets[0]));
ccb2c380 7936 Set_Node_Offset(place, RExC_parse);
45948336 7937 Set_Node_Length(place, 1);
fac92740 7938 }
7122b237 7939#endif
c277df42
IZ
7940 src = NEXTOPER(place);
7941 FILL_ADVANCE_NODE(place, op);
7942 Zero(src, offset, regnode);
a687059c
LW
7943}
7944
7945/*
c277df42 7946- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 7947- SEE ALSO: regtail_study
a0d0e21e 7948*/
097eb12c 7949/* TODO: All three parms should be const */
76e3520e 7950STATIC void
3dab1dad 7951S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 7952{
97aff369 7953 dVAR;
c277df42 7954 register regnode *scan;
72f13be8 7955 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
7956#ifndef DEBUGGING
7957 PERL_UNUSED_ARG(depth);
7958#endif
a0d0e21e 7959
c277df42 7960 if (SIZE_ONLY)
a0d0e21e
LW
7961 return;
7962
7963 /* Find last node. */
7964 scan = p;
7965 for (;;) {
504618e9 7966 regnode * const temp = regnext(scan);
3dab1dad
YO
7967 DEBUG_PARSE_r({
7968 SV * const mysv=sv_newmortal();
7969 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
7970 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
7971 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
7972 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
7973 (temp == NULL ? "->" : ""),
7974 (temp == NULL ? reg_name[OP(val)] : "")
7975 );
3dab1dad
YO
7976 });
7977 if (temp == NULL)
7978 break;
7979 scan = temp;
7980 }
7981
7982 if (reg_off_by_arg[OP(scan)]) {
7983 ARG_SET(scan, val - scan);
7984 }
7985 else {
7986 NEXT_OFF(scan) = val - scan;
7987 }
7988}
7989
07be1b83 7990#ifdef DEBUGGING
3dab1dad
YO
7991/*
7992- regtail_study - set the next-pointer at the end of a node chain of p to val.
7993- Look for optimizable sequences at the same time.
7994- currently only looks for EXACT chains.
07be1b83
YO
7995
7996This is expermental code. The idea is to use this routine to perform
7997in place optimizations on branches and groups as they are constructed,
7998with the long term intention of removing optimization from study_chunk so
7999that it is purely analytical.
8000
8001Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8002to control which is which.
8003
3dab1dad
YO
8004*/
8005/* TODO: All four parms should be const */
07be1b83 8006
3dab1dad
YO
8007STATIC U8
8008S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8009{
8010 dVAR;
8011 register regnode *scan;
07be1b83
YO
8012 U8 exact = PSEUDO;
8013#ifdef EXPERIMENTAL_INPLACESCAN
8014 I32 min = 0;
8015#endif
8016
3dab1dad
YO
8017 GET_RE_DEBUG_FLAGS_DECL;
8018
07be1b83 8019
3dab1dad
YO
8020 if (SIZE_ONLY)
8021 return exact;
8022
8023 /* Find last node. */
8024
8025 scan = p;
8026 for (;;) {
8027 regnode * const temp = regnext(scan);
07be1b83
YO
8028#ifdef EXPERIMENTAL_INPLACESCAN
8029 if (PL_regkind[OP(scan)] == EXACT)
8030 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8031 return EXACT;
8032#endif
3dab1dad
YO
8033 if ( exact ) {
8034 switch (OP(scan)) {
8035 case EXACT:
8036 case EXACTF:
8037 case EXACTFL:
8038 if( exact == PSEUDO )
8039 exact= OP(scan);
07be1b83
YO
8040 else if ( exact != OP(scan) )
8041 exact= 0;
3dab1dad
YO
8042 case NOTHING:
8043 break;
8044 default:
8045 exact= 0;
8046 }
8047 }
8048 DEBUG_PARSE_r({
8049 SV * const mysv=sv_newmortal();
8050 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8051 regprop(RExC_rx, mysv, scan);
eaf3ca90 8052 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8053 SvPV_nolen_const(mysv),
eaf3ca90
YO
8054 REG_NODE_NUM(scan),
8055 reg_name[exact]);
3dab1dad 8056 });
a0d0e21e
LW
8057 if (temp == NULL)
8058 break;
8059 scan = temp;
8060 }
07be1b83
YO
8061 DEBUG_PARSE_r({
8062 SV * const mysv_val=sv_newmortal();
8063 DEBUG_PARSE_MSG("");
8064 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8065 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8066 SvPV_nolen_const(mysv_val),
8067 (IV)REG_NODE_NUM(val),
8068 (IV)(val - scan)
07be1b83
YO
8069 );
8070 });
c277df42
IZ
8071 if (reg_off_by_arg[OP(scan)]) {
8072 ARG_SET(scan, val - scan);
a0ed51b3
LW
8073 }
8074 else {
c277df42
IZ
8075 NEXT_OFF(scan) = val - scan;
8076 }
3dab1dad
YO
8077
8078 return exact;
a687059c 8079}
07be1b83 8080#endif
a687059c
LW
8081
8082/*
a687059c
LW
8083 - regcurly - a little FSA that accepts {\d+,?\d*}
8084 */
79072805 8085STATIC I32
5f66b61c 8086S_regcurly(register const char *s)
a687059c
LW
8087{
8088 if (*s++ != '{')
8089 return FALSE;
f0fcb552 8090 if (!isDIGIT(*s))
a687059c 8091 return FALSE;
f0fcb552 8092 while (isDIGIT(*s))
a687059c
LW
8093 s++;
8094 if (*s == ',')
8095 s++;
f0fcb552 8096 while (isDIGIT(*s))
a687059c
LW
8097 s++;
8098 if (*s != '}')
8099 return FALSE;
8100 return TRUE;
8101}
8102
a687059c
LW
8103
8104/*
fd181c75 8105 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8106 */
8107void
097eb12c 8108Perl_regdump(pTHX_ const regexp *r)
a687059c 8109{
35ff7856 8110#ifdef DEBUGGING
97aff369 8111 dVAR;
c445ea15 8112 SV * const sv = sv_newmortal();
ab3bbdeb 8113 SV *dsv= sv_newmortal();
f8fc2ecf 8114 RXi_GET_DECL(r,ri);
a687059c 8115
f8fc2ecf 8116 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8117
8118 /* Header fields of interest. */
ab3bbdeb
YO
8119 if (r->anchored_substr) {
8120 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8121 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8122 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8123 "anchored %s%s at %"IVdf" ",
8124 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8125 (IV)r->anchored_offset);
ab3bbdeb
YO
8126 } else if (r->anchored_utf8) {
8127 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8128 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8129 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8130 "anchored utf8 %s%s at %"IVdf" ",
8131 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8132 (IV)r->anchored_offset);
ab3bbdeb
YO
8133 }
8134 if (r->float_substr) {
8135 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8136 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8137 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8138 "floating %s%s at %"IVdf"..%"UVuf" ",
8139 s, RE_SV_TAIL(r->float_substr),
7b0972df 8140 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8141 } else if (r->float_utf8) {
8142 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8143 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8144 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8145 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8146 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8147 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8148 }
33b8afdf 8149 if (r->check_substr || r->check_utf8)
b81d288d 8150 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8151 (const char *)
8152 (r->check_substr == r->float_substr
8153 && r->check_utf8 == r->float_utf8
8154 ? "(checking floating" : "(checking anchored"));
bbe252da 8155 if (r->extflags & RXf_NOSCAN)
c277df42 8156 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8157 if (r->extflags & RXf_CHECK_ALL)
c277df42 8158 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8159 if (r->check_substr || r->check_utf8)
c277df42
IZ
8160 PerlIO_printf(Perl_debug_log, ") ");
8161
f8fc2ecf
YO
8162 if (ri->regstclass) {
8163 regprop(r, sv, ri->regstclass);
1de06328 8164 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8165 }
bbe252da 8166 if (r->extflags & RXf_ANCH) {
774d564b 8167 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8168 if (r->extflags & RXf_ANCH_BOL)
774d564b 8169 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8170 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8171 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8172 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8173 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8174 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8175 PerlIO_printf(Perl_debug_log, "(GPOS)");
8176 PerlIO_putc(Perl_debug_log, ' ');
8177 }
bbe252da 8178 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8179 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8180 if (r->intflags & PREGf_SKIP)
760ac839 8181 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8182 if (r->intflags & PREGf_IMPLICIT)
760ac839 8183 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8184 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8185 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8186 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8187 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8188#else
96a5add6 8189 PERL_UNUSED_CONTEXT;
65e66c80 8190 PERL_UNUSED_ARG(r);
17c3b450 8191#endif /* DEBUGGING */
a687059c
LW
8192}
8193
8194/*
a0d0e21e
LW
8195- regprop - printable representation of opcode
8196*/
46fc3d4c 8197void
32fc9b6a 8198Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8199{
35ff7856 8200#ifdef DEBUGGING
97aff369 8201 dVAR;
9b155405 8202 register int k;
f8fc2ecf 8203 RXi_GET_DECL(prog,progi);
1de06328 8204 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8205
a0d0e21e 8206
54dc92de 8207 sv_setpvn(sv, "", 0);
8aa23a47 8208
03363afd 8209 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8210 /* It would be nice to FAIL() here, but this may be called from
8211 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8212 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8213 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8214
3dab1dad 8215 k = PL_regkind[OP(o)];
9b155405 8216
2a782b5b 8217 if (k == EXACT) {
396482e1 8218 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8219 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8220 * is a crude hack but it may be the best for now since
8221 * we have no flag "this EXACTish node was UTF-8"
8222 * --jhi */
8223 const char * const s =
ddc5bc0f 8224 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8225 PL_colors[0], PL_colors[1],
8226 PERL_PV_ESCAPE_UNI_DETECT |
8227 PERL_PV_PRETTY_ELIPSES |
8228 PERL_PV_PRETTY_LTGT
8229 );
8230 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8231 } else if (k == TRIE) {
3dab1dad 8232 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8233 * progi->data isn't available here */
1de06328 8234 const char op = OP(o);
647f639f 8235 const U32 n = ARG(o);
1de06328 8236 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8237 (reg_ac_data *)progi->data->data[n] :
1de06328 8238 NULL;
3251b653
NC
8239 const reg_trie_data * const trie
8240 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8241
8242 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8243 DEBUG_TRIE_COMPILE_r(
8244 Perl_sv_catpvf(aTHX_ sv,
8245 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8246 (UV)trie->startstate,
1e2e3d02 8247 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8248 (UV)trie->wordcount,
8249 (UV)trie->minlen,
8250 (UV)trie->maxlen,
8251 (UV)TRIE_CHARCOUNT(trie),
8252 (UV)trie->uniquecharcount
8253 )
8254 );
8255 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8256 int i;
8257 int rangestart = -1;
f46cb337 8258 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8259 Perl_sv_catpvf(aTHX_ sv, "[");
8260 for (i = 0; i <= 256; i++) {
8261 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8262 if (rangestart == -1)
8263 rangestart = i;
8264 } else if (rangestart != -1) {
8265 if (i <= rangestart + 3)
8266 for (; rangestart < i; rangestart++)
8267 put_byte(sv, rangestart);
8268 else {
8269 put_byte(sv, rangestart);
8270 sv_catpvs(sv, "-");
8271 put_byte(sv, i - 1);
8272 }
8273 rangestart = -1;
8274 }
8275 }
8276 Perl_sv_catpvf(aTHX_ sv, "]");
8277 }
8278
a3621e74 8279 } else if (k == CURLY) {
cb434fcc 8280 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8281 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8282 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8283 }
2c2d71f5
JH
8284 else if (k == WHILEM && o->flags) /* Ordinal/of */
8285 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8286 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8287 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1f1031fe 8288 if ( prog->paren_names ) {
ee9b8eae
YO
8289 if ( k != REF || OP(o) < NREF) {
8290 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8291 SV **name= av_fetch(list, ARG(o), 0 );
8292 if (name)
8293 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8294 }
8295 else {
8296 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8297 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8298 I32 *nums=(I32*)SvPVX(sv_dat);
8299 SV **name= av_fetch(list, nums[0], 0 );
8300 I32 n;
8301 if (name) {
8302 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8303 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8304 (n ? "," : ""), (IV)nums[n]);
8305 }
8306 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 8307 }
1f1031fe 8308 }
ee9b8eae 8309 }
1f1031fe 8310 } else if (k == GOSUB)
6bda09f9 8311 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8312 else if (k == VERB) {
8313 if (!o->flags)
8314 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8315 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8316 } else if (k == LOGICAL)
04ebc1ab 8317 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8318 else if (k == ANYOF) {
8319 int i, rangestart = -1;
2d03de9c 8320 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8321
8322 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8323 static const char * const anyofs[] = {
653099ff
GS
8324 "\\w",
8325 "\\W",
8326 "\\s",
8327 "\\S",
8328 "\\d",
8329 "\\D",
8330 "[:alnum:]",
8331 "[:^alnum:]",
8332 "[:alpha:]",
8333 "[:^alpha:]",
8334 "[:ascii:]",
8335 "[:^ascii:]",
8336 "[:ctrl:]",
8337 "[:^ctrl:]",
8338 "[:graph:]",
8339 "[:^graph:]",
8340 "[:lower:]",
8341 "[:^lower:]",
8342 "[:print:]",
8343 "[:^print:]",
8344 "[:punct:]",
8345 "[:^punct:]",
8346 "[:upper:]",
aaa51d5e 8347 "[:^upper:]",
653099ff 8348 "[:xdigit:]",
aaa51d5e
JF
8349 "[:^xdigit:]",
8350 "[:space:]",
8351 "[:^space:]",
8352 "[:blank:]",
8353 "[:^blank:]"
653099ff
GS
8354 };
8355
19860706 8356 if (flags & ANYOF_LOCALE)
396482e1 8357 sv_catpvs(sv, "{loc}");
19860706 8358 if (flags & ANYOF_FOLD)
396482e1 8359 sv_catpvs(sv, "{i}");
653099ff 8360 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8361 if (flags & ANYOF_INVERT)
396482e1 8362 sv_catpvs(sv, "^");
ffc61ed2
JH
8363 for (i = 0; i <= 256; i++) {
8364 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8365 if (rangestart == -1)
8366 rangestart = i;
8367 } else if (rangestart != -1) {
8368 if (i <= rangestart + 3)
8369 for (; rangestart < i; rangestart++)
653099ff 8370 put_byte(sv, rangestart);
ffc61ed2
JH
8371 else {
8372 put_byte(sv, rangestart);
396482e1 8373 sv_catpvs(sv, "-");
ffc61ed2 8374 put_byte(sv, i - 1);
653099ff 8375 }
ffc61ed2 8376 rangestart = -1;
653099ff 8377 }
847a199f 8378 }
ffc61ed2
JH
8379
8380 if (o->flags & ANYOF_CLASS)
bb7a0f54 8381 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8382 if (ANYOF_CLASS_TEST(o,i))
8383 sv_catpv(sv, anyofs[i]);
8384
8385 if (flags & ANYOF_UNICODE)
396482e1 8386 sv_catpvs(sv, "{unicode}");
1aa99e6b 8387 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8388 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8389
8390 {
8391 SV *lv;
32fc9b6a 8392 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8393
ffc61ed2
JH
8394 if (lv) {
8395 if (sw) {
89ebb4a3 8396 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8397
ffc61ed2 8398 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8399 uvchr_to_utf8(s, i);
ffc61ed2 8400
3568d838 8401 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8402 if (rangestart == -1)
8403 rangestart = i;
8404 } else if (rangestart != -1) {
ffc61ed2
JH
8405 if (i <= rangestart + 3)
8406 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8407 const U8 * const e = uvchr_to_utf8(s,rangestart);
8408 U8 *p;
8409 for(p = s; p < e; p++)
ffc61ed2
JH
8410 put_byte(sv, *p);
8411 }
8412 else {
2d03de9c
AL
8413 const U8 *e = uvchr_to_utf8(s,rangestart);
8414 U8 *p;
8415 for (p = s; p < e; p++)
ffc61ed2 8416 put_byte(sv, *p);
396482e1 8417 sv_catpvs(sv, "-");
2d03de9c
AL
8418 e = uvchr_to_utf8(s, i-1);
8419 for (p = s; p < e; p++)
1df70142 8420 put_byte(sv, *p);
ffc61ed2
JH
8421 }
8422 rangestart = -1;
8423 }
19860706 8424 }
ffc61ed2 8425
396482e1 8426 sv_catpvs(sv, "..."); /* et cetera */
19860706 8427 }
fde631ed 8428
ffc61ed2 8429 {
2e0de35c 8430 char *s = savesvpv(lv);
c445ea15 8431 char * const origs = s;
b81d288d 8432
3dab1dad
YO
8433 while (*s && *s != '\n')
8434 s++;
b81d288d 8435
ffc61ed2 8436 if (*s == '\n') {
2d03de9c 8437 const char * const t = ++s;
ffc61ed2
JH
8438
8439 while (*s) {
8440 if (*s == '\n')
8441 *s = ' ';
8442 s++;
8443 }
8444 if (s[-1] == ' ')
8445 s[-1] = 0;
8446
8447 sv_catpv(sv, t);
fde631ed 8448 }
b81d288d 8449
ffc61ed2 8450 Safefree(origs);
fde631ed
JH
8451 }
8452 }
653099ff 8453 }
ffc61ed2 8454
653099ff
GS
8455 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8456 }
9b155405 8457 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8458 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8459#else
96a5add6 8460 PERL_UNUSED_CONTEXT;
65e66c80
SP
8461 PERL_UNUSED_ARG(sv);
8462 PERL_UNUSED_ARG(o);
f9049ba1 8463 PERL_UNUSED_ARG(prog);
17c3b450 8464#endif /* DEBUGGING */
35ff7856 8465}
a687059c 8466
cad2e5aa
JH
8467SV *
8468Perl_re_intuit_string(pTHX_ regexp *prog)
8469{ /* Assume that RE_INTUIT is set */
97aff369 8470 dVAR;
a3621e74 8471 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8472 PERL_UNUSED_CONTEXT;
8473
a3621e74 8474 DEBUG_COMPILE_r(
cfd0369c 8475 {
2d03de9c 8476 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8477 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8478
8479 if (!PL_colorset) reginitcolors();
8480 PerlIO_printf(Perl_debug_log,
a0288114 8481 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8482 PL_colors[4],
8483 prog->check_substr ? "" : "utf8 ",
8484 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8485 s,
8486 PL_colors[1],
8487 (strlen(s) > 60 ? "..." : ""));
8488 } );
8489
33b8afdf 8490 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8491}
8492
84da74a7 8493/*
f8149455 8494 pregfree()
84da74a7 8495
f8149455
YO
8496 handles refcounting and freeing the perl core regexp structure. When
8497 it is necessary to actually free the structure the first thing it
8498 does is call the 'free' method of the regexp_engine associated to to
8499 the regexp, allowing the handling of the void *pprivate; member
8500 first. (This routine is not overridable by extensions, which is why
8501 the extensions free is called first.)
8502
8503 See regdupe and regdupe_internal if you change anything here.
84da74a7 8504*/
f8149455 8505#ifndef PERL_IN_XSUB_RE
2b69d0c2 8506void
864dbfa3 8507Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8508{
27da23d5 8509 dVAR;
fc32ee4a 8510 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8511
7821416a
IZ
8512 if (!r || (--r->refcnt > 0))
8513 return;
f8149455
YO
8514
8515 CALLREGFREE_PVT(r); /* free the private data */
8516
43c5f42d
NC
8517 /* gcov results gave these as non-null 100% of the time, so there's no
8518 optimisation in checking them before calling Safefree */
8519 Safefree(r->precomp);
ed252734 8520 RX_MATCH_COPY_FREE(r);
f8c7b90f 8521#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8522 if (r->saved_copy)
8523 SvREFCNT_dec(r->saved_copy);
8524#endif
a193d654
GS
8525 if (r->substrs) {
8526 if (r->anchored_substr)
8527 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8528 if (r->anchored_utf8)
8529 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8530 if (r->float_substr)
8531 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8532 if (r->float_utf8)
8533 SvREFCNT_dec(r->float_utf8);
2779dcf1 8534 Safefree(r->substrs);
a193d654 8535 }
81714fb9
YO
8536 if (r->paren_names)
8537 SvREFCNT_dec(r->paren_names);
f8149455
YO
8538
8539 Safefree(r->startp);
8540 Safefree(r->endp);
8541 Safefree(r);
8542}
8543#endif
8544
8545/* regfree_internal()
8546
8547 Free the private data in a regexp. This is overloadable by
8548 extensions. Perl takes care of the regexp structure in pregfree(),
8549 this covers the *pprivate pointer which technically perldoesnt
8550 know about, however of course we have to handle the
8551 regexp_internal structure when no extension is in use.
8552
8553 Note this is called before freeing anything in the regexp
8554 structure.
8555 */
8556
8557void
8558Perl_regfree_internal(pTHX_ struct regexp *r)
8559{
8560 dVAR;
8561 RXi_GET_DECL(r,ri);
8562 GET_RE_DEBUG_FLAGS_DECL;
8563
8564 DEBUG_COMPILE_r({
8565 if (!PL_colorset)
8566 reginitcolors();
8567 {
8568 SV *dsv= sv_newmortal();
8569 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8570 dsv, r->precomp, r->prelen, 60);
8571 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8572 PL_colors[4],PL_colors[5],s);
8573 }
8574 });
7122b237
YO
8575#ifdef RE_TRACK_PATTERN_OFFSETS
8576 if (ri->u.offsets)
8577 Safefree(ri->u.offsets); /* 20010421 MJD */
8578#endif
f8fc2ecf
YO
8579 if (ri->data) {
8580 int n = ri->data->count;
f3548bdc
DM
8581 PAD* new_comppad = NULL;
8582 PAD* old_comppad;
4026c95a 8583 PADOFFSET refcnt;
dfad63ad 8584
c277df42 8585 while (--n >= 0) {
261faec3 8586 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8587 switch (ri->data->what[n]) {
c277df42 8588 case 's':
81714fb9 8589 case 'S':
55eed653 8590 case 'u':
f8fc2ecf 8591 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8592 break;
653099ff 8593 case 'f':
f8fc2ecf 8594 Safefree(ri->data->data[n]);
653099ff 8595 break;
dfad63ad 8596 case 'p':
f8fc2ecf 8597 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8598 break;
c277df42 8599 case 'o':
dfad63ad 8600 if (new_comppad == NULL)
cea2e8a9 8601 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8602 PAD_SAVE_LOCAL(old_comppad,
8603 /* Watch out for global destruction's random ordering. */
c445ea15 8604 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8605 );
b34c0dd4 8606 OP_REFCNT_LOCK;
f8fc2ecf 8607 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8608 OP_REFCNT_UNLOCK;
8609 if (!refcnt)
f8fc2ecf 8610 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8611
f3548bdc 8612 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8613 SvREFCNT_dec((SV*)new_comppad);
8614 new_comppad = NULL;
c277df42
IZ
8615 break;
8616 case 'n':
9e55ce06 8617 break;
07be1b83 8618 case 'T':
be8e71aa
YO
8619 { /* Aho Corasick add-on structure for a trie node.
8620 Used in stclass optimization only */
07be1b83 8621 U32 refcount;
f8fc2ecf 8622 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8623 OP_REFCNT_LOCK;
8624 refcount = --aho->refcount;
8625 OP_REFCNT_UNLOCK;
8626 if ( !refcount ) {
446bd890
NC
8627 PerlMemShared_free(aho->states);
8628 PerlMemShared_free(aho->fail);
446bd890
NC
8629 /* do this last!!!! */
8630 PerlMemShared_free(ri->data->data[n]);
8631 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8632 }
8633 }
8634 break;
a3621e74 8635 case 't':
07be1b83 8636 {
be8e71aa 8637 /* trie structure. */
07be1b83 8638 U32 refcount;
f8fc2ecf 8639 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8640 OP_REFCNT_LOCK;
8641 refcount = --trie->refcount;
8642 OP_REFCNT_UNLOCK;
8643 if ( !refcount ) {
446bd890 8644 PerlMemShared_free(trie->charmap);
446bd890
NC
8645 PerlMemShared_free(trie->states);
8646 PerlMemShared_free(trie->trans);
07be1b83 8647 if (trie->bitmap)
446bd890 8648 PerlMemShared_free(trie->bitmap);
07be1b83 8649 if (trie->wordlen)
446bd890 8650 PerlMemShared_free(trie->wordlen);
786e8c11 8651 if (trie->jump)
446bd890 8652 PerlMemShared_free(trie->jump);
786e8c11 8653 if (trie->nextword)
446bd890 8654 PerlMemShared_free(trie->nextword);
446bd890
NC
8655 /* do this last!!!! */
8656 PerlMemShared_free(ri->data->data[n]);
a3621e74 8657 }
07be1b83
YO
8658 }
8659 break;
c277df42 8660 default:
f8fc2ecf 8661 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8662 }
8663 }
f8fc2ecf
YO
8664 Safefree(ri->data->what);
8665 Safefree(ri->data);
a0d0e21e 8666 }
f8fc2ecf
YO
8667 if (ri->swap) {
8668 Safefree(ri->swap->startp);
8669 Safefree(ri->swap->endp);
8670 Safefree(ri->swap);
c74340f9 8671 }
f8fc2ecf 8672 Safefree(ri);
a687059c 8673}
c277df42 8674
84da74a7
YO
8675#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8676#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8677#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8678#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8679
8680/*
8681 regdupe - duplicate a regexp.
8682
8683 This routine is called by sv.c's re_dup and is expected to clone a
8684 given regexp structure. It is a no-op when not under USE_ITHREADS.
8685 (Originally this *was* re_dup() for change history see sv.c)
8686
f8149455
YO
8687 After all of the core data stored in struct regexp is duplicated
8688 the regexp_engine.dupe method is used to copy any private data
8689 stored in the *pprivate pointer. This allows extensions to handle
8690 any duplication it needs to do.
8691
8692 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8693*/
a3c0e9ca 8694#if defined(USE_ITHREADS)
f8149455 8695#ifndef PERL_IN_XSUB_RE
84da74a7 8696regexp *
f8149455 8697Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8698{
84da74a7 8699 dVAR;
f8fc2ecf 8700 regexp *ret;
f8149455 8701 int i, npar;
84da74a7 8702 struct reg_substr_datum *s;
644c02aa 8703
84da74a7
YO
8704 if (!r)
8705 return (REGEXP *)NULL;
8706
8707 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8708 return ret;
8709
f8149455 8710
84da74a7 8711 npar = r->nparens+1;
f8fc2ecf 8712 Newxz(ret, 1, regexp);
84da74a7
YO
8713 Newx(ret->startp, npar, I32);
8714 Copy(r->startp, ret->startp, npar, I32);
8715 Newx(ret->endp, npar, I32);
f8149455 8716 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8717
c945c181 8718 if (r->substrs) {
785a26d5
YO
8719 Newx(ret->substrs, 1, struct reg_substr_data);
8720 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8721 s->min_offset = r->substrs->data[i].min_offset;
8722 s->max_offset = r->substrs->data[i].max_offset;
8723 s->end_shift = r->substrs->data[i].end_shift;
8724 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8725 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8726 }
8727 } else
8728 ret->substrs = NULL;
f8149455
YO
8729
8730 ret->precomp = SAVEPVN(r->precomp, r->prelen);
8731 ret->refcnt = r->refcnt;
8732 ret->minlen = r->minlen;
8733 ret->minlenret = r->minlenret;
8734 ret->prelen = r->prelen;
8735 ret->nparens = r->nparens;
8736 ret->lastparen = r->lastparen;
8737 ret->lastcloseparen = r->lastcloseparen;
8738 ret->intflags = r->intflags;
8739 ret->extflags = r->extflags;
8740
8741 ret->sublen = r->sublen;
8742
8743 ret->engine = r->engine;
8744
8745 ret->paren_names = hv_dup_inc(r->paren_names, param);
8746
8747 if (RX_MATCH_COPIED(ret))
8748 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8749 else
8750 ret->subbeg = NULL;
8751#ifdef PERL_OLD_COPY_ON_WRITE
8752 ret->saved_copy = NULL;
8753#endif
8754
8755 ret->pprivate = r->pprivate;
785a26d5
YO
8756 if (ret->pprivate)
8757 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8758
8759 ptr_table_store(PL_ptr_table, r, ret);
8760 return ret;
8761}
8762#endif /* PERL_IN_XSUB_RE */
8763
8764/*
8765 regdupe_internal()
8766
8767 This is the internal complement to regdupe() which is used to copy
8768 the structure pointed to by the *pprivate pointer in the regexp.
8769 This is the core version of the extension overridable cloning hook.
8770 The regexp structure being duplicated will be copied by perl prior
8771 to this and will be provided as the regexp *r argument, however
8772 with the /old/ structures pprivate pointer value. Thus this routine
8773 may override any copying normally done by perl.
8774
8775 It returns a pointer to the new regexp_internal structure.
8776*/
8777
8778void *
8779Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8780{
8781 dVAR;
8782 regexp_internal *reti;
8783 int len, npar;
8784 RXi_GET_DECL(r,ri);
8785
8786 npar = r->nparens+1;
7122b237 8787 len = ProgLen(ri);
f8149455
YO
8788
8789 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8790 Copy(ri->program, reti->program, len+1, regnode);
8791
8792 if(ri->swap) {
8793 Newx(reti->swap, 1, regexp_paren_ofs);
8794 /* no need to copy these */
8795 Newx(reti->swap->startp, npar, I32);
8796 Newx(reti->swap->endp, npar, I32);
8797 } else {
8798 reti->swap = NULL;
8799 }
8800
84da74a7 8801
f8fc2ecf
YO
8802 reti->regstclass = NULL;
8803 if (ri->data) {
84da74a7 8804 struct reg_data *d;
f8fc2ecf 8805 const int count = ri->data->count;
84da74a7
YO
8806 int i;
8807
8808 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8809 char, struct reg_data);
8810 Newx(d->what, count, U8);
8811
8812 d->count = count;
8813 for (i = 0; i < count; i++) {
f8fc2ecf 8814 d->what[i] = ri->data->what[i];
84da74a7 8815 switch (d->what[i]) {
55eed653 8816 /* legal options are one of: sSfpontTu
84da74a7
YO
8817 see also regcomp.h and pregfree() */
8818 case 's':
81714fb9 8819 case 'S':
0536c0a7 8820 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8821 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8822 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8823 break;
84da74a7
YO
8824 case 'f':
8825 /* This is cheating. */
8826 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8827 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8828 struct regnode_charclass_class);
f8fc2ecf 8829 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8830 break;
8831 case 'o':
bbe252da
YO
8832 /* Compiled op trees are readonly and in shared memory,
8833 and can thus be shared without duplication. */
84da74a7 8834 OP_REFCNT_LOCK;
f8fc2ecf 8835 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8836 OP_REFCNT_UNLOCK;
8837 break;
23eab42c
NC
8838 case 'T':
8839 /* Trie stclasses are readonly and can thus be shared
8840 * without duplication. We free the stclass in pregfree
8841 * when the corresponding reg_ac_data struct is freed.
8842 */
8843 reti->regstclass= ri->regstclass;
8844 /* Fall through */
84da74a7 8845 case 't':
84da74a7 8846 OP_REFCNT_LOCK;
0536c0a7 8847 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8848 OP_REFCNT_UNLOCK;
0536c0a7
NC
8849 /* Fall through */
8850 case 'n':
8851 d->data[i] = ri->data->data[i];
84da74a7 8852 break;
84da74a7 8853 default:
f8fc2ecf 8854 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8855 }
8856 }
8857
f8fc2ecf 8858 reti->data = d;
84da74a7
YO
8859 }
8860 else
f8fc2ecf 8861 reti->data = NULL;
84da74a7 8862
cde0cee5
YO
8863 reti->name_list_idx = ri->name_list_idx;
8864
7122b237
YO
8865#ifdef RE_TRACK_PATTERN_OFFSETS
8866 if (ri->u.offsets) {
8867 Newx(reti->u.offsets, 2*len+1, U32);
8868 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
8869 }
8870#else
8871 SetProgLen(reti,len);
8872#endif
8873
f8149455 8874 return (void*)reti;
84da74a7 8875}
f8149455
YO
8876
8877#endif /* USE_ITHREADS */
84da74a7 8878
de8c5301
YO
8879/*
8880 reg_stringify()
8881
8882 converts a regexp embedded in a MAGIC struct to its stringified form,
8883 caching the converted form in the struct and returns the cached
8884 string.
8885
8886 If lp is nonnull then it is used to return the length of the
8887 resulting string
8888
8889 If flags is nonnull and the returned string contains UTF8 then
f8149455 8890 (*flags & 1) will be true.
de8c5301
YO
8891
8892 If haseval is nonnull then it is used to return whether the pattern
8893 contains evals.
8894
8895 Normally called via macro:
8896
f8149455 8897 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
8898
8899 And internally with
8900
f8149455 8901 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
8902
8903 See sv_2pv_flags() in sv.c for an example of internal usage.
8904
8905 */
f8149455 8906#ifndef PERL_IN_XSUB_RE
de8c5301
YO
8907char *
8908Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
8909 dVAR;
8910 const regexp * const re = (regexp *)mg->mg_obj;
f8149455 8911
de8c5301 8912 if (!mg->mg_ptr) {
a20207d7 8913 const char *fptr = STD_PAT_MODS; /*"msix"*/
cde0cee5 8914 char reflags[7];
de8c5301 8915 char ch;
cde0cee5
YO
8916 bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
8917 bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
8918 U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12);
8919 bool need_newline = 0;
8920 int left = 0;
8921 int right = 4 + hask;
8922 if (hask)
a20207d7 8923 reflags[left++] = KEEPCOPY_PAT_MOD; /*'k'*/
de8c5301
YO
8924 while((ch = *fptr++)) {
8925 if(reganch & 1) {
8926 reflags[left++] = ch;
8927 }
8928 else {
8929 reflags[right--] = ch;
8930 }
8931 reganch >>= 1;
8932 }
cde0cee5 8933 if(hasm) {
de8c5301 8934 reflags[left] = '-';
cde0cee5 8935 left = 5 + hask;
de8c5301 8936 }
cde0cee5 8937 /* printf("[%*.7s]\n",left,reflags); */
de8c5301
YO
8938 mg->mg_len = re->prelen + 4 + left;
8939 /*
8940 * If /x was used, we have to worry about a regex ending with a
8941 * comment later being embedded within another regex. If so, we don't
8942 * want this regex's "commentization" to leak out to the right part of
8943 * the enclosing regex, we must cap it with a newline.
8944 *
8945 * So, if /x was used, we scan backwards from the end of the regex. If
8946 * we find a '#' before we find a newline, we need to add a newline
8947 * ourself. If we find a '\n' first (or if we don't find '#' or '\n'),
8948 * we don't need to add anything. -jfriedl
8949 */
bbe252da 8950 if (PMf_EXTENDED & re->extflags) {
de8c5301
YO
8951 const char *endptr = re->precomp + re->prelen;
8952 while (endptr >= re->precomp) {
8953 const char c = *(endptr--);
8954 if (c == '\n')
8955 break; /* don't need another */
8956 if (c == '#') {
8957 /* we end while in a comment, so we need a newline */
8958 mg->mg_len++; /* save space for it */
8959 need_newline = 1; /* note to add it */
8960 break;
8961 }
8962 }
8963 }
8964
8965 Newx(mg->mg_ptr, mg->mg_len + 1 + left, char);
8966 mg->mg_ptr[0] = '(';
8967 mg->mg_ptr[1] = '?';
8968 Copy(reflags, mg->mg_ptr+2, left, char);
8969 *(mg->mg_ptr+left+2) = ':';
8970 Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char);
8971 if (need_newline)
8972 mg->mg_ptr[mg->mg_len - 2] = '\n';
8973 mg->mg_ptr[mg->mg_len - 1] = ')';
8974 mg->mg_ptr[mg->mg_len] = 0;
8975 }
8976 if (haseval)
f8149455 8977 *haseval = re->seen_evals;
de8c5301 8978 if (flags)
bbe252da 8979 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301
YO
8980
8981 if (lp)
8982 *lp = mg->mg_len;
8983 return mg->mg_ptr;
8984}
8985
c277df42
IZ
8986/*
8987 - regnext - dig the "next" pointer out of a node
c277df42
IZ
8988 */
8989regnode *
864dbfa3 8990Perl_regnext(pTHX_ register regnode *p)
c277df42 8991{
97aff369 8992 dVAR;
c277df42
IZ
8993 register I32 offset;
8994
f8fc2ecf 8995 if (!p)
c277df42
IZ
8996 return(NULL);
8997
8998 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
8999 if (offset == 0)
9000 return(NULL);
9001
c277df42 9002 return(p+offset);
c277df42 9003}
76234dfb 9004#endif
c277df42 9005
01f988be 9006STATIC void
cea2e8a9 9007S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9008{
9009 va_list args;
9010 STRLEN l1 = strlen(pat1);
9011 STRLEN l2 = strlen(pat2);
9012 char buf[512];
06bf62c7 9013 SV *msv;
73d840c0 9014 const char *message;
c277df42
IZ
9015
9016 if (l1 > 510)
9017 l1 = 510;
9018 if (l1 + l2 > 510)
9019 l2 = 510 - l1;
9020 Copy(pat1, buf, l1 , char);
9021 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9022 buf[l1 + l2] = '\n';
9023 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9024#ifdef I_STDARG
9025 /* ANSI variant takes additional second argument */
c277df42 9026 va_start(args, pat2);
8736538c
AS
9027#else
9028 va_start(args);
9029#endif
5a844595 9030 msv = vmess(buf, &args);
c277df42 9031 va_end(args);
cfd0369c 9032 message = SvPV_const(msv,l1);
c277df42
IZ
9033 if (l1 > 512)
9034 l1 = 512;
9035 Copy(message, buf, l1 , char);
197cf9b9 9036 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9037 Perl_croak(aTHX_ "%s", buf);
c277df42 9038}
a0ed51b3
LW
9039
9040/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9041
76234dfb 9042#ifndef PERL_IN_XSUB_RE
a0ed51b3 9043void
864dbfa3 9044Perl_save_re_context(pTHX)
b81d288d 9045{
97aff369 9046 dVAR;
1ade1aa1
NC
9047
9048 struct re_save_state *state;
9049
9050 SAVEVPTR(PL_curcop);
9051 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9052
9053 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9054 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9055 SSPUSHINT(SAVEt_RE_STATE);
9056
46ab3289 9057 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9058
a0ed51b3 9059 PL_reg_start_tmp = 0;
a0ed51b3 9060 PL_reg_start_tmpl = 0;
c445ea15 9061 PL_reg_oldsaved = NULL;
a5db57d6 9062 PL_reg_oldsavedlen = 0;
a5db57d6 9063 PL_reg_maxiter = 0;
a5db57d6 9064 PL_reg_leftiter = 0;
c445ea15 9065 PL_reg_poscache = NULL;
a5db57d6 9066 PL_reg_poscache_size = 0;
1ade1aa1
NC
9067#ifdef PERL_OLD_COPY_ON_WRITE
9068 PL_nrs = NULL;
9069#endif
ada6e8a9 9070
c445ea15
AL
9071 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9072 if (PL_curpm) {
9073 const REGEXP * const rx = PM_GETRE(PL_curpm);
9074 if (rx) {
1df70142 9075 U32 i;
ada6e8a9 9076 for (i = 1; i <= rx->nparens; i++) {
1df70142 9077 char digits[TYPE_CHARS(long)];
d9fad198 9078 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9079 GV *const *const gvp
9080 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9081
b37c2d43
AL
9082 if (gvp) {
9083 GV * const gv = *gvp;
9084 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9085 save_scalar(gv);
49f27e4b 9086 }
ada6e8a9
AMS
9087 }
9088 }
9089 }
a0ed51b3 9090}
76234dfb 9091#endif
51371543 9092
51371543 9093static void
acfe0abc 9094clear_re(pTHX_ void *r)
51371543 9095{
97aff369 9096 dVAR;
51371543
GS
9097 ReREFCNT_dec((regexp *)r);
9098}
ffbc6a93 9099
a28509cc
AL
9100#ifdef DEBUGGING
9101
9102STATIC void
9103S_put_byte(pTHX_ SV *sv, int c)
9104{
9105 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9106 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9107 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9108 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9109 else
9110 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9111}
9112
786e8c11 9113
3dab1dad
YO
9114#define CLEAR_OPTSTART \
9115 if (optstart) STMT_START { \
70685ca0 9116 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9117 optstart=NULL; \
9118 } STMT_END
9119
786e8c11 9120#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9121
b5a2f8d8
NC
9122STATIC const regnode *
9123S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9124 const regnode *last, const regnode *plast,
9125 SV* sv, I32 indent, U32 depth)
a28509cc 9126{
97aff369 9127 dVAR;
786e8c11 9128 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9129 register const regnode *next;
3dab1dad 9130 const regnode *optstart= NULL;
1f1031fe 9131
f8fc2ecf 9132 RXi_GET_DECL(r,ri);
3dab1dad 9133 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9134
786e8c11
YO
9135#ifdef DEBUG_DUMPUNTIL
9136 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9137 last ? last-start : 0,plast ? plast-start : 0);
9138#endif
9139
9140 if (plast && plast < last)
9141 last= plast;
9142
9143 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9144 /* While that wasn't END last time... */
a28509cc
AL
9145 NODE_ALIGN(node);
9146 op = OP(node);
de734bd5 9147 if (op == CLOSE || op == WHILEM)
786e8c11 9148 indent--;
b5a2f8d8 9149 next = regnext((regnode *)node);
1f1031fe 9150
a28509cc 9151 /* Where, what. */
8e11feef 9152 if (OP(node) == OPTIMIZED) {
e68ec53f 9153 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9154 optstart = node;
3dab1dad 9155 else
8e11feef 9156 goto after_print;
3dab1dad
YO
9157 } else
9158 CLEAR_OPTSTART;
1f1031fe 9159
32fc9b6a 9160 regprop(r, sv, node);
a28509cc 9161 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9162 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9163
9164 if (OP(node) != OPTIMIZED) {
9165 if (next == NULL) /* Next ptr. */
9166 PerlIO_printf(Perl_debug_log, " (0)");
9167 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9168 PerlIO_printf(Perl_debug_log, " (FAIL)");
9169 else
9170 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9171 (void)PerlIO_putc(Perl_debug_log, '\n');
9172 }
9173
a28509cc
AL
9174 after_print:
9175 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9176 assert(next);
9177 {
9178 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9179 ? regnext((regnode *)next)
9180 : next);
be8e71aa
YO
9181 if (last && nnode > last)
9182 nnode = last;
786e8c11 9183 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9184 }
a28509cc
AL
9185 }
9186 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9187 assert(next);
786e8c11 9188 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9189 }
9190 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9191 const regnode *this_trie = node;
1de06328 9192 const char op = OP(node);
647f639f 9193 const U32 n = ARG(node);
1de06328 9194 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9195 (reg_ac_data *)ri->data->data[n] :
1de06328 9196 NULL;
3251b653
NC
9197 const reg_trie_data * const trie =
9198 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9199#ifdef DEBUGGING
9200 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9201#endif
786e8c11 9202 const regnode *nextbranch= NULL;
a28509cc 9203 I32 word_idx;
1de06328 9204 sv_setpvn(sv, "", 0);
786e8c11 9205 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9206 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9207
9208 PerlIO_printf(Perl_debug_log, "%*s%s ",
9209 (int)(2*(indent+3)), "",
9210 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9211 PL_colors[0], PL_colors[1],
9212 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9213 PERL_PV_PRETTY_ELIPSES |
7f69552c 9214 PERL_PV_PRETTY_LTGT
786e8c11
YO
9215 )
9216 : "???"
9217 );
9218 if (trie->jump) {
40d049e4 9219 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9220 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9221 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9222 if (dist) {
9223 if (!nextbranch)
24b23f37 9224 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9225 DUMPUNTIL(this_trie + dist, nextbranch);
9226 }
786e8c11
YO
9227 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9228 nextbranch= regnext((regnode *)nextbranch);
9229 } else {
9230 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9231 }
786e8c11
YO
9232 }
9233 if (last && next > last)
9234 node= last;
9235 else
9236 node= next;
a28509cc 9237 }
786e8c11
YO
9238 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9239 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9240 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9241 }
9242 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9243 assert(next);
786e8c11 9244 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9245 }
9246 else if ( op == PLUS || op == STAR) {
786e8c11 9247 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9248 }
9249 else if (op == ANYOF) {
9250 /* arglen 1 + class block */
9251 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9252 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9253 node = NEXTOPER(node);
9254 }
9255 else if (PL_regkind[(U8)op] == EXACT) {
9256 /* Literal string, where present. */
9257 node += NODE_SZ_STR(node) - 1;
9258 node = NEXTOPER(node);
9259 }
9260 else {
9261 node = NEXTOPER(node);
9262 node += regarglen[(U8)op];
9263 }
9264 if (op == CURLYX || op == OPEN)
786e8c11 9265 indent++;
a28509cc 9266 }
3dab1dad 9267 CLEAR_OPTSTART;
786e8c11 9268#ifdef DEBUG_DUMPUNTIL
70685ca0 9269 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9270#endif
1de06328 9271 return node;
a28509cc
AL
9272}
9273
9274#endif /* DEBUGGING */
9275
241d1a3b
NC
9276/*
9277 * Local variables:
9278 * c-indentation-style: bsd
9279 * c-basic-offset: 4
9280 * indent-tabs-mode: t
9281 * End:
9282 *
37442d52
RGS
9283 * ex: set ts=8 sts=4 sw=4 noet:
9284 */