This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge autodie 2.05 into core.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3 92#ifdef MSDOS
7e4e8c89 93# if defined(BUGGY_MSC6)
fe14fcc3 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 95# pragma optimize("a",off)
fe14fcc3 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
97# pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
fe14fcc3
LW
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
830247a4 105typedef struct RExC_state_t {
e2509266 106 U32 flags; /* are we folding, multilining? */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 111 char *start; /* Start of input for compile */
830247a4
IZ
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 115 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 116 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 117 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
120 U32 seen;
121 I32 size; /* Code size. */
c74340f9
YO
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 124 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
6bda09f9 135 HV *charnames; /* cache of named sequences */
81714fb9 136 HV *paren_names; /* Paren names */
1f1031fe 137
40d049e4
YO
138 regnode **recurse; /* Recurse regops */
139 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
140#if ADD_TO_REGEXEC
141 char *starttry; /* -Dr: where regtry was called. */
142#define RExC_starttry (pRExC_state->starttry)
143#endif
3dab1dad 144#ifdef DEBUGGING
be8e71aa 145 const char *lastparse;
3dab1dad 146 I32 lastnum;
1f1031fe 147 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
148#define RExC_lastparse (pRExC_state->lastparse)
149#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 150#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 151#endif
830247a4
IZ
152} RExC_state_t;
153
e2509266 154#define RExC_flags (pRExC_state->flags)
830247a4 155#define RExC_precomp (pRExC_state->precomp)
288b8c02 156#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 157#define RExC_rx (pRExC_state->rx)
f8fc2ecf 158#define RExC_rxi (pRExC_state->rxi)
fac92740 159#define RExC_start (pRExC_state->start)
830247a4
IZ
160#define RExC_end (pRExC_state->end)
161#define RExC_parse (pRExC_state->parse)
162#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
163#ifdef RE_TRACK_PATTERN_OFFSETS
164#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
165#endif
830247a4 166#define RExC_emit (pRExC_state->emit)
fac92740 167#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 168#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
169#define RExC_naughty (pRExC_state->naughty)
170#define RExC_sawback (pRExC_state->sawback)
171#define RExC_seen (pRExC_state->seen)
172#define RExC_size (pRExC_state->size)
173#define RExC_npar (pRExC_state->npar)
e2e6a0f1 174#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
175#define RExC_extralen (pRExC_state->extralen)
176#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
177#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 178#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 179#define RExC_orig_utf8 (pRExC_state->orig_utf8)
fc8cd66c 180#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
181#define RExC_open_parens (pRExC_state->open_parens)
182#define RExC_close_parens (pRExC_state->close_parens)
183#define RExC_opend (pRExC_state->opend)
81714fb9 184#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
185#define RExC_recurse (pRExC_state->recurse)
186#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 187
cde0cee5 188
a687059c
LW
189#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
a687059c 192
35c8bce7
LW
193#ifdef SPSTART
194#undef SPSTART /* dratted cpp namespace... */
195#endif
a687059c
LW
196/*
197 * Flags to be passed up and down.
198 */
a687059c 199#define WORST 0 /* Worst case. */
a3b492c3
YO
200#define HASWIDTH 0x01 /* Known to match non-null strings. */
201#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
202#define SPSTART 0x04 /* Starts with * or +. */
203#define TRYAGAIN 0x08 /* Weeded out a declaration. */
204#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 205
3dab1dad
YO
206#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
207
07be1b83
YO
208/* whether trie related optimizations are enabled */
209#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
210#define TRIE_STUDY_OPT
786e8c11 211#define FULL_TRIE_STUDY
07be1b83
YO
212#define TRIE_STCLASS
213#endif
1de06328
YO
214
215
40d049e4
YO
216
217#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
218#define PBITVAL(paren) (1 << ((paren) & 7))
219#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
220#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
221#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
222
223
1de06328
YO
224/* About scan_data_t.
225
226 During optimisation we recurse through the regexp program performing
227 various inplace (keyhole style) optimisations. In addition study_chunk
228 and scan_commit populate this data structure with information about
229 what strings MUST appear in the pattern. We look for the longest
230 string that must appear for at a fixed location, and we look for the
231 longest string that may appear at a floating location. So for instance
232 in the pattern:
233
234 /FOO[xX]A.*B[xX]BAR/
235
236 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
237 strings (because they follow a .* construct). study_chunk will identify
238 both FOO and BAR as being the longest fixed and floating strings respectively.
239
240 The strings can be composites, for instance
241
242 /(f)(o)(o)/
243
244 will result in a composite fixed substring 'foo'.
245
246 For each string some basic information is maintained:
247
248 - offset or min_offset
249 This is the position the string must appear at, or not before.
250 It also implicitly (when combined with minlenp) tells us how many
251 character must match before the string we are searching.
252 Likewise when combined with minlenp and the length of the string
253 tells us how many characters must appear after the string we have
254 found.
255
256 - max_offset
257 Only used for floating strings. This is the rightmost point that
258 the string can appear at. Ifset to I32 max it indicates that the
259 string can occur infinitely far to the right.
260
261 - minlenp
262 A pointer to the minimum length of the pattern that the string
263 was found inside. This is important as in the case of positive
264 lookahead or positive lookbehind we can have multiple patterns
265 involved. Consider
266
267 /(?=FOO).*F/
268
269 The minimum length of the pattern overall is 3, the minimum length
270 of the lookahead part is 3, but the minimum length of the part that
271 will actually match is 1. So 'FOO's minimum length is 3, but the
272 minimum length for the F is 1. This is important as the minimum length
273 is used to determine offsets in front of and behind the string being
274 looked for. Since strings can be composites this is the length of the
275 pattern at the time it was commited with a scan_commit. Note that
276 the length is calculated by study_chunk, so that the minimum lengths
277 are not known until the full pattern has been compiled, thus the
278 pointer to the value.
279
280 - lookbehind
281
282 In the case of lookbehind the string being searched for can be
283 offset past the start point of the final matching string.
284 If this value was just blithely removed from the min_offset it would
285 invalidate some of the calculations for how many chars must match
286 before or after (as they are derived from min_offset and minlen and
287 the length of the string being searched for).
288 When the final pattern is compiled and the data is moved from the
289 scan_data_t structure into the regexp structure the information
290 about lookbehind is factored in, with the information that would
291 have been lost precalculated in the end_shift field for the
292 associated string.
293
294 The fields pos_min and pos_delta are used to store the minimum offset
295 and the delta to the maximum offset at the current point in the pattern.
296
297*/
2c2d71f5
JH
298
299typedef struct scan_data_t {
1de06328
YO
300 /*I32 len_min; unused */
301 /*I32 len_delta; unused */
2c2d71f5
JH
302 I32 pos_min;
303 I32 pos_delta;
304 SV *last_found;
1de06328 305 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
306 I32 last_start_min;
307 I32 last_start_max;
1de06328
YO
308 SV **longest; /* Either &l_fixed, or &l_float. */
309 SV *longest_fixed; /* longest fixed string found in pattern */
310 I32 offset_fixed; /* offset where it starts */
311 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
312 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
313 SV *longest_float; /* longest floating string found in pattern */
314 I32 offset_float_min; /* earliest point in string it can appear */
315 I32 offset_float_max; /* latest point in string it can appear */
316 I32 *minlen_float; /* pointer to the minlen relevent to the string */
317 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
318 I32 flags;
319 I32 whilem_c;
cb434fcc 320 I32 *last_closep;
653099ff 321 struct regnode_charclass_class *start_class;
2c2d71f5
JH
322} scan_data_t;
323
a687059c 324/*
e50aee73 325 * Forward declarations for pregcomp()'s friends.
a687059c 326 */
a0d0e21e 327
27da23d5 328static const scan_data_t zero_scan_data =
1de06328 329 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
330
331#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
332#define SF_BEFORE_SEOL 0x0001
333#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
334#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
335#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
336
09b7f37c
CB
337#ifdef NO_UNARY_PLUS
338# define SF_FIX_SHIFT_EOL (0+2)
339# define SF_FL_SHIFT_EOL (0+4)
340#else
341# define SF_FIX_SHIFT_EOL (+2)
342# define SF_FL_SHIFT_EOL (+4)
343#endif
c277df42
IZ
344
345#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
346#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
347
348#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
349#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
350#define SF_IS_INF 0x0040
351#define SF_HAS_PAR 0x0080
352#define SF_IN_PAR 0x0100
353#define SF_HAS_EVAL 0x0200
354#define SCF_DO_SUBSTR 0x0400
653099ff
GS
355#define SCF_DO_STCLASS_AND 0x0800
356#define SCF_DO_STCLASS_OR 0x1000
357#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 358#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 359
786e8c11 360#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 361#define SCF_SEEN_ACCEPT 0x8000
07be1b83 362
eb160463 363#define UTF (RExC_utf8 != 0)
bbe252da
YO
364#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
365#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 366
ffc61ed2 367#define OOB_UNICODE 12345678
93733859 368#define OOB_NAMEDCLASS -1
b8c5462f 369
a0ed51b3
LW
370#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
371#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
372
8615cb43 373
b45f050a
JF
374/* length of regex to show in messages that don't mark a position within */
375#define RegexLengthToShowInErrorMessages 127
376
377/*
378 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
379 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
380 * op/pragma/warn/regcomp.
381 */
7253e4e3
RK
382#define MARKER1 "<-- HERE" /* marker as it appears in the description */
383#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 384
7253e4e3 385#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
386
387/*
388 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
389 * arg. Show regex, up to a maximum length. If it's too long, chop and add
390 * "...".
391 */
58e23c8d 392#define _FAIL(code) STMT_START { \
bfed75c6 393 const char *ellipses = ""; \
ccb2c380
MP
394 IV len = RExC_end - RExC_precomp; \
395 \
396 if (!SIZE_ONLY) \
288b8c02 397 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
398 if (len > RegexLengthToShowInErrorMessages) { \
399 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
400 len = RegexLengthToShowInErrorMessages - 10; \
401 ellipses = "..."; \
402 } \
58e23c8d 403 code; \
ccb2c380 404} STMT_END
8615cb43 405
58e23c8d
YO
406#define FAIL(msg) _FAIL( \
407 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
408 msg, (int)len, RExC_precomp, ellipses))
409
410#define FAIL2(msg,arg) _FAIL( \
411 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
412 arg, (int)len, RExC_precomp, ellipses))
413
b45f050a 414/*
b45f050a
JF
415 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
416 */
ccb2c380 417#define Simple_vFAIL(m) STMT_START { \
a28509cc 418 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
419 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
420 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
421} STMT_END
b45f050a
JF
422
423/*
424 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
425 */
ccb2c380
MP
426#define vFAIL(m) STMT_START { \
427 if (!SIZE_ONLY) \
288b8c02 428 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
429 Simple_vFAIL(m); \
430} STMT_END
b45f050a
JF
431
432/*
433 * Like Simple_vFAIL(), but accepts two arguments.
434 */
ccb2c380 435#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 436 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
437 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
438 (int)offset, RExC_precomp, RExC_precomp + offset); \
439} STMT_END
b45f050a
JF
440
441/*
442 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
443 */
ccb2c380
MP
444#define vFAIL2(m,a1) STMT_START { \
445 if (!SIZE_ONLY) \
288b8c02 446 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
447 Simple_vFAIL2(m, a1); \
448} STMT_END
b45f050a
JF
449
450
451/*
452 * Like Simple_vFAIL(), but accepts three arguments.
453 */
ccb2c380 454#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 455 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
456 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
457 (int)offset, RExC_precomp, RExC_precomp + offset); \
458} STMT_END
b45f050a
JF
459
460/*
461 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
462 */
ccb2c380
MP
463#define vFAIL3(m,a1,a2) STMT_START { \
464 if (!SIZE_ONLY) \
288b8c02 465 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
466 Simple_vFAIL3(m, a1, a2); \
467} STMT_END
b45f050a
JF
468
469/*
470 * Like Simple_vFAIL(), but accepts four arguments.
471 */
ccb2c380 472#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 473 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
474 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
475 (int)offset, RExC_precomp, RExC_precomp + offset); \
476} STMT_END
b45f050a 477
ccb2c380 478#define vWARN(loc,m) STMT_START { \
a28509cc 479 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
480 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
481 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
482} STMT_END
483
484#define vWARNdep(loc,m) STMT_START { \
a28509cc 485 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
486 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
487 "%s" REPORT_LOCATION, \
488 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
489} STMT_END
490
491
492#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 493 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
494 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
495 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
496} STMT_END
497
498#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 499 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
500 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
501 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
503
504#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 505 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
506 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
507 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
508} STMT_END
509
510#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 511 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
512 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
513 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
514} STMT_END
9d1d55b5 515
8615cb43 516
cd439c50 517/* Allow for side effects in s */
ccb2c380
MP
518#define REGC(c,s) STMT_START { \
519 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
520} STMT_END
cd439c50 521
fac92740
MJD
522/* Macros for recording node offsets. 20001227 mjd@plover.com
523 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
524 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
525 * Element 0 holds the number n.
07be1b83 526 * Position is 1 indexed.
fac92740 527 */
7122b237
YO
528#ifndef RE_TRACK_PATTERN_OFFSETS
529#define Set_Node_Offset_To_R(node,byte)
530#define Set_Node_Offset(node,byte)
531#define Set_Cur_Node_Offset
532#define Set_Node_Length_To_R(node,len)
533#define Set_Node_Length(node,len)
534#define Set_Node_Cur_Length(node)
535#define Node_Offset(n)
536#define Node_Length(n)
537#define Set_Node_Offset_Length(node,offset,len)
538#define ProgLen(ri) ri->u.proglen
539#define SetProgLen(ri,x) ri->u.proglen = x
540#else
541#define ProgLen(ri) ri->u.offsets[0]
542#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
543#define Set_Node_Offset_To_R(node,byte) STMT_START { \
544 if (! SIZE_ONLY) { \
545 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 546 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 547 if((node) < 0) { \
551405c4 548 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
549 } else { \
550 RExC_offsets[2*(node)-1] = (byte); \
551 } \
552 } \
553} STMT_END
554
555#define Set_Node_Offset(node,byte) \
556 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
557#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
558
559#define Set_Node_Length_To_R(node,len) STMT_START { \
560 if (! SIZE_ONLY) { \
561 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 562 __LINE__, (int)(node), (int)(len))); \
ccb2c380 563 if((node) < 0) { \
551405c4 564 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
565 } else { \
566 RExC_offsets[2*(node)] = (len); \
567 } \
568 } \
569} STMT_END
570
571#define Set_Node_Length(node,len) \
572 Set_Node_Length_To_R((node)-RExC_emit_start, len)
573#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
574#define Set_Node_Cur_Length(node) \
575 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
576
577/* Get offsets and lengths */
578#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
579#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
580
07be1b83
YO
581#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
582 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
583 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
584} STMT_END
7122b237 585#endif
07be1b83
YO
586
587#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
588#define EXPERIMENTAL_INPLACESCAN
7122b237 589#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 590
304ee84b
YO
591#define DEBUG_STUDYDATA(str,data,depth) \
592DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 593 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
594 "%*s" str "Pos:%"IVdf"/%"IVdf \
595 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
596 (int)(depth)*2, "", \
597 (IV)((data)->pos_min), \
598 (IV)((data)->pos_delta), \
304ee84b 599 (UV)((data)->flags), \
1de06328 600 (IV)((data)->whilem_c), \
304ee84b
YO
601 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
602 is_inf ? "INF " : "" \
1de06328
YO
603 ); \
604 if ((data)->last_found) \
605 PerlIO_printf(Perl_debug_log, \
606 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
607 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
608 SvPVX_const((data)->last_found), \
609 (IV)((data)->last_end), \
610 (IV)((data)->last_start_min), \
611 (IV)((data)->last_start_max), \
612 ((data)->longest && \
613 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
614 SvPVX_const((data)->longest_fixed), \
615 (IV)((data)->offset_fixed), \
616 ((data)->longest && \
617 (data)->longest==&((data)->longest_float)) ? "*" : "", \
618 SvPVX_const((data)->longest_float), \
619 (IV)((data)->offset_float_min), \
620 (IV)((data)->offset_float_max) \
621 ); \
622 PerlIO_printf(Perl_debug_log,"\n"); \
623});
624
acfe0abc 625static void clear_re(pTHX_ void *r);
4327152a 626
653099ff 627/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 628 Update the longest found anchored substring and the longest found
653099ff
GS
629 floating substrings if needed. */
630
4327152a 631STATIC void
304ee84b 632S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 633{
e1ec3a88
AL
634 const STRLEN l = CHR_SVLEN(data->last_found);
635 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 636 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 637
7918f24d
NC
638 PERL_ARGS_ASSERT_SCAN_COMMIT;
639
c277df42 640 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 641 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
642 if (*data->longest == data->longest_fixed) {
643 data->offset_fixed = l ? data->last_start_min : data->pos_min;
644 if (data->flags & SF_BEFORE_EOL)
b81d288d 645 data->flags
c277df42
IZ
646 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
647 else
648 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
649 data->minlen_fixed=minlenp;
650 data->lookbehind_fixed=0;
a0ed51b3 651 }
304ee84b 652 else { /* *data->longest == data->longest_float */
c277df42 653 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
654 data->offset_float_max = (l
655 ? data->last_start_max
c277df42 656 : data->pos_min + data->pos_delta);
304ee84b 657 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 658 data->offset_float_max = I32_MAX;
c277df42 659 if (data->flags & SF_BEFORE_EOL)
b81d288d 660 data->flags
c277df42
IZ
661 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
662 else
663 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
664 data->minlen_float=minlenp;
665 data->lookbehind_float=0;
c277df42
IZ
666 }
667 }
668 SvCUR_set(data->last_found, 0);
0eda9292 669 {
a28509cc 670 SV * const sv = data->last_found;
097eb12c
AL
671 if (SvUTF8(sv) && SvMAGICAL(sv)) {
672 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
673 if (mg)
674 mg->mg_len = 0;
675 }
0eda9292 676 }
c277df42
IZ
677 data->last_end = -1;
678 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 679 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
680}
681
653099ff
GS
682/* Can match anything (initialization) */
683STATIC void
097eb12c 684S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 685{
7918f24d
NC
686 PERL_ARGS_ASSERT_CL_ANYTHING;
687
653099ff 688 ANYOF_CLASS_ZERO(cl);
f8bef550 689 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 690 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
691 if (LOC)
692 cl->flags |= ANYOF_LOCALE;
693}
694
695/* Can match anything (initialization) */
696STATIC int
5f66b61c 697S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
698{
699 int value;
700
7918f24d
NC
701 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
702
aaa51d5e 703 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
704 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
705 return 1;
1aa99e6b
IH
706 if (!(cl->flags & ANYOF_UNICODE_ALL))
707 return 0;
10edeb5d 708 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 709 return 0;
653099ff
GS
710 return 1;
711}
712
713/* Can match anything (initialization) */
714STATIC void
097eb12c 715S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 716{
7918f24d
NC
717 PERL_ARGS_ASSERT_CL_INIT;
718
8ecf7187 719 Zero(cl, 1, struct regnode_charclass_class);
653099ff 720 cl->type = ANYOF;
830247a4 721 cl_anything(pRExC_state, cl);
653099ff
GS
722}
723
724STATIC void
097eb12c 725S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 726{
7918f24d
NC
727 PERL_ARGS_ASSERT_CL_INIT_ZERO;
728
8ecf7187 729 Zero(cl, 1, struct regnode_charclass_class);
653099ff 730 cl->type = ANYOF;
830247a4 731 cl_anything(pRExC_state, cl);
653099ff
GS
732 if (LOC)
733 cl->flags |= ANYOF_LOCALE;
734}
735
736/* 'And' a given class with another one. Can create false positives */
737/* We assume that cl is not inverted */
738STATIC void
5f66b61c 739S_cl_and(struct regnode_charclass_class *cl,
a28509cc 740 const struct regnode_charclass_class *and_with)
653099ff 741{
7918f24d 742 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
743
744 assert(and_with->type == ANYOF);
653099ff
GS
745 if (!(and_with->flags & ANYOF_CLASS)
746 && !(cl->flags & ANYOF_CLASS)
747 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
748 && !(and_with->flags & ANYOF_FOLD)
749 && !(cl->flags & ANYOF_FOLD)) {
750 int i;
751
752 if (and_with->flags & ANYOF_INVERT)
753 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
754 cl->bitmap[i] &= ~and_with->bitmap[i];
755 else
756 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
757 cl->bitmap[i] &= and_with->bitmap[i];
758 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
759 if (!(and_with->flags & ANYOF_EOS))
760 cl->flags &= ~ANYOF_EOS;
1aa99e6b 761
14ebb1a2
JH
762 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
763 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
764 cl->flags &= ~ANYOF_UNICODE_ALL;
765 cl->flags |= ANYOF_UNICODE;
766 ARG_SET(cl, ARG(and_with));
767 }
14ebb1a2
JH
768 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
769 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 770 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
771 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
772 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 773 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
774}
775
776/* 'OR' a given class with another one. Can create false positives */
777/* We assume that cl is not inverted */
778STATIC void
097eb12c 779S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 780{
7918f24d
NC
781 PERL_ARGS_ASSERT_CL_OR;
782
653099ff
GS
783 if (or_with->flags & ANYOF_INVERT) {
784 /* We do not use
785 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
786 * <= (B1 | !B2) | (CL1 | !CL2)
787 * which is wasteful if CL2 is small, but we ignore CL2:
788 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
789 * XXXX Can we handle case-fold? Unclear:
790 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
791 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
792 */
793 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
794 && !(or_with->flags & ANYOF_FOLD)
795 && !(cl->flags & ANYOF_FOLD) ) {
796 int i;
797
798 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
799 cl->bitmap[i] |= ~or_with->bitmap[i];
800 } /* XXXX: logic is complicated otherwise */
801 else {
830247a4 802 cl_anything(pRExC_state, cl);
653099ff
GS
803 }
804 } else {
805 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
806 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 807 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
808 || (cl->flags & ANYOF_FOLD)) ) {
809 int i;
810
811 /* OR char bitmap and class bitmap separately */
812 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
813 cl->bitmap[i] |= or_with->bitmap[i];
814 if (or_with->flags & ANYOF_CLASS) {
815 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
816 cl->classflags[i] |= or_with->classflags[i];
817 cl->flags |= ANYOF_CLASS;
818 }
819 }
820 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 821 cl_anything(pRExC_state, cl);
653099ff
GS
822 }
823 }
824 if (or_with->flags & ANYOF_EOS)
825 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
826
827 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
828 ARG(cl) != ARG(or_with)) {
829 cl->flags |= ANYOF_UNICODE_ALL;
830 cl->flags &= ~ANYOF_UNICODE;
831 }
832 if (or_with->flags & ANYOF_UNICODE_ALL) {
833 cl->flags |= ANYOF_UNICODE_ALL;
834 cl->flags &= ~ANYOF_UNICODE;
835 }
653099ff
GS
836}
837
a3621e74
YO
838#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
839#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
840#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
841#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
842
3dab1dad
YO
843
844#ifdef DEBUGGING
07be1b83 845/*
2b8b4781
NC
846 dump_trie(trie,widecharmap,revcharmap)
847 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
848 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
849
850 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
851 The _interim_ variants are used for debugging the interim
852 tables that are used to generate the final compressed
853 representation which is what dump_trie expects.
854
3dab1dad
YO
855 Part of the reason for their existance is to provide a form
856 of documentation as to how the different representations function.
07be1b83
YO
857
858*/
3dab1dad
YO
859
860/*
3dab1dad
YO
861 Dumps the final compressed table form of the trie to Perl_debug_log.
862 Used for debugging make_trie().
863*/
864
865STATIC void
2b8b4781
NC
866S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
867 AV *revcharmap, U32 depth)
3dab1dad
YO
868{
869 U32 state;
ab3bbdeb 870 SV *sv=sv_newmortal();
55eed653 871 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
872 GET_RE_DEBUG_FLAGS_DECL;
873
7918f24d 874 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 875
3dab1dad
YO
876 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
877 (int)depth * 2 + 2,"",
878 "Match","Base","Ofs" );
879
880 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 881 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 882 if ( tmp ) {
ab3bbdeb
YO
883 PerlIO_printf( Perl_debug_log, "%*s",
884 colwidth,
ddc5bc0f 885 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
886 PL_colors[0], PL_colors[1],
887 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
888 PERL_PV_ESCAPE_FIRSTCHAR
889 )
890 );
3dab1dad
YO
891 }
892 }
893 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
894 (int)depth * 2 + 2,"");
895
896 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 897 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
898 PerlIO_printf( Perl_debug_log, "\n");
899
1e2e3d02 900 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 901 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
902
903 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
904
905 if ( trie->states[ state ].wordnum ) {
906 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
907 } else {
908 PerlIO_printf( Perl_debug_log, "%6s", "" );
909 }
910
911 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
912
913 if ( base ) {
914 U32 ofs = 0;
915
916 while( ( base + ofs < trie->uniquecharcount ) ||
917 ( base + ofs - trie->uniquecharcount < trie->lasttrans
918 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
919 ofs++;
920
921 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
922
923 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
924 if ( ( base + ofs >= trie->uniquecharcount ) &&
925 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
926 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
927 {
ab3bbdeb
YO
928 PerlIO_printf( Perl_debug_log, "%*"UVXf,
929 colwidth,
3dab1dad
YO
930 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
931 } else {
ab3bbdeb 932 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
933 }
934 }
935
936 PerlIO_printf( Perl_debug_log, "]");
937
938 }
939 PerlIO_printf( Perl_debug_log, "\n" );
940 }
941}
942/*
3dab1dad
YO
943 Dumps a fully constructed but uncompressed trie in list form.
944 List tries normally only are used for construction when the number of
945 possible chars (trie->uniquecharcount) is very high.
946 Used for debugging make_trie().
947*/
948STATIC void
55eed653 949S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
950 HV *widecharmap, AV *revcharmap, U32 next_alloc,
951 U32 depth)
3dab1dad
YO
952{
953 U32 state;
ab3bbdeb 954 SV *sv=sv_newmortal();
55eed653 955 int colwidth= widecharmap ? 6 : 4;
3dab1dad 956 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
957
958 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
959
3dab1dad 960 /* print out the table precompression. */
ab3bbdeb
YO
961 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
962 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
963 "------:-----+-----------------\n" );
3dab1dad
YO
964
965 for( state=1 ; state < next_alloc ; state ++ ) {
966 U16 charid;
967
ab3bbdeb 968 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
969 (int)depth * 2 + 2,"", (UV)state );
970 if ( ! trie->states[ state ].wordnum ) {
971 PerlIO_printf( Perl_debug_log, "%5s| ","");
972 } else {
973 PerlIO_printf( Perl_debug_log, "W%4x| ",
974 trie->states[ state ].wordnum
975 );
976 }
977 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 978 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
979 if ( tmp ) {
980 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
981 colwidth,
ddc5bc0f 982 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
983 PL_colors[0], PL_colors[1],
984 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
985 PERL_PV_ESCAPE_FIRSTCHAR
986 ) ,
1e2e3d02
YO
987 TRIE_LIST_ITEM(state,charid).forid,
988 (UV)TRIE_LIST_ITEM(state,charid).newstate
989 );
990 if (!(charid % 10))
664e119d
RGS
991 PerlIO_printf(Perl_debug_log, "\n%*s| ",
992 (int)((depth * 2) + 14), "");
1e2e3d02 993 }
ab3bbdeb
YO
994 }
995 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
996 }
997}
998
999/*
3dab1dad
YO
1000 Dumps a fully constructed but uncompressed trie in table form.
1001 This is the normal DFA style state transition table, with a few
1002 twists to facilitate compression later.
1003 Used for debugging make_trie().
1004*/
1005STATIC void
55eed653 1006S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1007 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1008 U32 depth)
3dab1dad
YO
1009{
1010 U32 state;
1011 U16 charid;
ab3bbdeb 1012 SV *sv=sv_newmortal();
55eed653 1013 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1014 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1015
1016 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1017
1018 /*
1019 print out the table precompression so that we can do a visual check
1020 that they are identical.
1021 */
1022
1023 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1024
1025 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1026 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1027 if ( tmp ) {
ab3bbdeb
YO
1028 PerlIO_printf( Perl_debug_log, "%*s",
1029 colwidth,
ddc5bc0f 1030 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1031 PL_colors[0], PL_colors[1],
1032 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1033 PERL_PV_ESCAPE_FIRSTCHAR
1034 )
1035 );
3dab1dad
YO
1036 }
1037 }
1038
1039 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1040
1041 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1042 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1043 }
1044
1045 PerlIO_printf( Perl_debug_log, "\n" );
1046
1047 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1048
1049 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1050 (int)depth * 2 + 2,"",
1051 (UV)TRIE_NODENUM( state ) );
1052
1053 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1054 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1055 if (v)
1056 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1057 else
1058 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1059 }
1060 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1061 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1062 } else {
1063 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1064 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1065 }
1066 }
07be1b83 1067}
3dab1dad
YO
1068
1069#endif
1070
786e8c11
YO
1071/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1072 startbranch: the first branch in the whole branch sequence
1073 first : start branch of sequence of branch-exact nodes.
1074 May be the same as startbranch
1075 last : Thing following the last branch.
1076 May be the same as tail.
1077 tail : item following the branch sequence
1078 count : words in the sequence
1079 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1080 depth : indent depth
3dab1dad 1081
786e8c11 1082Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1083
786e8c11
YO
1084A trie is an N'ary tree where the branches are determined by digital
1085decomposition of the key. IE, at the root node you look up the 1st character and
1086follow that branch repeat until you find the end of the branches. Nodes can be
1087marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1088
786e8c11 1089 /he|she|his|hers/
72f13be8 1090
786e8c11
YO
1091would convert into the following structure. Numbers represent states, letters
1092following numbers represent valid transitions on the letter from that state, if
1093the number is in square brackets it represents an accepting state, otherwise it
1094will be in parenthesis.
07be1b83 1095
786e8c11
YO
1096 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1097 | |
1098 | (2)
1099 | |
1100 (1) +-i->(6)-+-s->[7]
1101 |
1102 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1103
786e8c11
YO
1104 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1105
1106This shows that when matching against the string 'hers' we will begin at state 1
1107read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1108then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1109is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1110single traverse. We store a mapping from accepting to state to which word was
1111matched, and then when we have multiple possibilities we try to complete the
1112rest of the regex in the order in which they occured in the alternation.
1113
1114The only prior NFA like behaviour that would be changed by the TRIE support is
1115the silent ignoring of duplicate alternations which are of the form:
1116
1117 / (DUPE|DUPE) X? (?{ ... }) Y /x
1118
1119Thus EVAL blocks follwing a trie may be called a different number of times with
1120and without the optimisation. With the optimisations dupes will be silently
1121ignored. This inconsistant behaviour of EVAL type nodes is well established as
1122the following demonstrates:
1123
1124 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1125
1126which prints out 'word' three times, but
1127
1128 'words'=~/(word|word|word)(?{ print $1 })S/
1129
1130which doesnt print it out at all. This is due to other optimisations kicking in.
1131
1132Example of what happens on a structural level:
1133
1134The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1135
1136 1: CURLYM[1] {1,32767}(18)
1137 5: BRANCH(8)
1138 6: EXACT <ac>(16)
1139 8: BRANCH(11)
1140 9: EXACT <ad>(16)
1141 11: BRANCH(14)
1142 12: EXACT <ab>(16)
1143 16: SUCCEED(0)
1144 17: NOTHING(18)
1145 18: END(0)
1146
1147This would be optimizable with startbranch=5, first=5, last=16, tail=16
1148and should turn into:
1149
1150 1: CURLYM[1] {1,32767}(18)
1151 5: TRIE(16)
1152 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1153 <ac>
1154 <ad>
1155 <ab>
1156 16: SUCCEED(0)
1157 17: NOTHING(18)
1158 18: END(0)
1159
1160Cases where tail != last would be like /(?foo|bar)baz/:
1161
1162 1: BRANCH(4)
1163 2: EXACT <foo>(8)
1164 4: BRANCH(7)
1165 5: EXACT <bar>(8)
1166 7: TAIL(8)
1167 8: EXACT <baz>(10)
1168 10: END(0)
1169
1170which would be optimizable with startbranch=1, first=1, last=7, tail=8
1171and would end up looking like:
1172
1173 1: TRIE(8)
1174 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1175 <foo>
1176 <bar>
1177 7: TAIL(8)
1178 8: EXACT <baz>(10)
1179 10: END(0)
1180
1181 d = uvuni_to_utf8_flags(d, uv, 0);
1182
1183is the recommended Unicode-aware way of saying
1184
1185 *(d++) = uv;
1186*/
1187
1e2e3d02 1188#define TRIE_STORE_REVCHAR \
786e8c11 1189 STMT_START { \
73031816
NC
1190 if (UTF) { \
1191 SV *zlopp = newSV(2); \
88c9ea1e
CB
1192 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1193 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1194 SvCUR_set(zlopp, kapow - flrbbbbb); \
1195 SvPOK_on(zlopp); \
1196 SvUTF8_on(zlopp); \
1197 av_push(revcharmap, zlopp); \
1198 } else { \
6bdeddd2 1199 char ooooff = (char)uvc; \
73031816
NC
1200 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1201 } \
1202 } STMT_END
786e8c11
YO
1203
1204#define TRIE_READ_CHAR STMT_START { \
1205 wordlen++; \
1206 if ( UTF ) { \
1207 if ( folder ) { \
1208 if ( foldlen > 0 ) { \
1209 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1210 foldlen -= len; \
1211 scan += len; \
1212 len = 0; \
1213 } else { \
1214 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1215 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1216 foldlen -= UNISKIP( uvc ); \
1217 scan = foldbuf + UNISKIP( uvc ); \
1218 } \
1219 } else { \
1220 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1221 } \
1222 } else { \
1223 uvc = (U32)*uc; \
1224 len = 1; \
1225 } \
1226} STMT_END
1227
1228
1229
1230#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1231 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1232 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1233 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1234 } \
1235 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1236 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1237 TRIE_LIST_CUR( state )++; \
1238} STMT_END
07be1b83 1239
786e8c11
YO
1240#define TRIE_LIST_NEW(state) STMT_START { \
1241 Newxz( trie->states[ state ].trans.list, \
1242 4, reg_trie_trans_le ); \
1243 TRIE_LIST_CUR( state ) = 1; \
1244 TRIE_LIST_LEN( state ) = 4; \
1245} STMT_END
07be1b83 1246
786e8c11
YO
1247#define TRIE_HANDLE_WORD(state) STMT_START { \
1248 U16 dupe= trie->states[ state ].wordnum; \
1249 regnode * const noper_next = regnext( noper ); \
1250 \
1251 if (trie->wordlen) \
1252 trie->wordlen[ curword ] = wordlen; \
1253 DEBUG_r({ \
1254 /* store the word for dumping */ \
1255 SV* tmp; \
1256 if (OP(noper) != NOTHING) \
740cce10 1257 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1258 else \
740cce10 1259 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1260 av_push( trie_words, tmp ); \
786e8c11
YO
1261 }); \
1262 \
1263 curword++; \
1264 \
1265 if ( noper_next < tail ) { \
1266 if (!trie->jump) \
c944940b 1267 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1268 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1269 if (!jumper) \
1270 jumper = noper_next; \
1271 if (!nextbranch) \
1272 nextbranch= regnext(cur); \
1273 } \
1274 \
1275 if ( dupe ) { \
1276 /* So it's a dupe. This means we need to maintain a */\
1277 /* linked-list from the first to the next. */\
1278 /* we only allocate the nextword buffer when there */\
1279 /* a dupe, so first time we have to do the allocation */\
1280 if (!trie->nextword) \
c944940b 1281 trie->nextword = (U16 *) \
446bd890 1282 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1283 while ( trie->nextword[dupe] ) \
1284 dupe= trie->nextword[dupe]; \
1285 trie->nextword[dupe]= curword; \
1286 } else { \
1287 /* we haven't inserted this word yet. */ \
1288 trie->states[ state ].wordnum = curword; \
1289 } \
1290} STMT_END
07be1b83 1291
3dab1dad 1292
786e8c11
YO
1293#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1294 ( ( base + charid >= ucharcount \
1295 && base + charid < ubound \
1296 && state == trie->trans[ base - ucharcount + charid ].check \
1297 && trie->trans[ base - ucharcount + charid ].next ) \
1298 ? trie->trans[ base - ucharcount + charid ].next \
1299 : ( state==1 ? special : 0 ) \
1300 )
3dab1dad 1301
786e8c11
YO
1302#define MADE_TRIE 1
1303#define MADE_JUMP_TRIE 2
1304#define MADE_EXACT_TRIE 4
3dab1dad 1305
a3621e74 1306STATIC I32
786e8c11 1307S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1308{
27da23d5 1309 dVAR;
a3621e74
YO
1310 /* first pass, loop through and scan words */
1311 reg_trie_data *trie;
55eed653 1312 HV *widecharmap = NULL;
2b8b4781 1313 AV *revcharmap = newAV();
a3621e74 1314 regnode *cur;
9f7f3913 1315 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1316 STRLEN len = 0;
1317 UV uvc = 0;
1318 U16 curword = 0;
1319 U32 next_alloc = 0;
786e8c11
YO
1320 regnode *jumper = NULL;
1321 regnode *nextbranch = NULL;
7f69552c 1322 regnode *convert = NULL;
a3621e74 1323 /* we just use folder as a flag in utf8 */
e1ec3a88 1324 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1325 ? PL_fold
1326 : ( flags == EXACTFL
1327 ? PL_fold_locale
1328 : NULL
1329 )
1330 );
1331
2b8b4781
NC
1332#ifdef DEBUGGING
1333 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1334 AV *trie_words = NULL;
1335 /* along with revcharmap, this only used during construction but both are
1336 * useful during debugging so we store them in the struct when debugging.
8e11feef 1337 */
2b8b4781
NC
1338#else
1339 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1340 STRLEN trie_charcount=0;
3dab1dad 1341#endif
2b8b4781 1342 SV *re_trie_maxbuff;
a3621e74 1343 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1344
1345 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1346#ifndef DEBUGGING
1347 PERL_UNUSED_ARG(depth);
1348#endif
a3621e74 1349
c944940b 1350 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1351 trie->refcount = 1;
3dab1dad 1352 trie->startstate = 1;
786e8c11 1353 trie->wordcount = word_count;
f8fc2ecf 1354 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1355 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1356 if (!(UTF && folder))
c944940b 1357 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1358 DEBUG_r({
2b8b4781 1359 trie_words = newAV();
a3621e74 1360 });
a3621e74 1361
0111c4fd 1362 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1363 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1364 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1365 }
3dab1dad
YO
1366 DEBUG_OPTIMISE_r({
1367 PerlIO_printf( Perl_debug_log,
786e8c11 1368 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1369 (int)depth * 2 + 2, "",
1370 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1371 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1372 (int)depth);
3dab1dad 1373 });
7f69552c
YO
1374
1375 /* Find the node we are going to overwrite */
1376 if ( first == startbranch && OP( last ) != BRANCH ) {
1377 /* whole branch chain */
1378 convert = first;
1379 } else {
1380 /* branch sub-chain */
1381 convert = NEXTOPER( first );
1382 }
1383
a3621e74
YO
1384 /* -- First loop and Setup --
1385
1386 We first traverse the branches and scan each word to determine if it
1387 contains widechars, and how many unique chars there are, this is
1388 important as we have to build a table with at least as many columns as we
1389 have unique chars.
1390
1391 We use an array of integers to represent the character codes 0..255
38a44b82 1392 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1393 native representation of the character value as the key and IV's for the
1394 coded index.
1395
1396 *TODO* If we keep track of how many times each character is used we can
1397 remap the columns so that the table compression later on is more
1398 efficient in terms of memory by ensuring most common value is in the
1399 middle and the least common are on the outside. IMO this would be better
1400 than a most to least common mapping as theres a decent chance the most
1401 common letter will share a node with the least common, meaning the node
1402 will not be compressable. With a middle is most common approach the worst
1403 case is when we have the least common nodes twice.
1404
1405 */
1406
a3621e74 1407 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1408 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1409 const U8 *uc = (U8*)STRING( noper );
a28509cc 1410 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1411 STRLEN foldlen = 0;
1412 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1413 const U8 *scan = (U8*)NULL;
07be1b83 1414 U32 wordlen = 0; /* required init */
02daf0ab
YO
1415 STRLEN chars = 0;
1416 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1417
3dab1dad
YO
1418 if (OP(noper) == NOTHING) {
1419 trie->minlen= 0;
1420 continue;
1421 }
02daf0ab
YO
1422 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1423 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1424 regardless of encoding */
1425
a3621e74 1426 for ( ; uc < e ; uc += len ) {
3dab1dad 1427 TRIE_CHARCOUNT(trie)++;
a3621e74 1428 TRIE_READ_CHAR;
3dab1dad 1429 chars++;
a3621e74
YO
1430 if ( uvc < 256 ) {
1431 if ( !trie->charmap[ uvc ] ) {
1432 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1433 if ( folder )
1434 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1435 TRIE_STORE_REVCHAR;
a3621e74 1436 }
02daf0ab
YO
1437 if ( set_bit ) {
1438 /* store the codepoint in the bitmap, and if its ascii
1439 also store its folded equivelent. */
1440 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1441
1442 /* store the folded codepoint */
1443 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1444
1445 if ( !UTF ) {
1446 /* store first byte of utf8 representation of
1447 codepoints in the 127 < uvc < 256 range */
1448 if (127 < uvc && uvc < 192) {
1449 TRIE_BITMAP_SET(trie,194);
1450 } else if (191 < uvc ) {
1451 TRIE_BITMAP_SET(trie,195);
1452 /* && uvc < 256 -- we know uvc is < 256 already */
1453 }
1454 }
02daf0ab
YO
1455 set_bit = 0; /* We've done our bit :-) */
1456 }
a3621e74
YO
1457 } else {
1458 SV** svpp;
55eed653
NC
1459 if ( !widecharmap )
1460 widecharmap = newHV();
a3621e74 1461
55eed653 1462 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1463
1464 if ( !svpp )
e4584336 1465 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1466
1467 if ( !SvTRUE( *svpp ) ) {
1468 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1469 TRIE_STORE_REVCHAR;
a3621e74
YO
1470 }
1471 }
1472 }
3dab1dad
YO
1473 if( cur == first ) {
1474 trie->minlen=chars;
1475 trie->maxlen=chars;
1476 } else if (chars < trie->minlen) {
1477 trie->minlen=chars;
1478 } else if (chars > trie->maxlen) {
1479 trie->maxlen=chars;
1480 }
1481
a3621e74
YO
1482 } /* end first pass */
1483 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1484 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1485 (int)depth * 2 + 2,"",
55eed653 1486 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1487 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1488 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1489 );
c944940b 1490 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1491
1492 /*
1493 We now know what we are dealing with in terms of unique chars and
1494 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1495 representation using a flat table will take. If it's over a reasonable
1496 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1497 conservative but potentially much slower representation using an array
1498 of lists.
1499
1500 At the end we convert both representations into the same compressed
1501 form that will be used in regexec.c for matching with. The latter
1502 is a form that cannot be used to construct with but has memory
1503 properties similar to the list form and access properties similar
1504 to the table form making it both suitable for fast searches and
1505 small enough that its feasable to store for the duration of a program.
1506
1507 See the comment in the code where the compressed table is produced
1508 inplace from the flat tabe representation for an explanation of how
1509 the compression works.
1510
1511 */
1512
1513
3dab1dad 1514 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1515 /*
1516 Second Pass -- Array Of Lists Representation
1517
1518 Each state will be represented by a list of charid:state records
1519 (reg_trie_trans_le) the first such element holds the CUR and LEN
1520 points of the allocated array. (See defines above).
1521
1522 We build the initial structure using the lists, and then convert
1523 it into the compressed table form which allows faster lookups
1524 (but cant be modified once converted).
a3621e74
YO
1525 */
1526
a3621e74
YO
1527 STRLEN transcount = 1;
1528
1e2e3d02
YO
1529 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1530 "%*sCompiling trie using list compiler\n",
1531 (int)depth * 2 + 2, ""));
446bd890 1532
c944940b
JH
1533 trie->states = (reg_trie_state *)
1534 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1535 sizeof(reg_trie_state) );
a3621e74
YO
1536 TRIE_LIST_NEW(1);
1537 next_alloc = 2;
1538
1539 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1540
c445ea15
AL
1541 regnode * const noper = NEXTOPER( cur );
1542 U8 *uc = (U8*)STRING( noper );
1543 const U8 * const e = uc + STR_LEN( noper );
1544 U32 state = 1; /* required init */
1545 U16 charid = 0; /* sanity init */
1546 U8 *scan = (U8*)NULL; /* sanity init */
1547 STRLEN foldlen = 0; /* required init */
07be1b83 1548 U32 wordlen = 0; /* required init */
c445ea15
AL
1549 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1550
3dab1dad 1551 if (OP(noper) != NOTHING) {
786e8c11 1552 for ( ; uc < e ; uc += len ) {
c445ea15 1553
786e8c11 1554 TRIE_READ_CHAR;
c445ea15 1555
786e8c11
YO
1556 if ( uvc < 256 ) {
1557 charid = trie->charmap[ uvc ];
c445ea15 1558 } else {
55eed653 1559 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1560 if ( !svpp ) {
1561 charid = 0;
1562 } else {
1563 charid=(U16)SvIV( *svpp );
1564 }
c445ea15 1565 }
786e8c11
YO
1566 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1567 if ( charid ) {
a3621e74 1568
786e8c11
YO
1569 U16 check;
1570 U32 newstate = 0;
a3621e74 1571
786e8c11
YO
1572 charid--;
1573 if ( !trie->states[ state ].trans.list ) {
1574 TRIE_LIST_NEW( state );
c445ea15 1575 }
786e8c11
YO
1576 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1577 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1578 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1579 break;
1580 }
1581 }
1582 if ( ! newstate ) {
1583 newstate = next_alloc++;
1584 TRIE_LIST_PUSH( state, charid, newstate );
1585 transcount++;
1586 }
1587 state = newstate;
1588 } else {
1589 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1590 }
a28509cc 1591 }
c445ea15 1592 }
3dab1dad 1593 TRIE_HANDLE_WORD(state);
a3621e74
YO
1594
1595 } /* end second pass */
1596
1e2e3d02
YO
1597 /* next alloc is the NEXT state to be allocated */
1598 trie->statecount = next_alloc;
c944940b
JH
1599 trie->states = (reg_trie_state *)
1600 PerlMemShared_realloc( trie->states,
1601 next_alloc
1602 * sizeof(reg_trie_state) );
a3621e74 1603
3dab1dad 1604 /* and now dump it out before we compress it */
2b8b4781
NC
1605 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1606 revcharmap, next_alloc,
1607 depth+1)
1e2e3d02 1608 );
a3621e74 1609
c944940b
JH
1610 trie->trans = (reg_trie_trans *)
1611 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1612 {
1613 U32 state;
a3621e74
YO
1614 U32 tp = 0;
1615 U32 zp = 0;
1616
1617
1618 for( state=1 ; state < next_alloc ; state ++ ) {
1619 U32 base=0;
1620
1621 /*
1622 DEBUG_TRIE_COMPILE_MORE_r(
1623 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1624 );
1625 */
1626
1627 if (trie->states[state].trans.list) {
1628 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1629 U16 maxid=minid;
a28509cc 1630 U16 idx;
a3621e74
YO
1631
1632 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1633 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1634 if ( forid < minid ) {
1635 minid=forid;
1636 } else if ( forid > maxid ) {
1637 maxid=forid;
1638 }
a3621e74
YO
1639 }
1640 if ( transcount < tp + maxid - minid + 1) {
1641 transcount *= 2;
c944940b
JH
1642 trie->trans = (reg_trie_trans *)
1643 PerlMemShared_realloc( trie->trans,
446bd890
NC
1644 transcount
1645 * sizeof(reg_trie_trans) );
a3621e74
YO
1646 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1647 }
1648 base = trie->uniquecharcount + tp - minid;
1649 if ( maxid == minid ) {
1650 U32 set = 0;
1651 for ( ; zp < tp ; zp++ ) {
1652 if ( ! trie->trans[ zp ].next ) {
1653 base = trie->uniquecharcount + zp - minid;
1654 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1655 trie->trans[ zp ].check = state;
1656 set = 1;
1657 break;
1658 }
1659 }
1660 if ( !set ) {
1661 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1662 trie->trans[ tp ].check = state;
1663 tp++;
1664 zp = tp;
1665 }
1666 } else {
1667 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1668 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1669 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1670 trie->trans[ tid ].check = state;
1671 }
1672 tp += ( maxid - minid + 1 );
1673 }
1674 Safefree(trie->states[ state ].trans.list);
1675 }
1676 /*
1677 DEBUG_TRIE_COMPILE_MORE_r(
1678 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1679 );
1680 */
1681 trie->states[ state ].trans.base=base;
1682 }
cc601c31 1683 trie->lasttrans = tp + 1;
a3621e74
YO
1684 }
1685 } else {
1686 /*
1687 Second Pass -- Flat Table Representation.
1688
1689 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1690 We know that we will need Charcount+1 trans at most to store the data
1691 (one row per char at worst case) So we preallocate both structures
1692 assuming worst case.
1693
1694 We then construct the trie using only the .next slots of the entry
1695 structs.
1696
1697 We use the .check field of the first entry of the node temporarily to
1698 make compression both faster and easier by keeping track of how many non
1699 zero fields are in the node.
1700
1701 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1702 transition.
1703
1704 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1705 number representing the first entry of the node, and state as a
1706 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1707 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1708 are 2 entrys per node. eg:
1709
1710 A B A B
1711 1. 2 4 1. 3 7
1712 2. 0 3 3. 0 5
1713 3. 0 0 5. 0 0
1714 4. 0 0 7. 0 0
1715
1716 The table is internally in the right hand, idx form. However as we also
1717 have to deal with the states array which is indexed by nodenum we have to
1718 use TRIE_NODENUM() to convert.
1719
1720 */
1e2e3d02
YO
1721 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1722 "%*sCompiling trie using table compiler\n",
1723 (int)depth * 2 + 2, ""));
3dab1dad 1724
c944940b
JH
1725 trie->trans = (reg_trie_trans *)
1726 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1727 * trie->uniquecharcount + 1,
1728 sizeof(reg_trie_trans) );
1729 trie->states = (reg_trie_state *)
1730 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1731 sizeof(reg_trie_state) );
a3621e74
YO
1732 next_alloc = trie->uniquecharcount + 1;
1733
3dab1dad 1734
a3621e74
YO
1735 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1736
c445ea15 1737 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1738 const U8 *uc = (U8*)STRING( noper );
1739 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1740
1741 U32 state = 1; /* required init */
1742
1743 U16 charid = 0; /* sanity init */
1744 U32 accept_state = 0; /* sanity init */
1745 U8 *scan = (U8*)NULL; /* sanity init */
1746
1747 STRLEN foldlen = 0; /* required init */
07be1b83 1748 U32 wordlen = 0; /* required init */
a3621e74
YO
1749 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1750
3dab1dad 1751 if ( OP(noper) != NOTHING ) {
786e8c11 1752 for ( ; uc < e ; uc += len ) {
a3621e74 1753
786e8c11 1754 TRIE_READ_CHAR;
a3621e74 1755
786e8c11
YO
1756 if ( uvc < 256 ) {
1757 charid = trie->charmap[ uvc ];
1758 } else {
55eed653 1759 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1760 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1761 }
786e8c11
YO
1762 if ( charid ) {
1763 charid--;
1764 if ( !trie->trans[ state + charid ].next ) {
1765 trie->trans[ state + charid ].next = next_alloc;
1766 trie->trans[ state ].check++;
1767 next_alloc += trie->uniquecharcount;
1768 }
1769 state = trie->trans[ state + charid ].next;
1770 } else {
1771 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1772 }
1773 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1774 }
a3621e74 1775 }
3dab1dad
YO
1776 accept_state = TRIE_NODENUM( state );
1777 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1778
1779 } /* end second pass */
1780
3dab1dad 1781 /* and now dump it out before we compress it */
2b8b4781
NC
1782 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1783 revcharmap,
1784 next_alloc, depth+1));
a3621e74 1785
a3621e74
YO
1786 {
1787 /*
1788 * Inplace compress the table.*
1789
1790 For sparse data sets the table constructed by the trie algorithm will
1791 be mostly 0/FAIL transitions or to put it another way mostly empty.
1792 (Note that leaf nodes will not contain any transitions.)
1793
1794 This algorithm compresses the tables by eliminating most such
1795 transitions, at the cost of a modest bit of extra work during lookup:
1796
1797 - Each states[] entry contains a .base field which indicates the
1798 index in the state[] array wheres its transition data is stored.
1799
1800 - If .base is 0 there are no valid transitions from that node.
1801
1802 - If .base is nonzero then charid is added to it to find an entry in
1803 the trans array.
1804
1805 -If trans[states[state].base+charid].check!=state then the
1806 transition is taken to be a 0/Fail transition. Thus if there are fail
1807 transitions at the front of the node then the .base offset will point
1808 somewhere inside the previous nodes data (or maybe even into a node
1809 even earlier), but the .check field determines if the transition is
1810 valid.
1811
786e8c11 1812 XXX - wrong maybe?
a3621e74
YO
1813 The following process inplace converts the table to the compressed
1814 table: We first do not compress the root node 1,and mark its all its
1815 .check pointers as 1 and set its .base pointer as 1 as well. This
1816 allows to do a DFA construction from the compressed table later, and
1817 ensures that any .base pointers we calculate later are greater than
1818 0.
1819
1820 - We set 'pos' to indicate the first entry of the second node.
1821
1822 - We then iterate over the columns of the node, finding the first and
1823 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1824 and set the .check pointers accordingly, and advance pos
1825 appropriately and repreat for the next node. Note that when we copy
1826 the next pointers we have to convert them from the original
1827 NODEIDX form to NODENUM form as the former is not valid post
1828 compression.
1829
1830 - If a node has no transitions used we mark its base as 0 and do not
1831 advance the pos pointer.
1832
1833 - If a node only has one transition we use a second pointer into the
1834 structure to fill in allocated fail transitions from other states.
1835 This pointer is independent of the main pointer and scans forward
1836 looking for null transitions that are allocated to a state. When it
1837 finds one it writes the single transition into the "hole". If the
786e8c11 1838 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1839
1840 - Once compressed we can Renew/realloc the structures to release the
1841 excess space.
1842
1843 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1844 specifically Fig 3.47 and the associated pseudocode.
1845
1846 demq
1847 */
a3b680e6 1848 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1849 U32 state, charid;
a3621e74 1850 U32 pos = 0, zp=0;
1e2e3d02 1851 trie->statecount = laststate;
a3621e74
YO
1852
1853 for ( state = 1 ; state < laststate ; state++ ) {
1854 U8 flag = 0;
a28509cc
AL
1855 const U32 stateidx = TRIE_NODEIDX( state );
1856 const U32 o_used = trie->trans[ stateidx ].check;
1857 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1858 trie->trans[ stateidx ].check = 0;
1859
1860 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1861 if ( flag || trie->trans[ stateidx + charid ].next ) {
1862 if ( trie->trans[ stateidx + charid ].next ) {
1863 if (o_used == 1) {
1864 for ( ; zp < pos ; zp++ ) {
1865 if ( ! trie->trans[ zp ].next ) {
1866 break;
1867 }
1868 }
1869 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1870 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1871 trie->trans[ zp ].check = state;
1872 if ( ++zp > pos ) pos = zp;
1873 break;
1874 }
1875 used--;
1876 }
1877 if ( !flag ) {
1878 flag = 1;
1879 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1880 }
1881 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1882 trie->trans[ pos ].check = state;
1883 pos++;
1884 }
1885 }
1886 }
cc601c31 1887 trie->lasttrans = pos + 1;
c944940b
JH
1888 trie->states = (reg_trie_state *)
1889 PerlMemShared_realloc( trie->states, laststate
1890 * sizeof(reg_trie_state) );
a3621e74 1891 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1892 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1893 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1894 (int)depth * 2 + 2,"",
1895 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1896 (IV)next_alloc,
1897 (IV)pos,
a3621e74
YO
1898 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1899 );
1900
1901 } /* end table compress */
1902 }
1e2e3d02
YO
1903 DEBUG_TRIE_COMPILE_MORE_r(
1904 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1905 (int)depth * 2 + 2, "",
1906 (UV)trie->statecount,
1907 (UV)trie->lasttrans)
1908 );
cc601c31 1909 /* resize the trans array to remove unused space */
c944940b
JH
1910 trie->trans = (reg_trie_trans *)
1911 PerlMemShared_realloc( trie->trans, trie->lasttrans
1912 * sizeof(reg_trie_trans) );
a3621e74 1913
3dab1dad 1914 /* and now dump out the compressed format */
2b8b4781 1915 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1916
3dab1dad 1917 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1918 U8 nodetype =(U8)(flags & 0xFF);
1919 char *str=NULL;
786e8c11 1920
07be1b83 1921#ifdef DEBUGGING
e62cc96a 1922 regnode *optimize = NULL;
7122b237
YO
1923#ifdef RE_TRACK_PATTERN_OFFSETS
1924
b57a0404
JH
1925 U32 mjd_offset = 0;
1926 U32 mjd_nodelen = 0;
7122b237
YO
1927#endif /* RE_TRACK_PATTERN_OFFSETS */
1928#endif /* DEBUGGING */
a3621e74 1929 /*
3dab1dad
YO
1930 This means we convert either the first branch or the first Exact,
1931 depending on whether the thing following (in 'last') is a branch
1932 or not and whther first is the startbranch (ie is it a sub part of
1933 the alternation or is it the whole thing.)
1934 Assuming its a sub part we conver the EXACT otherwise we convert
1935 the whole branch sequence, including the first.
a3621e74 1936 */
3dab1dad 1937 /* Find the node we are going to overwrite */
7f69552c 1938 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1939 /* branch sub-chain */
3dab1dad 1940 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1941#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1942 DEBUG_r({
1943 mjd_offset= Node_Offset((convert));
1944 mjd_nodelen= Node_Length((convert));
1945 });
7122b237 1946#endif
7f69552c 1947 /* whole branch chain */
7122b237
YO
1948 }
1949#ifdef RE_TRACK_PATTERN_OFFSETS
1950 else {
7f69552c
YO
1951 DEBUG_r({
1952 const regnode *nop = NEXTOPER( convert );
1953 mjd_offset= Node_Offset((nop));
1954 mjd_nodelen= Node_Length((nop));
1955 });
07be1b83
YO
1956 }
1957 DEBUG_OPTIMISE_r(
1958 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1959 (int)depth * 2 + 2, "",
786e8c11 1960 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1961 );
7122b237 1962#endif
3dab1dad
YO
1963 /* But first we check to see if there is a common prefix we can
1964 split out as an EXACT and put in front of the TRIE node. */
1965 trie->startstate= 1;
55eed653 1966 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1967 U32 state;
1e2e3d02 1968 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1969 U32 ofs = 0;
8e11feef
RGS
1970 I32 idx = -1;
1971 U32 count = 0;
1972 const U32 base = trie->states[ state ].trans.base;
a3621e74 1973
3dab1dad 1974 if ( trie->states[state].wordnum )
8e11feef 1975 count = 1;
a3621e74 1976
8e11feef 1977 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1978 if ( ( base + ofs >= trie->uniquecharcount ) &&
1979 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1980 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1981 {
3dab1dad 1982 if ( ++count > 1 ) {
2b8b4781 1983 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1984 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1985 if ( state == 1 ) break;
3dab1dad
YO
1986 if ( count == 2 ) {
1987 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1988 DEBUG_OPTIMISE_r(
8e11feef
RGS
1989 PerlIO_printf(Perl_debug_log,
1990 "%*sNew Start State=%"UVuf" Class: [",
1991 (int)depth * 2 + 2, "",
786e8c11 1992 (UV)state));
be8e71aa 1993 if (idx >= 0) {
2b8b4781 1994 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1995 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1996
3dab1dad 1997 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1998 if ( folder )
1999 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2000 DEBUG_OPTIMISE_r(
f1f66076 2001 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2002 );
8e11feef
RGS
2003 }
2004 }
2005 TRIE_BITMAP_SET(trie,*ch);
2006 if ( folder )
2007 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2008 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2009 }
2010 idx = ofs;
2011 }
3dab1dad
YO
2012 }
2013 if ( count == 1 ) {
2b8b4781 2014 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2015 STRLEN len;
2016 char *ch = SvPV( *tmp, len );
de734bd5
A
2017 DEBUG_OPTIMISE_r({
2018 SV *sv=sv_newmortal();
8e11feef
RGS
2019 PerlIO_printf( Perl_debug_log,
2020 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2021 (int)depth * 2 + 2, "",
de734bd5
A
2022 (UV)state, (UV)idx,
2023 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2024 PL_colors[0], PL_colors[1],
2025 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2026 PERL_PV_ESCAPE_FIRSTCHAR
2027 )
2028 );
2029 });
3dab1dad
YO
2030 if ( state==1 ) {
2031 OP( convert ) = nodetype;
2032 str=STRING(convert);
2033 STR_LEN(convert)=0;
2034 }
c490c714
YO
2035 STR_LEN(convert) += len;
2036 while (len--)
de734bd5 2037 *str++ = *ch++;
8e11feef 2038 } else {
f9049ba1 2039#ifdef DEBUGGING
8e11feef
RGS
2040 if (state>1)
2041 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2042#endif
8e11feef
RGS
2043 break;
2044 }
2045 }
3dab1dad 2046 if (str) {
8e11feef 2047 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2048 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2049 trie->startstate = state;
07be1b83
YO
2050 trie->minlen -= (state - 1);
2051 trie->maxlen -= (state - 1);
33809eae
JH
2052#ifdef DEBUGGING
2053 /* At least the UNICOS C compiler choked on this
2054 * being argument to DEBUG_r(), so let's just have
2055 * it right here. */
2056 if (
2057#ifdef PERL_EXT_RE_BUILD
2058 1
2059#else
2060 DEBUG_r_TEST
2061#endif
2062 ) {
2063 regnode *fix = convert;
2064 U32 word = trie->wordcount;
2065 mjd_nodelen++;
2066 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2067 while( ++fix < n ) {
2068 Set_Node_Offset_Length(fix, 0, 0);
2069 }
2070 while (word--) {
2071 SV ** const tmp = av_fetch( trie_words, word, 0 );
2072 if (tmp) {
2073 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2074 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2075 else
2076 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2077 }
2078 }
2079 }
2080#endif
8e11feef
RGS
2081 if (trie->maxlen) {
2082 convert = n;
2083 } else {
3dab1dad 2084 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2085 DEBUG_r(optimize= n);
3dab1dad
YO
2086 }
2087 }
2088 }
a5ca303d
YO
2089 if (!jumper)
2090 jumper = last;
3dab1dad 2091 if ( trie->maxlen ) {
8e11feef
RGS
2092 NEXT_OFF( convert ) = (U16)(tail - convert);
2093 ARG_SET( convert, data_slot );
786e8c11
YO
2094 /* Store the offset to the first unabsorbed branch in
2095 jump[0], which is otherwise unused by the jump logic.
2096 We use this when dumping a trie and during optimisation. */
2097 if (trie->jump)
7f69552c 2098 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2099
786e8c11
YO
2100 /* XXXX */
2101 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2102 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2103 {
2104 OP( convert ) = TRIEC;
2105 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2106 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2107 trie->bitmap= NULL;
2108 } else
2109 OP( convert ) = TRIE;
a3621e74 2110
3dab1dad
YO
2111 /* store the type in the flags */
2112 convert->flags = nodetype;
a5ca303d
YO
2113 DEBUG_r({
2114 optimize = convert
2115 + NODE_STEP_REGNODE
2116 + regarglen[ OP( convert ) ];
2117 });
2118 /* XXX We really should free up the resource in trie now,
2119 as we won't use them - (which resources?) dmq */
3dab1dad 2120 }
a3621e74 2121 /* needed for dumping*/
e62cc96a 2122 DEBUG_r(if (optimize) {
07be1b83 2123 regnode *opt = convert;
bcdf7404 2124
e62cc96a 2125 while ( ++opt < optimize) {
07be1b83
YO
2126 Set_Node_Offset_Length(opt,0,0);
2127 }
786e8c11
YO
2128 /*
2129 Try to clean up some of the debris left after the
2130 optimisation.
a3621e74 2131 */
786e8c11 2132 while( optimize < jumper ) {
07be1b83 2133 mjd_nodelen += Node_Length((optimize));
a3621e74 2134 OP( optimize ) = OPTIMIZED;
07be1b83 2135 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2136 optimize++;
2137 }
07be1b83 2138 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2139 });
2140 } /* end node insert */
55eed653 2141 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2142#ifdef DEBUGGING
2143 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2144 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2145#else
2146 SvREFCNT_dec(revcharmap);
07be1b83 2147#endif
786e8c11
YO
2148 return trie->jump
2149 ? MADE_JUMP_TRIE
2150 : trie->startstate>1
2151 ? MADE_EXACT_TRIE
2152 : MADE_TRIE;
2153}
2154
2155STATIC void
2156S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2157{
2158/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2159
2160 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2161 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2162 ISBN 0-201-10088-6
2163
2164 We find the fail state for each state in the trie, this state is the longest proper
2165 suffix of the current states 'word' that is also a proper prefix of another word in our
2166 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2167 the DFA not to have to restart after its tried and failed a word at a given point, it
2168 simply continues as though it had been matching the other word in the first place.
2169 Consider
2170 'abcdgu'=~/abcdefg|cdgu/
2171 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2172 fail, which would bring use to the state representing 'd' in the second word where we would
2173 try 'g' and succeed, prodceding to match 'cdgu'.
2174 */
2175 /* add a fail transition */
3251b653
NC
2176 const U32 trie_offset = ARG(source);
2177 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2178 U32 *q;
2179 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2180 const U32 numstates = trie->statecount;
786e8c11
YO
2181 const U32 ubound = trie->lasttrans + ucharcount;
2182 U32 q_read = 0;
2183 U32 q_write = 0;
2184 U32 charid;
2185 U32 base = trie->states[ 1 ].trans.base;
2186 U32 *fail;
2187 reg_ac_data *aho;
2188 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2189 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2190
2191 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2192#ifndef DEBUGGING
2193 PERL_UNUSED_ARG(depth);
2194#endif
2195
2196
2197 ARG_SET( stclass, data_slot );
c944940b 2198 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2199 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2200 aho->trie=trie_offset;
446bd890
NC
2201 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2202 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2203 Newxz( q, numstates, U32);
c944940b 2204 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2205 aho->refcount = 1;
2206 fail = aho->fail;
2207 /* initialize fail[0..1] to be 1 so that we always have
2208 a valid final fail state */
2209 fail[ 0 ] = fail[ 1 ] = 1;
2210
2211 for ( charid = 0; charid < ucharcount ; charid++ ) {
2212 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2213 if ( newstate ) {
2214 q[ q_write ] = newstate;
2215 /* set to point at the root */
2216 fail[ q[ q_write++ ] ]=1;
2217 }
2218 }
2219 while ( q_read < q_write) {
2220 const U32 cur = q[ q_read++ % numstates ];
2221 base = trie->states[ cur ].trans.base;
2222
2223 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2224 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2225 if (ch_state) {
2226 U32 fail_state = cur;
2227 U32 fail_base;
2228 do {
2229 fail_state = fail[ fail_state ];
2230 fail_base = aho->states[ fail_state ].trans.base;
2231 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2232
2233 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2234 fail[ ch_state ] = fail_state;
2235 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2236 {
2237 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2238 }
2239 q[ q_write++ % numstates] = ch_state;
2240 }
2241 }
2242 }
2243 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2244 when we fail in state 1, this allows us to use the
2245 charclass scan to find a valid start char. This is based on the principle
2246 that theres a good chance the string being searched contains lots of stuff
2247 that cant be a start char.
2248 */
2249 fail[ 0 ] = fail[ 1 ] = 0;
2250 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2251 PerlIO_printf(Perl_debug_log,
2252 "%*sStclass Failtable (%"UVuf" states): 0",
2253 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2254 );
786e8c11
YO
2255 for( q_read=1; q_read<numstates; q_read++ ) {
2256 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2257 }
2258 PerlIO_printf(Perl_debug_log, "\n");
2259 });
2260 Safefree(q);
2261 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2262}
2263
786e8c11 2264
a3621e74 2265/*
5d1c421c
JH
2266 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2267 * These need to be revisited when a newer toolchain becomes available.
2268 */
2269#if defined(__sparc64__) && defined(__GNUC__)
2270# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2271# undef SPARC64_GCC_WORKAROUND
2272# define SPARC64_GCC_WORKAROUND 1
2273# endif
2274#endif
2275
07be1b83 2276#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2277 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2278 SV * const mysv=sv_newmortal(); \
2279 regnode *Next = regnext(scan); \
2280 regprop(RExC_rx, mysv, scan); \
7f69552c 2281 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2282 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2283 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2284 }});
07be1b83 2285
1de06328
YO
2286
2287
2288
2289
07be1b83
YO
2290#define JOIN_EXACT(scan,min,flags) \
2291 if (PL_regkind[OP(scan)] == EXACT) \
2292 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2293
be8e71aa 2294STATIC U32
07be1b83
YO
2295S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2296 /* Merge several consecutive EXACTish nodes into one. */
2297 regnode *n = regnext(scan);
2298 U32 stringok = 1;
2299 regnode *next = scan + NODE_SZ_STR(scan);
2300 U32 merged = 0;
2301 U32 stopnow = 0;
2302#ifdef DEBUGGING
2303 regnode *stop = scan;
72f13be8 2304 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2305#else
d47053eb
RGS
2306 PERL_UNUSED_ARG(depth);
2307#endif
7918f24d
NC
2308
2309 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2310#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2311 PERL_UNUSED_ARG(flags);
2312 PERL_UNUSED_ARG(val);
07be1b83 2313#endif
07be1b83
YO
2314 DEBUG_PEEP("join",scan,depth);
2315
2316 /* Skip NOTHING, merge EXACT*. */
2317 while (n &&
2318 ( PL_regkind[OP(n)] == NOTHING ||
2319 (stringok && (OP(n) == OP(scan))))
2320 && NEXT_OFF(n)
2321 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2322
2323 if (OP(n) == TAIL || n > next)
2324 stringok = 0;
2325 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2326 DEBUG_PEEP("skip:",n,depth);
2327 NEXT_OFF(scan) += NEXT_OFF(n);
2328 next = n + NODE_STEP_REGNODE;
2329#ifdef DEBUGGING
2330 if (stringok)
2331 stop = n;
2332#endif
2333 n = regnext(n);
2334 }
2335 else if (stringok) {
786e8c11 2336 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2337 regnode * const nnext = regnext(n);
2338
2339 DEBUG_PEEP("merg",n,depth);
2340
2341 merged++;
2342 if (oldl + STR_LEN(n) > U8_MAX)
2343 break;
2344 NEXT_OFF(scan) += NEXT_OFF(n);
2345 STR_LEN(scan) += STR_LEN(n);
2346 next = n + NODE_SZ_STR(n);
2347 /* Now we can overwrite *n : */
2348 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2349#ifdef DEBUGGING
2350 stop = next - 1;
2351#endif
2352 n = nnext;
2353 if (stopnow) break;
2354 }
2355
d47053eb
RGS
2356#ifdef EXPERIMENTAL_INPLACESCAN
2357 if (flags && !NEXT_OFF(n)) {
2358 DEBUG_PEEP("atch", val, depth);
2359 if (reg_off_by_arg[OP(n)]) {
2360 ARG_SET(n, val - n);
2361 }
2362 else {
2363 NEXT_OFF(n) = val - n;
2364 }
2365 stopnow = 1;
2366 }
07be1b83
YO
2367#endif
2368 }
2369
2370 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2371 /*
2372 Two problematic code points in Unicode casefolding of EXACT nodes:
2373
2374 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2375 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2376
2377 which casefold to
2378
2379 Unicode UTF-8
2380
2381 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2382 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2383
2384 This means that in case-insensitive matching (or "loose matching",
2385 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2386 length of the above casefolded versions) can match a target string
2387 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2388 This would rather mess up the minimum length computation.
2389
2390 What we'll do is to look for the tail four bytes, and then peek
2391 at the preceding two bytes to see whether we need to decrease
2392 the minimum length by four (six minus two).
2393
2394 Thanks to the design of UTF-8, there cannot be false matches:
2395 A sequence of valid UTF-8 bytes cannot be a subsequence of
2396 another valid sequence of UTF-8 bytes.
2397
2398 */
2399 char * const s0 = STRING(scan), *s, *t;
2400 char * const s1 = s0 + STR_LEN(scan) - 1;
2401 char * const s2 = s1 - 4;
e294cc5d
JH
2402#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2403 const char t0[] = "\xaf\x49\xaf\x42";
2404#else
07be1b83 2405 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2406#endif
07be1b83
YO
2407 const char * const t1 = t0 + 3;
2408
2409 for (s = s0 + 2;
2410 s < s2 && (t = ninstr(s, s1, t0, t1));
2411 s = t + 4) {
e294cc5d
JH
2412#ifdef EBCDIC
2413 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2414 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2415#else
07be1b83
YO
2416 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2417 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2418#endif
07be1b83
YO
2419 *min -= 4;
2420 }
2421 }
2422
2423#ifdef DEBUGGING
2424 /* Allow dumping */
2425 n = scan + NODE_SZ_STR(scan);
2426 while (n <= stop) {
2427 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2428 OP(n) = OPTIMIZED;
2429 NEXT_OFF(n) = 0;
2430 }
2431 n++;
2432 }
2433#endif
2434 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2435 return stopnow;
2436}
2437
653099ff
GS
2438/* REx optimizer. Converts nodes into quickier variants "in place".
2439 Finds fixed substrings. */
2440
a0288114 2441/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2442 to the position after last scanned or to NULL. */
2443
40d049e4
YO
2444#define INIT_AND_WITHP \
2445 assert(!and_withp); \
2446 Newx(and_withp,1,struct regnode_charclass_class); \
2447 SAVEFREEPV(and_withp)
07be1b83 2448
b515a41d
YO
2449/* this is a chain of data about sub patterns we are processing that
2450 need to be handled seperately/specially in study_chunk. Its so
2451 we can simulate recursion without losing state. */
2452struct scan_frame;
2453typedef struct scan_frame {
2454 regnode *last; /* last node to process in this frame */
2455 regnode *next; /* next node to process when last is reached */
2456 struct scan_frame *prev; /*previous frame*/
2457 I32 stop; /* what stopparen do we use */
2458} scan_frame;
2459
304ee84b
YO
2460
2461#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2462
e1d1eefb
YO
2463#define CASE_SYNST_FNC(nAmE) \
2464case nAmE: \
2465 if (flags & SCF_DO_STCLASS_AND) { \
2466 for (value = 0; value < 256; value++) \
2467 if (!is_ ## nAmE ## _cp(value)) \
2468 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2469 } \
2470 else { \
2471 for (value = 0; value < 256; value++) \
2472 if (is_ ## nAmE ## _cp(value)) \
2473 ANYOF_BITMAP_SET(data->start_class, value); \
2474 } \
2475 break; \
2476case N ## nAmE: \
2477 if (flags & SCF_DO_STCLASS_AND) { \
2478 for (value = 0; value < 256; value++) \
2479 if (is_ ## nAmE ## _cp(value)) \
2480 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2481 } \
2482 else { \
2483 for (value = 0; value < 256; value++) \
2484 if (!is_ ## nAmE ## _cp(value)) \
2485 ANYOF_BITMAP_SET(data->start_class, value); \
2486 } \
2487 break
2488
2489
2490
76e3520e 2491STATIC I32
40d049e4 2492S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2493 I32 *minlenp, I32 *deltap,
40d049e4
YO
2494 regnode *last,
2495 scan_data_t *data,
2496 I32 stopparen,
2497 U8* recursed,
2498 struct regnode_charclass_class *and_withp,
2499 U32 flags, U32 depth)
c277df42
IZ
2500 /* scanp: Start here (read-write). */
2501 /* deltap: Write maxlen-minlen here. */
2502 /* last: Stop before this one. */
40d049e4
YO
2503 /* data: string data about the pattern */
2504 /* stopparen: treat close N as END */
2505 /* recursed: which subroutines have we recursed into */
2506 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2507{
97aff369 2508 dVAR;
c277df42
IZ
2509 I32 min = 0, pars = 0, code;
2510 regnode *scan = *scanp, *next;
2511 I32 delta = 0;
2512 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2513 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2514 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2515 scan_data_t data_fake;
a3621e74 2516 SV *re_trie_maxbuff = NULL;
786e8c11 2517 regnode *first_non_open = scan;
e2e6a0f1 2518 I32 stopmin = I32_MAX;
8aa23a47 2519 scan_frame *frame = NULL;
a3621e74 2520 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2521
7918f24d
NC
2522 PERL_ARGS_ASSERT_STUDY_CHUNK;
2523
13a24bad 2524#ifdef DEBUGGING
40d049e4 2525 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2526#endif
40d049e4 2527
786e8c11 2528 if ( depth == 0 ) {
40d049e4 2529 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2530 first_non_open=regnext(first_non_open);
2531 }
2532
b81d288d 2533
8aa23a47
YO
2534 fake_study_recurse:
2535 while ( scan && OP(scan) != END && scan < last ){
2536 /* Peephole optimizer: */
304ee84b 2537 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2538 DEBUG_PEEP("Peep",scan,depth);
2539 JOIN_EXACT(scan,&min,0);
2540
2541 /* Follow the next-chain of the current node and optimize
2542 away all the NOTHINGs from it. */
2543 if (OP(scan) != CURLYX) {
2544 const int max = (reg_off_by_arg[OP(scan)]
2545 ? I32_MAX
2546 /* I32 may be smaller than U16 on CRAYs! */
2547 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2548 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2549 int noff;
2550 regnode *n = scan;
2551
2552 /* Skip NOTHING and LONGJMP. */
2553 while ((n = regnext(n))
2554 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2555 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2556 && off + noff < max)
2557 off += noff;
2558 if (reg_off_by_arg[OP(scan)])
2559 ARG(scan) = off;
2560 else
2561 NEXT_OFF(scan) = off;
2562 }
a3621e74 2563
c277df42 2564
8aa23a47
YO
2565
2566 /* The principal pseudo-switch. Cannot be a switch, since we
2567 look into several different things. */
2568 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2569 || OP(scan) == IFTHEN) {
2570 next = regnext(scan);
2571 code = OP(scan);
2572 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2573
2574 if (OP(next) == code || code == IFTHEN) {
2575 /* NOTE - There is similar code to this block below for handling
2576 TRIE nodes on a re-study. If you change stuff here check there
2577 too. */
2578 I32 max1 = 0, min1 = I32_MAX, num = 0;
2579 struct regnode_charclass_class accum;
2580 regnode * const startbranch=scan;
2581
2582 if (flags & SCF_DO_SUBSTR)
304ee84b 2583 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2584 if (flags & SCF_DO_STCLASS)
2585 cl_init_zero(pRExC_state, &accum);
2586
2587 while (OP(scan) == code) {
2588 I32 deltanext, minnext, f = 0, fake;
2589 struct regnode_charclass_class this_class;
2590
2591 num++;
2592 data_fake.flags = 0;
2593 if (data) {
2594 data_fake.whilem_c = data->whilem_c;
2595 data_fake.last_closep = data->last_closep;
2596 }
2597 else
2598 data_fake.last_closep = &fake;
58e23c8d
YO
2599
2600 data_fake.pos_delta = delta;
8aa23a47
YO
2601 next = regnext(scan);
2602 scan = NEXTOPER(scan);
2603 if (code != BRANCH)
c277df42 2604 scan = NEXTOPER(scan);
8aa23a47
YO
2605 if (flags & SCF_DO_STCLASS) {
2606 cl_init(pRExC_state, &this_class);
2607 data_fake.start_class = &this_class;
2608 f = SCF_DO_STCLASS_AND;
58e23c8d 2609 }
8aa23a47
YO
2610 if (flags & SCF_WHILEM_VISITED_POS)
2611 f |= SCF_WHILEM_VISITED_POS;
2612
2613 /* we suppose the run is continuous, last=next...*/
2614 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2615 next, &data_fake,
2616 stopparen, recursed, NULL, f,depth+1);
2617 if (min1 > minnext)
2618 min1 = minnext;
2619 if (max1 < minnext + deltanext)
2620 max1 = minnext + deltanext;
2621 if (deltanext == I32_MAX)
2622 is_inf = is_inf_internal = 1;
2623 scan = next;
2624 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2625 pars++;
2626 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2627 if ( stopmin > minnext)
2628 stopmin = min + min1;
2629 flags &= ~SCF_DO_SUBSTR;
2630 if (data)
2631 data->flags |= SCF_SEEN_ACCEPT;
2632 }
2633 if (data) {
2634 if (data_fake.flags & SF_HAS_EVAL)
2635 data->flags |= SF_HAS_EVAL;
2636 data->whilem_c = data_fake.whilem_c;
3dab1dad 2637 }
8aa23a47
YO
2638 if (flags & SCF_DO_STCLASS)
2639 cl_or(pRExC_state, &accum, &this_class);
2640 }
2641 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2642 min1 = 0;
2643 if (flags & SCF_DO_SUBSTR) {
2644 data->pos_min += min1;
2645 data->pos_delta += max1 - min1;
2646 if (max1 != min1 || is_inf)
2647 data->longest = &(data->longest_float);
2648 }
2649 min += min1;
2650 delta += max1 - min1;
2651 if (flags & SCF_DO_STCLASS_OR) {
2652 cl_or(pRExC_state, data->start_class, &accum);
2653 if (min1) {
2654 cl_and(data->start_class, and_withp);
2655 flags &= ~SCF_DO_STCLASS;
653099ff 2656 }
8aa23a47
YO
2657 }
2658 else if (flags & SCF_DO_STCLASS_AND) {
2659 if (min1) {
2660 cl_and(data->start_class, &accum);
2661 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2662 }
8aa23a47
YO
2663 else {
2664 /* Switch to OR mode: cache the old value of
2665 * data->start_class */
2666 INIT_AND_WITHP;
2667 StructCopy(data->start_class, and_withp,
2668 struct regnode_charclass_class);
2669 flags &= ~SCF_DO_STCLASS_AND;
2670 StructCopy(&accum, data->start_class,
2671 struct regnode_charclass_class);
2672 flags |= SCF_DO_STCLASS_OR;
2673 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2674 }
8aa23a47 2675 }
a3621e74 2676
8aa23a47
YO
2677 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2678 /* demq.
a3621e74 2679
8aa23a47
YO
2680 Assuming this was/is a branch we are dealing with: 'scan' now
2681 points at the item that follows the branch sequence, whatever
2682 it is. We now start at the beginning of the sequence and look
2683 for subsequences of
a3621e74 2684
8aa23a47
YO
2685 BRANCH->EXACT=>x1
2686 BRANCH->EXACT=>x2
2687 tail
a3621e74 2688
8aa23a47 2689 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2690
8aa23a47
YO
2691 If we can find such a subseqence we need to turn the first
2692 element into a trie and then add the subsequent branch exact
2693 strings to the trie.
a3621e74 2694
8aa23a47 2695 We have two cases
a3621e74 2696
8aa23a47 2697 1. patterns where the whole set of branch can be converted.
a3621e74 2698
8aa23a47 2699 2. patterns where only a subset can be converted.
a3621e74 2700
8aa23a47
YO
2701 In case 1 we can replace the whole set with a single regop
2702 for the trie. In case 2 we need to keep the start and end
2703 branchs so
a3621e74 2704
8aa23a47
YO
2705 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2706 becomes BRANCH TRIE; BRANCH X;
786e8c11 2707
8aa23a47
YO
2708 There is an additional case, that being where there is a
2709 common prefix, which gets split out into an EXACT like node
2710 preceding the TRIE node.
a3621e74 2711
8aa23a47
YO
2712 If x(1..n)==tail then we can do a simple trie, if not we make
2713 a "jump" trie, such that when we match the appropriate word
2714 we "jump" to the appopriate tail node. Essentailly we turn
2715 a nested if into a case structure of sorts.
b515a41d 2716
8aa23a47
YO
2717 */
2718
2719 int made=0;
2720 if (!re_trie_maxbuff) {
2721 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2722 if (!SvIOK(re_trie_maxbuff))
2723 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2724 }
2725 if ( SvIV(re_trie_maxbuff)>=0 ) {
2726 regnode *cur;
2727 regnode *first = (regnode *)NULL;
2728 regnode *last = (regnode *)NULL;
2729 regnode *tail = scan;
2730 U8 optype = 0;
2731 U32 count=0;
a3621e74
YO
2732
2733#ifdef DEBUGGING
8aa23a47 2734 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2735#endif
8aa23a47
YO
2736 /* var tail is used because there may be a TAIL
2737 regop in the way. Ie, the exacts will point to the
2738 thing following the TAIL, but the last branch will
2739 point at the TAIL. So we advance tail. If we
2740 have nested (?:) we may have to move through several
2741 tails.
2742 */
2743
2744 while ( OP( tail ) == TAIL ) {
2745 /* this is the TAIL generated by (?:) */
2746 tail = regnext( tail );
2747 }
a3621e74 2748
8aa23a47
YO
2749
2750 DEBUG_OPTIMISE_r({
2751 regprop(RExC_rx, mysv, tail );
2752 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2753 (int)depth * 2 + 2, "",
2754 "Looking for TRIE'able sequences. Tail node is: ",
2755 SvPV_nolen_const( mysv )
2756 );
2757 });
2758
2759 /*
2760
2761 step through the branches, cur represents each
2762 branch, noper is the first thing to be matched
2763 as part of that branch and noper_next is the
2764 regnext() of that node. if noper is an EXACT
2765 and noper_next is the same as scan (our current
2766 position in the regex) then the EXACT branch is
2767 a possible optimization target. Once we have
2768 two or more consequetive such branches we can
2769 create a trie of the EXACT's contents and stich
2770 it in place. If the sequence represents all of
2771 the branches we eliminate the whole thing and
2772 replace it with a single TRIE. If it is a
2773 subsequence then we need to stitch it in. This
2774 means the first branch has to remain, and needs
2775 to be repointed at the item on the branch chain
2776 following the last branch optimized. This could
2777 be either a BRANCH, in which case the
2778 subsequence is internal, or it could be the
2779 item following the branch sequence in which
2780 case the subsequence is at the end.
2781
2782 */
2783
2784 /* dont use tail as the end marker for this traverse */
2785 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2786 regnode * const noper = NEXTOPER( cur );
b515a41d 2787#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2788 regnode * const noper_next = regnext( noper );
b515a41d
YO
2789#endif
2790
8aa23a47
YO
2791 DEBUG_OPTIMISE_r({
2792 regprop(RExC_rx, mysv, cur);
2793 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2794 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2795
2796 regprop(RExC_rx, mysv, noper);
2797 PerlIO_printf( Perl_debug_log, " -> %s",
2798 SvPV_nolen_const(mysv));
2799
2800 if ( noper_next ) {
2801 regprop(RExC_rx, mysv, noper_next );
2802 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2803 SvPV_nolen_const(mysv));
2804 }
2805 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2806 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2807 });
2808 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2809 : PL_regkind[ OP( noper ) ] == EXACT )
2810 || OP(noper) == NOTHING )
786e8c11 2811#ifdef NOJUMPTRIE
8aa23a47 2812 && noper_next == tail
786e8c11 2813#endif
8aa23a47
YO
2814 && count < U16_MAX)
2815 {
2816 count++;
2817 if ( !first || optype == NOTHING ) {
2818 if (!first) first = cur;
2819 optype = OP( noper );
2820 } else {
2821 last = cur;
2822 }
2823 } else {
a0a388a1
YO
2824/*
2825 Currently we assume that the trie can handle unicode and ascii
2826 matches fold cased matches. If this proves true then the following
2827 define will prevent tries in this situation.
2828
2829 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2830*/
2831#define TRIE_TYPE_IS_SAFE 1
2832 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2833 make_trie( pRExC_state,
2834 startbranch, first, cur, tail, count,
2835 optype, depth+1 );
2836 }
2837 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2838#ifdef NOJUMPTRIE
8aa23a47 2839 && noper_next == tail
786e8c11 2840#endif
8aa23a47
YO
2841 ){
2842 count = 1;
2843 first = cur;
2844 optype = OP( noper );
2845 } else {
2846 count = 0;
2847 first = NULL;
2848 optype = 0;
2849 }
2850 last = NULL;
2851 }
2852 }
2853 DEBUG_OPTIMISE_r({
2854 regprop(RExC_rx, mysv, cur);
2855 PerlIO_printf( Perl_debug_log,
2856 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2857 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2858
2859 });
a0a388a1
YO
2860
2861 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2862 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2863#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2864 if ( ((made == MADE_EXACT_TRIE &&
2865 startbranch == first)
2866 || ( first_non_open == first )) &&
2867 depth==0 ) {
2868 flags |= SCF_TRIE_RESTUDY;
2869 if ( startbranch == first
2870 && scan == tail )
2871 {
2872 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2873 }
2874 }
3dab1dad 2875#endif
8aa23a47
YO
2876 }
2877 }
2878
2879 } /* do trie */
2880
653099ff 2881 }
8aa23a47
YO
2882 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2883 scan = NEXTOPER(NEXTOPER(scan));
2884 } else /* single branch is optimized. */
2885 scan = NEXTOPER(scan);
2886 continue;
2887 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2888 scan_frame *newframe = NULL;
2889 I32 paren;
2890 regnode *start;
2891 regnode *end;
2892
2893 if (OP(scan) != SUSPEND) {
2894 /* set the pointer */
2895 if (OP(scan) == GOSUB) {
2896 paren = ARG(scan);
2897 RExC_recurse[ARG2L(scan)] = scan;
2898 start = RExC_open_parens[paren-1];
2899 end = RExC_close_parens[paren-1];
2900 } else {
2901 paren = 0;
f8fc2ecf 2902 start = RExC_rxi->program + 1;
8aa23a47
YO
2903 end = RExC_opend;
2904 }
2905 if (!recursed) {
2906 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2907 SAVEFREEPV(recursed);
2908 }
2909 if (!PAREN_TEST(recursed,paren+1)) {
2910 PAREN_SET(recursed,paren+1);
2911 Newx(newframe,1,scan_frame);
2912 } else {
2913 if (flags & SCF_DO_SUBSTR) {
304ee84b 2914 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2915 data->longest = &(data->longest_float);
2916 }
2917 is_inf = is_inf_internal = 1;
2918 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2919 cl_anything(pRExC_state, data->start_class);
2920 flags &= ~SCF_DO_STCLASS;
2921 }
2922 } else {
2923 Newx(newframe,1,scan_frame);
2924 paren = stopparen;
2925 start = scan+2;
2926 end = regnext(scan);
2927 }
2928 if (newframe) {
2929 assert(start);
2930 assert(end);
2931 SAVEFREEPV(newframe);
2932 newframe->next = regnext(scan);
2933 newframe->last = last;
2934 newframe->stop = stopparen;
2935 newframe->prev = frame;
2936
2937 frame = newframe;
2938 scan = start;
2939 stopparen = paren;
2940 last = end;
2941
2942 continue;
2943 }
2944 }
2945 else if (OP(scan) == EXACT) {
2946 I32 l = STR_LEN(scan);
2947 UV uc;
2948 if (UTF) {
2949 const U8 * const s = (U8*)STRING(scan);
2950 l = utf8_length(s, s + l);
2951 uc = utf8_to_uvchr(s, NULL);
2952 } else {
2953 uc = *((U8*)STRING(scan));
2954 }
2955 min += l;
2956 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2957 /* The code below prefers earlier match for fixed
2958 offset, later match for variable offset. */
2959 if (data->last_end == -1) { /* Update the start info. */
2960 data->last_start_min = data->pos_min;
2961 data->last_start_max = is_inf
2962 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2963 }
8aa23a47
YO
2964 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2965 if (UTF)
2966 SvUTF8_on(data->last_found);
2967 {
2968 SV * const sv = data->last_found;
2969 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2970 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2971 if (mg && mg->mg_len >= 0)
2972 mg->mg_len += utf8_length((U8*)STRING(scan),
2973 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2974 }
8aa23a47
YO
2975 data->last_end = data->pos_min + l;
2976 data->pos_min += l; /* As in the first entry. */
2977 data->flags &= ~SF_BEFORE_EOL;
2978 }
2979 if (flags & SCF_DO_STCLASS_AND) {
2980 /* Check whether it is compatible with what we know already! */
2981 int compat = 1;
2982
2983 if (uc >= 0x100 ||
2984 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2985 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2986 && (!(data->start_class->flags & ANYOF_FOLD)
2987 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2988 )
2989 compat = 0;
2990 ANYOF_CLASS_ZERO(data->start_class);
2991 ANYOF_BITMAP_ZERO(data->start_class);
2992 if (compat)
2993 ANYOF_BITMAP_SET(data->start_class, uc);
2994 data->start_class->flags &= ~ANYOF_EOS;
2995 if (uc < 0x100)
2996 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2997 }
2998 else if (flags & SCF_DO_STCLASS_OR) {
2999 /* false positive possible if the class is case-folded */
3000 if (uc < 0x100)
3001 ANYOF_BITMAP_SET(data->start_class, uc);
3002 else
3003 data->start_class->flags |= ANYOF_UNICODE_ALL;
3004 data->start_class->flags &= ~ANYOF_EOS;
3005 cl_and(data->start_class, and_withp);
3006 }
3007 flags &= ~SCF_DO_STCLASS;
3008 }
3009 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3010 I32 l = STR_LEN(scan);
3011 UV uc = *((U8*)STRING(scan));
3012
3013 /* Search for fixed substrings supports EXACT only. */
3014 if (flags & SCF_DO_SUBSTR) {
3015 assert(data);
304ee84b 3016 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3017 }
3018 if (UTF) {
3019 const U8 * const s = (U8 *)STRING(scan);
3020 l = utf8_length(s, s + l);
3021 uc = utf8_to_uvchr(s, NULL);
3022 }
3023 min += l;
3024 if (flags & SCF_DO_SUBSTR)
3025 data->pos_min += l;
3026 if (flags & SCF_DO_STCLASS_AND) {
3027 /* Check whether it is compatible with what we know already! */
3028 int compat = 1;
3029
3030 if (uc >= 0x100 ||
3031 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3032 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3033 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3034 compat = 0;
3035 ANYOF_CLASS_ZERO(data->start_class);
3036 ANYOF_BITMAP_ZERO(data->start_class);
3037 if (compat) {
3038 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3039 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3040 data->start_class->flags |= ANYOF_FOLD;
3041 if (OP(scan) == EXACTFL)
3042 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3043 }
8aa23a47
YO
3044 }
3045 else if (flags & SCF_DO_STCLASS_OR) {
3046 if (data->start_class->flags & ANYOF_FOLD) {
3047 /* false positive possible if the class is case-folded.
3048 Assume that the locale settings are the same... */
1aa99e6b
IH
3049 if (uc < 0x100)
3050 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3051 data->start_class->flags &= ~ANYOF_EOS;
3052 }
8aa23a47 3053 cl_and(data->start_class, and_withp);
653099ff 3054 }
8aa23a47
YO
3055 flags &= ~SCF_DO_STCLASS;
3056 }
3057 else if (strchr((const char*)PL_varies,OP(scan))) {
3058 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3059 I32 f = flags, pos_before = 0;
3060 regnode * const oscan = scan;
3061 struct regnode_charclass_class this_class;
3062 struct regnode_charclass_class *oclass = NULL;
3063 I32 next_is_eval = 0;
3064
3065 switch (PL_regkind[OP(scan)]) {
3066 case WHILEM: /* End of (?:...)* . */
3067 scan = NEXTOPER(scan);
3068 goto finish;
3069 case PLUS:
3070 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3071 next = NEXTOPER(scan);
3072 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3073 mincount = 1;
3074 maxcount = REG_INFTY;
3075 next = regnext(scan);
3076 scan = NEXTOPER(scan);
3077 goto do_curly;
3078 }
3079 }
3080 if (flags & SCF_DO_SUBSTR)
3081 data->pos_min++;
3082 min++;
3083 /* Fall through. */
3084 case STAR:
3085 if (flags & SCF_DO_STCLASS) {
3086 mincount = 0;
3087 maxcount = REG_INFTY;
3088 next = regnext(scan);
3089 scan = NEXTOPER(scan);
3090 goto do_curly;
3091 }
3092 is_inf = is_inf_internal = 1;
3093 scan = regnext(scan);
c277df42 3094 if (flags & SCF_DO_SUBSTR) {
304ee84b 3095 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3096 data->longest = &(data->longest_float);
c277df42 3097 }
8aa23a47
YO
3098 goto optimize_curly_tail;
3099 case CURLY:
3100 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3101 && (scan->flags == stopparen))
3102 {
3103 mincount = 1;
3104 maxcount = 1;
3105 } else {
3106 mincount = ARG1(scan);
3107 maxcount = ARG2(scan);
653099ff 3108 }
8aa23a47
YO
3109 next = regnext(scan);
3110 if (OP(scan) == CURLYX) {
3111 I32 lp = (data ? *(data->last_closep) : 0);
3112 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3113 }
8aa23a47
YO
3114 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3115 next_is_eval = (OP(scan) == EVAL);
3116 do_curly:
3117 if (flags & SCF_DO_SUBSTR) {
304ee84b 3118 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3119 pos_before = data->pos_min;
b45f050a 3120 }
8aa23a47
YO
3121 if (data) {
3122 fl = data->flags;
3123 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3124 if (is_inf)
3125 data->flags |= SF_IS_INF;
3126 }
3127 if (flags & SCF_DO_STCLASS) {
3128 cl_init(pRExC_state, &this_class);
3129 oclass = data->start_class;
3130 data->start_class = &this_class;
3131 f |= SCF_DO_STCLASS_AND;
3132 f &= ~SCF_DO_STCLASS_OR;
3133 }
3134 /* These are the cases when once a subexpression
3135 fails at a particular position, it cannot succeed
3136 even after backtracking at the enclosing scope.
3137
3138 XXXX what if minimal match and we are at the
3139 initial run of {n,m}? */
3140 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3141 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3142
8aa23a47
YO
3143 /* This will finish on WHILEM, setting scan, or on NULL: */
3144 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3145 last, data, stopparen, recursed, NULL,
3146 (mincount == 0
3147 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3148
8aa23a47
YO
3149 if (flags & SCF_DO_STCLASS)
3150 data->start_class = oclass;
3151 if (mincount == 0 || minnext == 0) {
3152 if (flags & SCF_DO_STCLASS_OR) {
3153 cl_or(pRExC_state, data->start_class, &this_class);
3154 }
3155 else if (flags & SCF_DO_STCLASS_AND) {
3156 /* Switch to OR mode: cache the old value of
3157 * data->start_class */
3158 INIT_AND_WITHP;
3159 StructCopy(data->start_class, and_withp,
3160 struct regnode_charclass_class);
3161 flags &= ~SCF_DO_STCLASS_AND;
3162 StructCopy(&this_class, data->start_class,
3163 struct regnode_charclass_class);
3164 flags |= SCF_DO_STCLASS_OR;
3165 data->start_class->flags |= ANYOF_EOS;
3166 }
3167 } else { /* Non-zero len */
3168 if (flags & SCF_DO_STCLASS_OR) {
3169 cl_or(pRExC_state, data->start_class, &this_class);
3170 cl_and(data->start_class, and_withp);
3171 }
3172 else if (flags & SCF_DO_STCLASS_AND)
3173 cl_and(data->start_class, &this_class);
3174 flags &= ~SCF_DO_STCLASS;
3175 }
3176 if (!scan) /* It was not CURLYX, but CURLY. */
3177 scan = next;
3178 if ( /* ? quantifier ok, except for (?{ ... }) */
3179 (next_is_eval || !(mincount == 0 && maxcount == 1))
3180 && (minnext == 0) && (deltanext == 0)
3181 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3182 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3183 && ckWARN(WARN_REGEXP))
3184 {
3185 vWARN(RExC_parse,
3186 "Quantifier unexpected on zero-length expression");
3187 }
3188
3189 min += minnext * mincount;
3190 is_inf_internal |= ((maxcount == REG_INFTY
3191 && (minnext + deltanext) > 0)
3192 || deltanext == I32_MAX);
3193 is_inf |= is_inf_internal;
3194 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3195
3196 /* Try powerful optimization CURLYX => CURLYN. */
3197 if ( OP(oscan) == CURLYX && data
3198 && data->flags & SF_IN_PAR
3199 && !(data->flags & SF_HAS_EVAL)
3200 && !deltanext && minnext == 1 ) {
3201 /* Try to optimize to CURLYN. */
3202 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3203 regnode * const nxt1 = nxt;
497b47a8 3204#ifdef DEBUGGING
8aa23a47 3205 regnode *nxt2;
497b47a8 3206#endif
c277df42 3207
8aa23a47
YO
3208 /* Skip open. */
3209 nxt = regnext(nxt);
3210 if (!strchr((const char*)PL_simple,OP(nxt))
3211 && !(PL_regkind[OP(nxt)] == EXACT
3212 && STR_LEN(nxt) == 1))
3213 goto nogo;
497b47a8 3214#ifdef DEBUGGING
8aa23a47 3215 nxt2 = nxt;
497b47a8 3216#endif
8aa23a47
YO
3217 nxt = regnext(nxt);
3218 if (OP(nxt) != CLOSE)
3219 goto nogo;
3220 if (RExC_open_parens) {
3221 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3222 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3223 }
3224 /* Now we know that nxt2 is the only contents: */
3225 oscan->flags = (U8)ARG(nxt);
3226 OP(oscan) = CURLYN;
3227 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3228
c277df42 3229#ifdef DEBUGGING
8aa23a47
YO
3230 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3231 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3232 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3233 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3234 OP(nxt + 1) = OPTIMIZED; /* was count. */
3235 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3236#endif
8aa23a47
YO
3237 }
3238 nogo:
3239
3240 /* Try optimization CURLYX => CURLYM. */
3241 if ( OP(oscan) == CURLYX && data
3242 && !(data->flags & SF_HAS_PAR)
3243 && !(data->flags & SF_HAS_EVAL)
3244 && !deltanext /* atom is fixed width */
3245 && minnext != 0 /* CURLYM can't handle zero width */
3246 ) {
3247 /* XXXX How to optimize if data == 0? */
3248 /* Optimize to a simpler form. */
3249 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3250 regnode *nxt2;
3251
3252 OP(oscan) = CURLYM;
3253 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3254 && (OP(nxt2) != WHILEM))
3255 nxt = nxt2;
3256 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3257 /* Need to optimize away parenths. */
3258 if (data->flags & SF_IN_PAR) {
3259 /* Set the parenth number. */
3260 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3261
3262 if (OP(nxt) != CLOSE)
3263 FAIL("Panic opt close");
3264 oscan->flags = (U8)ARG(nxt);
3265 if (RExC_open_parens) {
3266 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3267 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3268 }
8aa23a47
YO
3269 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3270 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3271
c277df42 3272#ifdef DEBUGGING
8aa23a47
YO
3273 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3274 OP(nxt + 1) = OPTIMIZED; /* was count. */
3275 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3276 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3277#endif
c277df42 3278#if 0
8aa23a47
YO
3279 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3280 regnode *nnxt = regnext(nxt1);
3281
3282 if (nnxt == nxt) {
3283 if (reg_off_by_arg[OP(nxt1)])
3284 ARG_SET(nxt1, nxt2 - nxt1);
3285 else if (nxt2 - nxt1 < U16_MAX)
3286 NEXT_OFF(nxt1) = nxt2 - nxt1;
3287 else
3288 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3289 }
8aa23a47 3290 nxt1 = nnxt;
c277df42 3291 }
5d1c421c 3292#endif
8aa23a47
YO
3293 /* Optimize again: */
3294 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3295 NULL, stopparen, recursed, NULL, 0,depth+1);
3296 }
3297 else
3298 oscan->flags = 0;
3299 }
3300 else if ((OP(oscan) == CURLYX)
3301 && (flags & SCF_WHILEM_VISITED_POS)
3302 /* See the comment on a similar expression above.
3303 However, this time it not a subexpression
3304 we care about, but the expression itself. */
3305 && (maxcount == REG_INFTY)
3306 && data && ++data->whilem_c < 16) {
3307 /* This stays as CURLYX, we can put the count/of pair. */
3308 /* Find WHILEM (as in regexec.c) */
3309 regnode *nxt = oscan + NEXT_OFF(oscan);
3310
3311 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3312 nxt += ARG(nxt);
3313 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3314 | (RExC_whilem_seen << 4)); /* On WHILEM */
3315 }
3316 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3317 pars++;
3318 if (flags & SCF_DO_SUBSTR) {
3319 SV *last_str = NULL;
3320 int counted = mincount != 0;
a0ed51b3 3321
8aa23a47
YO
3322 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3323#if defined(SPARC64_GCC_WORKAROUND)
3324 I32 b = 0;
3325 STRLEN l = 0;
3326 const char *s = NULL;
3327 I32 old = 0;
b515a41d 3328
8aa23a47
YO
3329 if (pos_before >= data->last_start_min)
3330 b = pos_before;
3331 else
3332 b = data->last_start_min;
b515a41d 3333
8aa23a47
YO
3334 l = 0;
3335 s = SvPV_const(data->last_found, l);
3336 old = b - data->last_start_min;
3337
3338#else
3339 I32 b = pos_before >= data->last_start_min
3340 ? pos_before : data->last_start_min;
3341 STRLEN l;
3342 const char * const s = SvPV_const(data->last_found, l);
3343 I32 old = b - data->last_start_min;
3344#endif
3345
3346 if (UTF)
3347 old = utf8_hop((U8*)s, old) - (U8*)s;
3348
3349 l -= old;
3350 /* Get the added string: */
740cce10 3351 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3352 if (deltanext == 0 && pos_before == b) {
3353 /* What was added is a constant string */
3354 if (mincount > 1) {
3355 SvGROW(last_str, (mincount * l) + 1);
3356 repeatcpy(SvPVX(last_str) + l,
3357 SvPVX_const(last_str), l, mincount - 1);
3358 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3359 /* Add additional parts. */
3360 SvCUR_set(data->last_found,
3361 SvCUR(data->last_found) - l);
3362 sv_catsv(data->last_found, last_str);
3363 {
3364 SV * sv = data->last_found;
3365 MAGIC *mg =
3366 SvUTF8(sv) && SvMAGICAL(sv) ?
3367 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3368 if (mg && mg->mg_len >= 0)
bd94e887 3369 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3370 }
8aa23a47 3371 data->last_end += l * (mincount - 1);
b515a41d 3372 }
8aa23a47
YO
3373 } else {
3374 /* start offset must point into the last copy */
3375 data->last_start_min += minnext * (mincount - 1);
3376 data->last_start_max += is_inf ? I32_MAX
3377 : (maxcount - 1) * (minnext + data->pos_delta);
3378 }
c277df42 3379 }
8aa23a47
YO
3380 /* It is counted once already... */
3381 data->pos_min += minnext * (mincount - counted);
3382 data->pos_delta += - counted * deltanext +
3383 (minnext + deltanext) * maxcount - minnext * mincount;
3384 if (mincount != maxcount) {
3385 /* Cannot extend fixed substrings found inside
3386 the group. */
304ee84b 3387 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3388 if (mincount && last_str) {
3389 SV * const sv = data->last_found;
3390 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3391 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3392
3393 if (mg)
3394 mg->mg_len = -1;
3395 sv_setsv(sv, last_str);
3396 data->last_end = data->pos_min;
3397 data->last_start_min =
3398 data->pos_min - CHR_SVLEN(last_str);
3399 data->last_start_max = is_inf
3400 ? I32_MAX
3401 : data->pos_min + data->pos_delta
3402 - CHR_SVLEN(last_str);
3403 }
3404 data->longest = &(data->longest_float);
3405 }
3406 SvREFCNT_dec(last_str);
c277df42 3407 }
8aa23a47
YO
3408 if (data && (fl & SF_HAS_EVAL))
3409 data->flags |= SF_HAS_EVAL;
3410 optimize_curly_tail:
3411 if (OP(oscan) != CURLYX) {
3412 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3413 && NEXT_OFF(next))
3414 NEXT_OFF(oscan) += NEXT_OFF(next);
3415 }
3416 continue;
3417 default: /* REF and CLUMP only? */
3418 if (flags & SCF_DO_SUBSTR) {
304ee84b 3419 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3420 data->longest = &(data->longest_float);
3421 }
3422 is_inf = is_inf_internal = 1;
3423 if (flags & SCF_DO_STCLASS_OR)
3424 cl_anything(pRExC_state, data->start_class);
3425 flags &= ~SCF_DO_STCLASS;
3426 break;
c277df42 3427 }
8aa23a47 3428 }
e1d1eefb
YO
3429 else if (OP(scan) == LNBREAK) {
3430 if (flags & SCF_DO_STCLASS) {
3431 int value = 0;
3432 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3433 if (flags & SCF_DO_STCLASS_AND) {
3434 for (value = 0; value < 256; value++)
e64b1bd1 3435 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3436 ANYOF_BITMAP_CLEAR(data->start_class, value);
3437 }
3438 else {
3439 for (value = 0; value < 256; value++)
e64b1bd1 3440 if (is_VERTWS_cp(value))
e1d1eefb
YO
3441 ANYOF_BITMAP_SET(data->start_class, value);
3442 }
3443 if (flags & SCF_DO_STCLASS_OR)
3444 cl_and(data->start_class, and_withp);
3445 flags &= ~SCF_DO_STCLASS;
3446 }
3447 min += 1;
f9a79580 3448 delta += 1;
e1d1eefb
YO
3449 if (flags & SCF_DO_SUBSTR) {
3450 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3451 data->pos_min += 1;
f9a79580 3452 data->pos_delta += 1;
e1d1eefb
YO
3453 data->longest = &(data->longest_float);
3454 }
3455
3456 }
f9a79580
RGS
3457 else if (OP(scan) == FOLDCHAR) {
3458 int d = ARG(scan)==0xDF ? 1 : 2;
3459 flags &= ~SCF_DO_STCLASS;
3460 min += 1;
3461 delta += d;
3462 if (flags & SCF_DO_SUBSTR) {
3463 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3464 data->pos_min += 1;
3465 data->pos_delta += d;
3466 data->longest = &(data->longest_float);
3467 }
3468 }
8aa23a47
YO
3469 else if (strchr((const char*)PL_simple,OP(scan))) {
3470 int value = 0;
653099ff 3471
8aa23a47 3472 if (flags & SCF_DO_SUBSTR) {
304ee84b 3473 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3474 data->pos_min++;
3475 }
3476 min++;
3477 if (flags & SCF_DO_STCLASS) {
3478 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3479
8aa23a47
YO
3480 /* Some of the logic below assumes that switching
3481 locale on will only add false positives. */
3482 switch (PL_regkind[OP(scan)]) {
3483 case SANY:
3484 default:
3485 do_default:
3486 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3487 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3488 cl_anything(pRExC_state, data->start_class);
3489 break;
3490 case REG_ANY:
3491 if (OP(scan) == SANY)
3492 goto do_default;
3493 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3494 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3495 || (data->start_class->flags & ANYOF_CLASS));
3496 cl_anything(pRExC_state, data->start_class);
653099ff 3497 }
8aa23a47
YO
3498 if (flags & SCF_DO_STCLASS_AND || !value)
3499 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3500 break;
3501 case ANYOF:
3502 if (flags & SCF_DO_STCLASS_AND)
3503 cl_and(data->start_class,
3504 (struct regnode_charclass_class*)scan);
653099ff 3505 else
8aa23a47
YO
3506 cl_or(pRExC_state, data->start_class,
3507 (struct regnode_charclass_class*)scan);
3508 break;
3509 case ALNUM:
3510 if (flags & SCF_DO_STCLASS_AND) {
3511 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3512 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3513 for (value = 0; value < 256; value++)
3514 if (!isALNUM(value))
3515 ANYOF_BITMAP_CLEAR(data->start_class, value);
3516 }
653099ff 3517 }
8aa23a47
YO
3518 else {
3519 if (data->start_class->flags & ANYOF_LOCALE)
3520 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3521 else {
3522 for (value = 0; value < 256; value++)
3523 if (isALNUM(value))
3524 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3525 }
8aa23a47
YO
3526 }
3527 break;
3528 case ALNUML:
3529 if (flags & SCF_DO_STCLASS_AND) {
3530 if (data->start_class->flags & ANYOF_LOCALE)
3531 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3532 }
3533 else {
3534 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3535 data->start_class->flags |= ANYOF_LOCALE;
3536 }
3537 break;
3538 case NALNUM:
3539 if (flags & SCF_DO_STCLASS_AND) {
3540 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3541 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3542 for (value = 0; value < 256; value++)
3543 if (isALNUM(value))
3544 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3545 }
3546 }
8aa23a47
YO
3547 else {
3548 if (data->start_class->flags & ANYOF_LOCALE)
3549 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3550 else {
3551 for (value = 0; value < 256; value++)
3552 if (!isALNUM(value))
3553 ANYOF_BITMAP_SET(data->start_class, value);
3554 }
653099ff 3555 }
8aa23a47
YO
3556 break;
3557 case NALNUML:
3558 if (flags & SCF_DO_STCLASS_AND) {
3559 if (data->start_class->flags & ANYOF_LOCALE)
3560 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3561 }
8aa23a47
YO
3562 else {
3563 data->start_class->flags |= ANYOF_LOCALE;
3564 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3565 }
3566 break;
3567 case SPACE:
3568 if (flags & SCF_DO_STCLASS_AND) {
3569 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3570 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3571 for (value = 0; value < 256; value++)
3572 if (!isSPACE(value))
3573 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3574 }
3575 }
8aa23a47
YO
3576 else {
3577 if (data->start_class->flags & ANYOF_LOCALE)
3578 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3579 else {
3580 for (value = 0; value < 256; value++)
3581 if (isSPACE(value))
3582 ANYOF_BITMAP_SET(data->start_class, value);
3583 }
653099ff 3584 }
8aa23a47
YO
3585 break;
3586 case SPACEL:
3587 if (flags & SCF_DO_STCLASS_AND) {
3588 if (data->start_class->flags & ANYOF_LOCALE)
3589 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3590 }
3591 else {
3592 data->start_class->flags |= ANYOF_LOCALE;
3593 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3594 }
3595 break;
3596 case NSPACE:
3597 if (flags & SCF_DO_STCLASS_AND) {
3598 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3599 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3600 for (value = 0; value < 256; value++)
3601 if (isSPACE(value))
3602 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3603 }
8aa23a47
YO
3604 }
3605 else {
3606 if (data->start_class->flags & ANYOF_LOCALE)
3607 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3608 else {
3609 for (value = 0; value < 256; value++)
3610 if (!isSPACE(value))
3611 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3612 }
3613 }
8aa23a47
YO
3614 break;
3615 case NSPACEL:
3616 if (flags & SCF_DO_STCLASS_AND) {
3617 if (data->start_class->flags & ANYOF_LOCALE) {
3618 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3619 for (value = 0; value < 256; value++)
3620 if (!isSPACE(value))
3621 ANYOF_BITMAP_CLEAR(data->start_class, value);
3622 }
653099ff 3623 }
8aa23a47
YO
3624 else {
3625 data->start_class->flags |= ANYOF_LOCALE;
3626 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3627 }
3628 break;
3629 case DIGIT:
3630 if (flags & SCF_DO_STCLASS_AND) {
3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3632 for (value = 0; value < 256; value++)
3633 if (!isDIGIT(value))
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 }
3636 else {
3637 if (data->start_class->flags & ANYOF_LOCALE)
3638 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3639 else {
3640 for (value = 0; value < 256; value++)
3641 if (isDIGIT(value))
3642 ANYOF_BITMAP_SET(data->start_class, value);
3643 }
3644 }
3645 break;
3646 case NDIGIT:
3647 if (flags & SCF_DO_STCLASS_AND) {
3648 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3649 for (value = 0; value < 256; value++)
3650 if (isDIGIT(value))
3651 ANYOF_BITMAP_CLEAR(data->start_class, value);
3652 }
3653 else {
3654 if (data->start_class->flags & ANYOF_LOCALE)
3655 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3656 else {
3657 for (value = 0; value < 256; value++)
3658 if (!isDIGIT(value))
3659 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3660 }
3661 }
8aa23a47 3662 break;
e1d1eefb
YO
3663 CASE_SYNST_FNC(VERTWS);
3664 CASE_SYNST_FNC(HORIZWS);
3665
8aa23a47
YO
3666 }
3667 if (flags & SCF_DO_STCLASS_OR)
3668 cl_and(data->start_class, and_withp);
3669 flags &= ~SCF_DO_STCLASS;
3670 }
3671 }
3672 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3673 data->flags |= (OP(scan) == MEOL
3674 ? SF_BEFORE_MEOL
3675 : SF_BEFORE_SEOL);
3676 }
3677 else if ( PL_regkind[OP(scan)] == BRANCHJ
3678 /* Lookbehind, or need to calculate parens/evals/stclass: */
3679 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3680 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3681 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3682 || OP(scan) == UNLESSM )
3683 {
3684 /* Negative Lookahead/lookbehind
3685 In this case we can't do fixed string optimisation.
3686 */
1de06328 3687
8aa23a47
YO
3688 I32 deltanext, minnext, fake = 0;
3689 regnode *nscan;
3690 struct regnode_charclass_class intrnl;
3691 int f = 0;
1de06328 3692
8aa23a47
YO
3693 data_fake.flags = 0;
3694 if (data) {
3695 data_fake.whilem_c = data->whilem_c;
3696 data_fake.last_closep = data->last_closep;
c277df42 3697 }
8aa23a47
YO
3698 else
3699 data_fake.last_closep = &fake;
58e23c8d 3700 data_fake.pos_delta = delta;
8aa23a47
YO
3701 if ( flags & SCF_DO_STCLASS && !scan->flags
3702 && OP(scan) == IFMATCH ) { /* Lookahead */
3703 cl_init(pRExC_state, &intrnl);
3704 data_fake.start_class = &intrnl;
3705 f |= SCF_DO_STCLASS_AND;
3706 }
3707 if (flags & SCF_WHILEM_VISITED_POS)
3708 f |= SCF_WHILEM_VISITED_POS;
3709 next = regnext(scan);
3710 nscan = NEXTOPER(NEXTOPER(scan));
3711 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3712 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3713 if (scan->flags) {
3714 if (deltanext) {
58e23c8d 3715 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3716 }
3717 else if (minnext > (I32)U8_MAX) {
58e23c8d 3718 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3719 }
3720 scan->flags = (U8)minnext;
3721 }
3722 if (data) {
3723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3724 pars++;
3725 if (data_fake.flags & SF_HAS_EVAL)
3726 data->flags |= SF_HAS_EVAL;
3727 data->whilem_c = data_fake.whilem_c;
3728 }
3729 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3730 if (flags & SCF_DO_STCLASS_OR) {
3731 /* OR before, AND after: ideally we would recurse with
3732 * data_fake to get the AND applied by study of the
3733 * remainder of the pattern, and then derecurse;
3734 * *** HACK *** for now just treat as "no information".
3735 * See [perl #56690].
3736 */
3737 cl_init(pRExC_state, data->start_class);
3738 } else {
3739 /* AND before and after: combine and continue */
3740 const int was = (data->start_class->flags & ANYOF_EOS);
3741
3742 cl_and(data->start_class, &intrnl);
3743 if (was)
3744 data->start_class->flags |= ANYOF_EOS;
3745 }
8aa23a47 3746 }
cb434fcc 3747 }
8aa23a47
YO
3748#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3749 else {
3750 /* Positive Lookahead/lookbehind
3751 In this case we can do fixed string optimisation,
3752 but we must be careful about it. Note in the case of
3753 lookbehind the positions will be offset by the minimum
3754 length of the pattern, something we won't know about
3755 until after the recurse.
3756 */
3757 I32 deltanext, fake = 0;
3758 regnode *nscan;
3759 struct regnode_charclass_class intrnl;
3760 int f = 0;
3761 /* We use SAVEFREEPV so that when the full compile
3762 is finished perl will clean up the allocated
3763 minlens when its all done. This was we don't
3764 have to worry about freeing them when we know
3765 they wont be used, which would be a pain.
3766 */
3767 I32 *minnextp;
3768 Newx( minnextp, 1, I32 );
3769 SAVEFREEPV(minnextp);
3770
3771 if (data) {
3772 StructCopy(data, &data_fake, scan_data_t);
3773 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3774 f |= SCF_DO_SUBSTR;
3775 if (scan->flags)
304ee84b 3776 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3777 data_fake.last_found=newSVsv(data->last_found);
3778 }
3779 }
3780 else
3781 data_fake.last_closep = &fake;
3782 data_fake.flags = 0;
58e23c8d 3783 data_fake.pos_delta = delta;
8aa23a47
YO
3784 if (is_inf)
3785 data_fake.flags |= SF_IS_INF;
3786 if ( flags & SCF_DO_STCLASS && !scan->flags
3787 && OP(scan) == IFMATCH ) { /* Lookahead */
3788 cl_init(pRExC_state, &intrnl);
3789 data_fake.start_class = &intrnl;
3790 f |= SCF_DO_STCLASS_AND;
3791 }
3792 if (flags & SCF_WHILEM_VISITED_POS)
3793 f |= SCF_WHILEM_VISITED_POS;
3794 next = regnext(scan);
3795 nscan = NEXTOPER(NEXTOPER(scan));
3796
3797 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3798 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3799 if (scan->flags) {
3800 if (deltanext) {
58e23c8d 3801 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3802 }
3803 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3804 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3805 }
3806 scan->flags = (U8)*minnextp;
3807 }
3808
3809 *minnextp += min;
3810
3811 if (f & SCF_DO_STCLASS_AND) {
3812 const int was = (data->start_class->flags & ANYOF_EOS);
3813
3814 cl_and(data->start_class, &intrnl);
3815 if (was)
3816 data->start_class->flags |= ANYOF_EOS;
3817 }
3818 if (data) {
3819 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3820 pars++;
3821 if (data_fake.flags & SF_HAS_EVAL)
3822 data->flags |= SF_HAS_EVAL;
3823 data->whilem_c = data_fake.whilem_c;
3824 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3825 if (RExC_rx->minlen<*minnextp)
3826 RExC_rx->minlen=*minnextp;
304ee84b 3827 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3828 SvREFCNT_dec(data_fake.last_found);
3829
3830 if ( data_fake.minlen_fixed != minlenp )
3831 {
3832 data->offset_fixed= data_fake.offset_fixed;
3833 data->minlen_fixed= data_fake.minlen_fixed;
3834 data->lookbehind_fixed+= scan->flags;
3835 }
3836 if ( data_fake.minlen_float != minlenp )
3837 {
3838 data->minlen_float= data_fake.minlen_float;
3839 data->offset_float_min=data_fake.offset_float_min;
3840 data->offset_float_max=data_fake.offset_float_max;
3841 data->lookbehind_float+= scan->flags;
3842 }
3843 }
3844 }
3845
3846
40d049e4 3847 }
8aa23a47
YO
3848#endif
3849 }
3850 else if (OP(scan) == OPEN) {
3851 if (stopparen != (I32)ARG(scan))
3852 pars++;
3853 }
3854 else if (OP(scan) == CLOSE) {
3855 if (stopparen == (I32)ARG(scan)) {
3856 break;
3857 }
3858 if ((I32)ARG(scan) == is_par) {
3859 next = regnext(scan);
b515a41d 3860
8aa23a47
YO
3861 if ( next && (OP(next) != WHILEM) && next < last)
3862 is_par = 0; /* Disable optimization */
40d049e4 3863 }
8aa23a47
YO
3864 if (data)
3865 *(data->last_closep) = ARG(scan);
3866 }
3867 else if (OP(scan) == EVAL) {
c277df42
IZ
3868 if (data)
3869 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3870 }
3871 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3872 if (flags & SCF_DO_SUBSTR) {
304ee84b 3873 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3874 flags &= ~SCF_DO_SUBSTR;
40d049e4 3875 }
8aa23a47
YO
3876 if (data && OP(scan)==ACCEPT) {
3877 data->flags |= SCF_SEEN_ACCEPT;
3878 if (stopmin > min)
3879 stopmin = min;
e2e6a0f1 3880 }
8aa23a47
YO
3881 }
3882 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3883 {
0f5d15d6 3884 if (flags & SCF_DO_SUBSTR) {
304ee84b 3885 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3886 data->longest = &(data->longest_float);
3887 }
3888 is_inf = is_inf_internal = 1;
653099ff 3889 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3890 cl_anything(pRExC_state, data->start_class);
96776eda 3891 flags &= ~SCF_DO_STCLASS;
8aa23a47 3892 }
58e23c8d 3893 else if (OP(scan) == GPOS) {
bbe252da 3894 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3895 !(delta || is_inf || (data && data->pos_delta)))
3896 {
bbe252da
YO
3897 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3898 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3899 if (RExC_rx->gofs < (U32)min)
3900 RExC_rx->gofs = min;
3901 } else {
bbe252da 3902 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3903 RExC_rx->gofs = 0;
3904 }
3905 }
786e8c11 3906#ifdef TRIE_STUDY_OPT
40d049e4 3907#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3908 else if (PL_regkind[OP(scan)] == TRIE) {
3909 /* NOTE - There is similar code to this block above for handling
3910 BRANCH nodes on the initial study. If you change stuff here
3911 check there too. */
3912 regnode *trie_node= scan;
3913 regnode *tail= regnext(scan);
f8fc2ecf 3914 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3915 I32 max1 = 0, min1 = I32_MAX;
3916 struct regnode_charclass_class accum;
3917
3918 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3919 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3920 if (flags & SCF_DO_STCLASS)
3921 cl_init_zero(pRExC_state, &accum);
3922
3923 if (!trie->jump) {
3924 min1= trie->minlen;
3925 max1= trie->maxlen;
3926 } else {
3927 const regnode *nextbranch= NULL;
3928 U32 word;
3929
3930 for ( word=1 ; word <= trie->wordcount ; word++)
3931 {
3932 I32 deltanext=0, minnext=0, f = 0, fake;
3933 struct regnode_charclass_class this_class;
3934
3935 data_fake.flags = 0;
3936 if (data) {
3937 data_fake.whilem_c = data->whilem_c;
3938 data_fake.last_closep = data->last_closep;
3939 }
3940 else
3941 data_fake.last_closep = &fake;
58e23c8d 3942 data_fake.pos_delta = delta;
8aa23a47
YO
3943 if (flags & SCF_DO_STCLASS) {
3944 cl_init(pRExC_state, &this_class);
3945 data_fake.start_class = &this_class;
3946 f = SCF_DO_STCLASS_AND;
3947 }
3948 if (flags & SCF_WHILEM_VISITED_POS)
3949 f |= SCF_WHILEM_VISITED_POS;
3950
3951 if (trie->jump[word]) {
3952 if (!nextbranch)
3953 nextbranch = trie_node + trie->jump[0];
3954 scan= trie_node + trie->jump[word];
3955 /* We go from the jump point to the branch that follows
3956 it. Note this means we need the vestigal unused branches
3957 even though they arent otherwise used.
3958 */
3959 minnext = study_chunk(pRExC_state, &scan, minlenp,
3960 &deltanext, (regnode *)nextbranch, &data_fake,
3961 stopparen, recursed, NULL, f,depth+1);
3962 }
3963 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3964 nextbranch= regnext((regnode*)nextbranch);
3965
3966 if (min1 > (I32)(minnext + trie->minlen))
3967 min1 = minnext + trie->minlen;
3968 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3969 max1 = minnext + deltanext + trie->maxlen;
3970 if (deltanext == I32_MAX)
3971 is_inf = is_inf_internal = 1;
3972
3973 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3974 pars++;
3975 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3976 if ( stopmin > min + min1)
3977 stopmin = min + min1;
3978 flags &= ~SCF_DO_SUBSTR;
3979 if (data)
3980 data->flags |= SCF_SEEN_ACCEPT;
3981 }
3982 if (data) {
3983 if (data_fake.flags & SF_HAS_EVAL)
3984 data->flags |= SF_HAS_EVAL;
3985 data->whilem_c = data_fake.whilem_c;
3986 }
3987 if (flags & SCF_DO_STCLASS)
3988 cl_or(pRExC_state, &accum, &this_class);
3989 }
3990 }
3991 if (flags & SCF_DO_SUBSTR) {
3992 data->pos_min += min1;
3993 data->pos_delta += max1 - min1;
3994 if (max1 != min1 || is_inf)
3995 data->longest = &(data->longest_float);
3996 }
3997 min += min1;
3998 delta += max1 - min1;
3999 if (flags & SCF_DO_STCLASS_OR) {
4000 cl_or(pRExC_state, data->start_class, &accum);
4001 if (min1) {
4002 cl_and(data->start_class, and_withp);
4003 flags &= ~SCF_DO_STCLASS;
4004 }
4005 }
4006 else if (flags & SCF_DO_STCLASS_AND) {
4007 if (min1) {
4008 cl_and(data->start_class, &accum);
4009 flags &= ~SCF_DO_STCLASS;
4010 }
4011 else {
4012 /* Switch to OR mode: cache the old value of
4013 * data->start_class */
4014 INIT_AND_WITHP;
4015 StructCopy(data->start_class, and_withp,
4016 struct regnode_charclass_class);
4017 flags &= ~SCF_DO_STCLASS_AND;
4018 StructCopy(&accum, data->start_class,
4019 struct regnode_charclass_class);
4020 flags |= SCF_DO_STCLASS_OR;
4021 data->start_class->flags |= ANYOF_EOS;
4022 }
4023 }
4024 scan= tail;
4025 continue;
4026 }
786e8c11 4027#else
8aa23a47 4028 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4029 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4030 U8*bang=NULL;
4031
4032 min += trie->minlen;
4033 delta += (trie->maxlen - trie->minlen);
4034 flags &= ~SCF_DO_STCLASS; /* xxx */
4035 if (flags & SCF_DO_SUBSTR) {
304ee84b 4036 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4037 data->pos_min += trie->minlen;
4038 data->pos_delta += (trie->maxlen - trie->minlen);
4039 if (trie->maxlen != trie->minlen)
4040 data->longest = &(data->longest_float);
4041 }
4042 if (trie->jump) /* no more substrings -- for now /grr*/
4043 flags &= ~SCF_DO_SUBSTR;
b515a41d 4044 }
8aa23a47
YO
4045#endif /* old or new */
4046#endif /* TRIE_STUDY_OPT */
e1d1eefb 4047
8aa23a47
YO
4048 /* Else: zero-length, ignore. */
4049 scan = regnext(scan);
4050 }
4051 if (frame) {
4052 last = frame->last;
4053 scan = frame->next;
4054 stopparen = frame->stop;
4055 frame = frame->prev;
4056 goto fake_study_recurse;
c277df42
IZ
4057 }
4058
4059 finish:
8aa23a47 4060 assert(!frame);
304ee84b 4061 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4062
c277df42 4063 *scanp = scan;
aca2d497 4064 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4065 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4066 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4067 if (is_par > (I32)U8_MAX)
c277df42
IZ
4068 is_par = 0;
4069 if (is_par && pars==1 && data) {
4070 data->flags |= SF_IN_PAR;
4071 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4072 }
4073 else if (pars && data) {
c277df42
IZ
4074 data->flags |= SF_HAS_PAR;
4075 data->flags &= ~SF_IN_PAR;
4076 }
653099ff 4077 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4078 cl_and(data->start_class, and_withp);
786e8c11
YO
4079 if (flags & SCF_TRIE_RESTUDY)
4080 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4081
304ee84b 4082 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4083
e2e6a0f1 4084 return min < stopmin ? min : stopmin;
c277df42
IZ
4085}
4086
2eccd3b2
NC
4087STATIC U32
4088S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4089{
4a4e7719
NC
4090 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4091
7918f24d
NC
4092 PERL_ARGS_ASSERT_ADD_DATA;
4093
4a4e7719
NC
4094 Renewc(RExC_rxi->data,
4095 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4096 char, struct reg_data);
4097 if(count)
f8fc2ecf 4098 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4099 else
f8fc2ecf 4100 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4101 RExC_rxi->data->count = count + n;
4102 Copy(s, RExC_rxi->data->what + count, n, U8);
4103 return count;
c277df42
IZ
4104}
4105
f8149455 4106/*XXX: todo make this not included in a non debugging perl */
76234dfb 4107#ifndef PERL_IN_XSUB_RE
d88dccdf 4108void
864dbfa3 4109Perl_reginitcolors(pTHX)
d88dccdf 4110{
97aff369 4111 dVAR;
1df70142 4112 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4113 if (s) {
1df70142
AL
4114 char *t = savepv(s);
4115 int i = 0;
4116 PL_colors[0] = t;
d88dccdf 4117 while (++i < 6) {
1df70142
AL
4118 t = strchr(t, '\t');
4119 if (t) {
4120 *t = '\0';
4121 PL_colors[i] = ++t;
d88dccdf
IZ
4122 }
4123 else
1df70142 4124 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4125 }
4126 } else {
1df70142 4127 int i = 0;
b81d288d 4128 while (i < 6)
06b5626a 4129 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4130 }
4131 PL_colorset = 1;
4132}
76234dfb 4133#endif
8615cb43 4134
07be1b83 4135
786e8c11
YO
4136#ifdef TRIE_STUDY_OPT
4137#define CHECK_RESTUDY_GOTO \
4138 if ( \
4139 (data.flags & SCF_TRIE_RESTUDY) \
4140 && ! restudied++ \
4141 ) goto reStudy
4142#else
4143#define CHECK_RESTUDY_GOTO
4144#endif
f9f4320a 4145
a687059c 4146/*
e50aee73 4147 - pregcomp - compile a regular expression into internal code
a687059c
LW
4148 *
4149 * We can't allocate space until we know how big the compiled form will be,
4150 * but we can't compile it (and thus know how big it is) until we've got a
4151 * place to put the code. So we cheat: we compile it twice, once with code
4152 * generation turned off and size counting turned on, and once "for real".
4153 * This also means that we don't allocate space until we are sure that the
4154 * thing really will compile successfully, and we never have to move the
4155 * code and thus invalidate pointers into it. (Note that it has to be in
4156 * one piece because free() must be able to free it all.) [NB: not true in perl]
4157 *
4158 * Beware that the optimization-preparation code in here knows about some
4159 * of the structure of the compiled regexp. [I'll say.]
4160 */
b9b4dddf
YO
4161
4162
4163
f9f4320a 4164#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4165#define RE_ENGINE_PTR &PL_core_reg_engine
4166#else
f9f4320a
YO
4167extern const struct regexp_engine my_reg_engine;
4168#define RE_ENGINE_PTR &my_reg_engine
4169#endif
6d5c990f
RGS
4170
4171#ifndef PERL_IN_XSUB_RE
3ab4a224 4172REGEXP *
1593ad57 4173Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4174{
97aff369 4175 dVAR;
6d5c990f 4176 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4177
4178 PERL_ARGS_ASSERT_PREGCOMP;
4179
f9f4320a
YO
4180 /* Dispatch a request to compile a regexp to correct
4181 regexp engine. */
f9f4320a
YO
4182 if (table) {
4183 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4184 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4185 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4186 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4187 DEBUG_COMPILE_r({
8d8756e7 4188 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4189 SvIV(*ptr));
4190 });
3ab4a224 4191 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4192 }
b9b4dddf 4193 }
3ab4a224 4194 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4195}
6d5c990f 4196#endif
2a5d9b1d 4197
3ab4a224 4198REGEXP *
1593ad57 4199Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4200{
4201 dVAR;
288b8c02
NC
4202 REGEXP *rx;
4203 struct regexp *r;
f8fc2ecf 4204 register regexp_internal *ri;
3ab4a224 4205 STRLEN plen;
1593ad57 4206 char *exp = SvPV(pattern, plen);
3ab4a224 4207 char* xend = exp + plen;
c277df42 4208 regnode *scan;
a0d0e21e 4209 I32 flags;
a0d0e21e
LW
4210 I32 minlen = 0;
4211 I32 sawplus = 0;
4212 I32 sawopen = 0;
2c2d71f5 4213 scan_data_t data;
830247a4 4214 RExC_state_t RExC_state;
be8e71aa 4215 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4216#ifdef TRIE_STUDY_OPT
4217 int restudied= 0;
4218 RExC_state_t copyRExC_state;
4219#endif
2a5d9b1d 4220 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4221
4222 PERL_ARGS_ASSERT_RE_COMPILE;
4223
6d5c990f 4224 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4225
b9ad30b4 4226 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4227
a3621e74 4228 DEBUG_COMPILE_r({
ab3bbdeb
YO
4229 SV *dsv= sv_newmortal();
4230 RE_PV_QUOTED_DECL(s, RExC_utf8,
3ab4a224 4231 dsv, exp, plen, 60);
ab3bbdeb
YO
4232 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4233 PL_colors[4],PL_colors[5],s);
a5961de5 4234 });
02daf0ab
YO
4235
4236redo_first_pass:
4237 RExC_precomp = exp;
c737faaf 4238 RExC_flags = pm_flags;
830247a4 4239 RExC_sawback = 0;
bbce6d69 4240
830247a4
IZ
4241 RExC_seen = 0;
4242 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4243 RExC_seen_evals = 0;
4244 RExC_extralen = 0;
c277df42 4245
bbce6d69 4246 /* First pass: determine size, legality. */
830247a4 4247 RExC_parse = exp;
fac92740 4248 RExC_start = exp;
830247a4
IZ
4249 RExC_end = xend;
4250 RExC_naughty = 0;
4251 RExC_npar = 1;
e2e6a0f1 4252 RExC_nestroot = 0;
830247a4
IZ
4253 RExC_size = 0L;
4254 RExC_emit = &PL_regdummy;
4255 RExC_whilem_seen = 0;
fc8cd66c 4256 RExC_charnames = NULL;
40d049e4
YO
4257 RExC_open_parens = NULL;
4258 RExC_close_parens = NULL;
4259 RExC_opend = NULL;
81714fb9 4260 RExC_paren_names = NULL;
1f1031fe
YO
4261#ifdef DEBUGGING
4262 RExC_paren_name_list = NULL;
4263#endif
40d049e4
YO
4264 RExC_recurse = NULL;
4265 RExC_recurse_count = 0;
81714fb9 4266
85ddcde9
JH
4267#if 0 /* REGC() is (currently) a NOP at the first pass.
4268 * Clever compilers notice this and complain. --jhi */
830247a4 4269 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4270#endif
3dab1dad
YO
4271 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4272 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4273 RExC_precomp = NULL;
a0d0e21e
LW
4274 return(NULL);
4275 }
02daf0ab 4276 if (RExC_utf8 && !RExC_orig_utf8) {
38a44b82 4277 /* It's possible to write a regexp in ascii that represents Unicode
02daf0ab
YO
4278 codepoints outside of the byte range, such as via \x{100}. If we
4279 detect such a sequence we have to convert the entire pattern to utf8
4280 and then recompile, as our sizing calculation will have been based
4281 on 1 byte == 1 character, but we will need to use utf8 to encode
4282 at least some part of the pattern, and therefore must convert the whole
4283 thing.
4284 XXX: somehow figure out how to make this less expensive...
4285 -- dmq */
3ab4a224 4286 STRLEN len = plen;
02daf0ab
YO
4287 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4288 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4289 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4290 xend = exp + len;
4291 RExC_orig_utf8 = RExC_utf8;
4292 SAVEFREEPV(exp);
4293 goto redo_first_pass;
4294 }
07be1b83 4295 DEBUG_PARSE_r({
81714fb9
YO
4296 PerlIO_printf(Perl_debug_log,
4297 "Required size %"IVdf" nodes\n"
4298 "Starting second pass (creation)\n",
4299 (IV)RExC_size);
07be1b83
YO
4300 RExC_lastnum=0;
4301 RExC_lastparse=NULL;
4302 });
c277df42
IZ
4303 /* Small enough for pointer-storage convention?
4304 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4305 if (RExC_size >= 0x10000L && RExC_extralen)
4306 RExC_size += RExC_extralen;
c277df42 4307 else
830247a4
IZ
4308 RExC_extralen = 0;
4309 if (RExC_whilem_seen > 15)
4310 RExC_whilem_seen = 15;
a0d0e21e 4311
f9f4320a
YO
4312 /* Allocate space and zero-initialize. Note, the two step process
4313 of zeroing when in debug mode, thus anything assigned has to
4314 happen after that */
d2f13c59 4315 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4316 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4317 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4318 char, regexp_internal);
4319 if ( r == NULL || ri == NULL )
b45f050a 4320 FAIL("Regexp out of space");
0f79a09d
GS
4321#ifdef DEBUGGING
4322 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4323 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4324#else
f8fc2ecf
YO
4325 /* bulk initialize base fields with 0. */
4326 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4327#endif
58e23c8d
YO
4328
4329 /* non-zero initialization begins here */
f8fc2ecf 4330 RXi_SET( r, ri );
f9f4320a 4331 r->engine= RE_ENGINE_PTR;
c737faaf 4332 r->extflags = pm_flags;
bcdf7404 4333 {
f7819f85 4334 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bcdf7404
YO
4335 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4336 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4337 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4338 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4339 const char *fptr = STD_PAT_MODS; /*"msix"*/
4340 char *p;
9d17798d 4341 const STRLEN wraplen = plen + has_minus + has_p + has_runon
bcdf7404
YO
4342 + (sizeof(STD_PAT_MODS) - 1)
4343 + (sizeof("(?:)") - 1);
4344
ad64d0ec 4345 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
9d17798d 4346 SvCUR_set(rx, wraplen);
f7c278bf 4347 SvPOK_on(rx);
8f6ae13c 4348 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4349 *p++='('; *p++='?';
f7819f85
A
4350 if (has_p)
4351 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404
YO
4352 {
4353 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4354 char *colon = r + 1;
4355 char ch;
4356
4357 while((ch = *fptr++)) {
4358 if(reganch & 1)
4359 *p++ = ch;
4360 else
4361 *r-- = ch;
4362 reganch >>= 1;
4363 }
4364 if(has_minus) {
4365 *r = '-';
4366 p = colon;
4367 }
4368 }
4369
28d8d7f4 4370 *p++ = ':';
bb661a58 4371 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4372 assert ((RX_WRAPPED(rx) - p) < 16);
4373 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4374 p += plen;
bcdf7404 4375 if (has_runon)
28d8d7f4
YO
4376 *p++ = '\n';
4377 *p++ = ')';
4378 *p = 0;
bcdf7404
YO
4379 }
4380
bbe252da 4381 r->intflags = 0;
830247a4 4382 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4383
6bda09f9 4384 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4385 Newxz(RExC_open_parens, RExC_npar,regnode *);
4386 SAVEFREEPV(RExC_open_parens);
4387 Newxz(RExC_close_parens,RExC_npar,regnode *);
4388 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4389 }
4390
4391 /* Useful during FAIL. */
7122b237
YO
4392#ifdef RE_TRACK_PATTERN_OFFSETS
4393 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4394 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4395 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4396 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4397 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4398#endif
4399 SetProgLen(ri,RExC_size);
288b8c02 4400 RExC_rx_sv = rx;
830247a4 4401 RExC_rx = r;
f8fc2ecf 4402 RExC_rxi = ri;
bbce6d69 4403
4404 /* Second pass: emit code. */
c737faaf 4405 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4406 RExC_parse = exp;
4407 RExC_end = xend;
4408 RExC_naughty = 0;
4409 RExC_npar = 1;
f8fc2ecf
YO
4410 RExC_emit_start = ri->program;
4411 RExC_emit = ri->program;
3b57cd43
YO
4412 RExC_emit_bound = ri->program + RExC_size + 1;
4413
2cd61cdb 4414 /* Store the count of eval-groups for security checks: */
f8149455 4415 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4416 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4417 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4418 ReREFCNT_dec(rx);
a0d0e21e 4419 return(NULL);
80757612 4420 }
07be1b83
YO
4421 /* XXXX To minimize changes to RE engine we always allocate
4422 3-units-long substrs field. */
4423 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4424 if (RExC_recurse_count) {
4425 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4426 SAVEFREEPV(RExC_recurse);
4427 }
a0d0e21e 4428
07be1b83 4429reStudy:
1de06328 4430 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4431 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4432
07be1b83 4433#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4434 if (!restudied) {
4435 StructCopy(&zero_scan_data, &data, scan_data_t);
4436 copyRExC_state = RExC_state;
4437 } else {
5d458dd8 4438 U32 seen=RExC_seen;
07be1b83 4439 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4440
4441 RExC_state = copyRExC_state;
4442 if (seen & REG_TOP_LEVEL_BRANCHES)
4443 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4444 else
4445 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4446 if (data.last_found) {
07be1b83 4447 SvREFCNT_dec(data.longest_fixed);
07be1b83 4448 SvREFCNT_dec(data.longest_float);
07be1b83 4449 SvREFCNT_dec(data.last_found);
1de06328 4450 }
40d049e4 4451 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4452 }
40d049e4
YO
4453#else
4454 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4455#endif
fc8cd66c 4456
a0d0e21e 4457 /* Dig out information for optimizations. */
f7819f85 4458 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4459 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4460
a0ed51b3 4461 if (UTF)
8f6ae13c 4462 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4463 ri->regstclass = NULL;
830247a4 4464 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4465 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4466 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4467
1de06328
YO
4468 /* testing for BRANCH here tells us whether there is "must appear"
4469 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4470 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4471 I32 fake;
c5254dd6 4472 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4473 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4474 int stclass_flag;
07be1b83 4475 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4476 regnode *first= scan;
4477 regnode *first_next= regnext(first);
4478
639081d6
YO
4479 /*
4480 * Skip introductions and multiplicators >= 1
4481 * so that we can extract the 'meat' of the pattern that must
4482 * match in the large if() sequence following.
4483 * NOTE that EXACT is NOT covered here, as it is normally
4484 * picked up by the optimiser separately.
4485 *
4486 * This is unfortunate as the optimiser isnt handling lookahead
4487 * properly currently.
4488 *
4489 */
a0d0e21e 4490 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4491 /* An OR of *one* alternative - should not happen now. */
5339e136 4492 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4493 /* for now we can't handle lookbehind IFMATCH*/
4494 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4495 (OP(first) == PLUS) ||
4496 (OP(first) == MINMOD) ||
653099ff 4497 /* An {n,m} with n>0 */
5339e136
YO
4498 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4499 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4500 {
639081d6
YO
4501 /*
4502 * the only op that could be a regnode is PLUS, all the rest
4503 * will be regnode_1 or regnode_2.
4504 *
4505 */
a0d0e21e
LW
4506 if (OP(first) == PLUS)
4507 sawplus = 1;
4508 else
3dab1dad 4509 first += regarglen[OP(first)];
639081d6
YO
4510
4511 first = NEXTOPER(first);
5339e136 4512 first_next= regnext(first);
a687059c
LW
4513 }
4514
a0d0e21e
LW
4515 /* Starting-point info. */
4516 again:
786e8c11 4517 DEBUG_PEEP("first:",first,0);
07be1b83 4518 /* Ignore EXACT as we deal with it later. */
3dab1dad 4519 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4520 if (OP(first) == EXACT)
6f207bd3 4521 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4522 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4523 ri->regstclass = first;
b3c9acc1 4524 }
07be1b83 4525#ifdef TRIE_STCLASS
786e8c11 4526 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4527 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4528 {
786e8c11 4529 regnode *trie_op;
07be1b83 4530 /* this can happen only on restudy */
786e8c11 4531 if ( OP(first) == TRIE ) {
c944940b 4532 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4533 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4534 StructCopy(first,trieop,struct regnode_1);
4535 trie_op=(regnode *)trieop;
4536 } else {
c944940b 4537 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4538 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4539 StructCopy(first,trieop,struct regnode_charclass);
4540 trie_op=(regnode *)trieop;
4541 }
1de06328 4542 OP(trie_op)+=2;
786e8c11 4543 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4544 ri->regstclass = trie_op;
07be1b83
YO
4545 }
4546#endif
bfed75c6 4547 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4548 ri->regstclass = first;
3dab1dad
YO
4549 else if (PL_regkind[OP(first)] == BOUND ||
4550 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4551 ri->regstclass = first;
3dab1dad 4552 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4553 r->extflags |= (OP(first) == MBOL
4554 ? RXf_ANCH_MBOL
cad2e5aa 4555 : (OP(first) == SBOL
bbe252da
YO
4556 ? RXf_ANCH_SBOL
4557 : RXf_ANCH_BOL));
a0d0e21e 4558 first = NEXTOPER(first);
774d564b 4559 goto again;
4560 }
4561 else if (OP(first) == GPOS) {
bbe252da 4562 r->extflags |= RXf_ANCH_GPOS;
774d564b 4563 first = NEXTOPER(first);
4564 goto again;
a0d0e21e 4565 }
cf2a2b69
YO
4566 else if ((!sawopen || !RExC_sawback) &&
4567 (OP(first) == STAR &&
3dab1dad 4568 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4569 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4570 {
4571 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4572 const int type =
4573 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4574 ? RXf_ANCH_MBOL
4575 : RXf_ANCH_SBOL;
4576 r->extflags |= type;
4577 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4578 first = NEXTOPER(first);
774d564b 4579 goto again;
a0d0e21e 4580 }
b81d288d 4581 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4582 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4583 /* x+ must match at the 1st pos of run of x's */
bbe252da 4584 r->intflags |= PREGf_SKIP;
a0d0e21e 4585
c277df42 4586 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4587#ifdef TRIE_STUDY_OPT
81714fb9 4588 DEBUG_PARSE_r(
be8e71aa
YO
4589 if (!restudied)
4590 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4591 (IV)(first - scan + 1))
4592 );
4593#else
81714fb9 4594 DEBUG_PARSE_r(
be8e71aa
YO
4595 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4596 (IV)(first - scan + 1))
4597 );
4598#endif
4599
4600
a0d0e21e
LW
4601 /*
4602 * If there's something expensive in the r.e., find the
4603 * longest literal string that must appear and make it the
4604 * regmust. Resolve ties in favor of later strings, since
4605 * the regstart check works with the beginning of the r.e.
4606 * and avoiding duplication strengthens checking. Not a
4607 * strong reason, but sufficient in the absence of others.
4608 * [Now we resolve ties in favor of the earlier string if
c277df42 4609 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4610 * earlier string may buy us something the later one won't.]
4611 */
de8c5301 4612
396482e1
GA
4613 data.longest_fixed = newSVpvs("");
4614 data.longest_float = newSVpvs("");
4615 data.last_found = newSVpvs("");
c277df42
IZ
4616 data.longest = &(data.longest_fixed);
4617 first = scan;
f8fc2ecf 4618 if (!ri->regstclass) {
830247a4 4619 cl_init(pRExC_state, &ch_class);
653099ff
GS
4620 data.start_class = &ch_class;
4621 stclass_flag = SCF_DO_STCLASS_AND;
4622 } else /* XXXX Check for BOUND? */
4623 stclass_flag = 0;
cb434fcc 4624 data.last_closep = &last_close;
de8c5301 4625
1de06328 4626 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4627 &data, -1, NULL, NULL,
4628 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4629
07be1b83 4630
786e8c11
YO
4631 CHECK_RESTUDY_GOTO;
4632
4633
830247a4 4634 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4635 && data.last_start_min == 0 && data.last_end > 0
830247a4 4636 && !RExC_seen_zerolen
2bf803e2 4637 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4638 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4639 r->extflags |= RXf_CHECK_ALL;
304ee84b 4640 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4641 SvREFCNT_dec(data.last_found);
4642
1de06328
YO
4643 /* Note that code very similar to this but for anchored string
4644 follows immediately below, changes may need to be made to both.
4645 Be careful.
4646 */
a0ed51b3 4647 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4648 if (longest_float_length
c277df42
IZ
4649 || (data.flags & SF_FL_BEFORE_EOL
4650 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4651 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4652 {
1182767e 4653 I32 t,ml;
cf93c79d 4654
1de06328 4655 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4656 && data.offset_fixed == data.offset_float_min
4657 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4658 goto remove_float; /* As in (a)+. */
4659
1de06328
YO
4660 /* copy the information about the longest float from the reg_scan_data
4661 over to the program. */
33b8afdf
JH
4662 if (SvUTF8(data.longest_float)) {
4663 r->float_utf8 = data.longest_float;
c445ea15 4664 r->float_substr = NULL;
33b8afdf
JH
4665 } else {
4666 r->float_substr = data.longest_float;
c445ea15 4667 r->float_utf8 = NULL;
33b8afdf 4668 }
1de06328
YO
4669 /* float_end_shift is how many chars that must be matched that
4670 follow this item. We calculate it ahead of time as once the
4671 lookbehind offset is added in we lose the ability to correctly
4672 calculate it.*/
4673 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4674 : (I32)longest_float_length;
1de06328
YO
4675 r->float_end_shift = ml - data.offset_float_min
4676 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4677 + data.lookbehind_float;
4678 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4679 r->float_max_offset = data.offset_float_max;
1182767e 4680 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4681 r->float_max_offset -= data.lookbehind_float;
4682
cf93c79d
IZ
4683 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4684 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4685 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4686 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4687 }
4688 else {
aca2d497 4689 remove_float:
c445ea15 4690 r->float_substr = r->float_utf8 = NULL;
c277df42 4691 SvREFCNT_dec(data.longest_float);
c5254dd6 4692 longest_float_length = 0;
a0d0e21e 4693 }
c277df42 4694
1de06328
YO
4695 /* Note that code very similar to this but for floating string
4696 is immediately above, changes may need to be made to both.
4697 Be careful.
4698 */
a0ed51b3 4699 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4700 if (longest_fixed_length
c277df42
IZ
4701 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4702 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4703 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4704 {
1182767e 4705 I32 t,ml;
cf93c79d 4706
1de06328
YO
4707 /* copy the information about the longest fixed
4708 from the reg_scan_data over to the program. */
33b8afdf
JH
4709 if (SvUTF8(data.longest_fixed)) {
4710 r->anchored_utf8 = data.longest_fixed;
c445ea15 4711 r->anchored_substr = NULL;
33b8afdf
JH
4712 } else {
4713 r->anchored_substr = data.longest_fixed;
c445ea15 4714 r->anchored_utf8 = NULL;
33b8afdf 4715 }
1de06328
YO
4716 /* fixed_end_shift is how many chars that must be matched that
4717 follow this item. We calculate it ahead of time as once the
4718 lookbehind offset is added in we lose the ability to correctly
4719 calculate it.*/
4720 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4721 : (I32)longest_fixed_length;
1de06328
YO
4722 r->anchored_end_shift = ml - data.offset_fixed
4723 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4724 + data.lookbehind_fixed;
4725 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4726
cf93c79d
IZ
4727 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4728 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4729 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4730 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4731 }
4732 else {
c445ea15 4733 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4734 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4735 longest_fixed_length = 0;
a0d0e21e 4736 }
f8fc2ecf
YO
4737 if (ri->regstclass
4738 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4739 ri->regstclass = NULL;
33b8afdf
JH
4740 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4741 && stclass_flag
653099ff 4742 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4743 && !cl_is_anything(data.start_class))
4744 {
2eccd3b2 4745 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4746
f8fc2ecf 4747 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4748 struct regnode_charclass_class);
4749 StructCopy(data.start_class,
f8fc2ecf 4750 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4751 struct regnode_charclass_class);
f8fc2ecf 4752 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4753 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4754 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4755 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4756 PerlIO_printf(Perl_debug_log,
a0288114 4757 "synthetic stclass \"%s\".\n",
3f7c398e 4758 SvPVX_const(sv));});
653099ff 4759 }
c277df42
IZ
4760
4761 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4762 if (longest_fixed_length > longest_float_length) {
1de06328 4763 r->check_end_shift = r->anchored_end_shift;
c277df42 4764 r->check_substr = r->anchored_substr;
33b8afdf 4765 r->check_utf8 = r->anchored_utf8;
c277df42 4766 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4767 if (r->extflags & RXf_ANCH_SINGLE)
4768 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4769 }
4770 else {
1de06328 4771 r->check_end_shift = r->float_end_shift;
c277df42 4772 r->check_substr = r->float_substr;
33b8afdf 4773 r->check_utf8 = r->float_utf8;
1de06328
YO
4774 r->check_offset_min = r->float_min_offset;
4775 r->check_offset_max = r->float_max_offset;
a0d0e21e 4776 }
30382c73
IZ
4777 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4778 This should be changed ASAP! */
bbe252da
YO
4779 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4780 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4781 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4782 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4783 }
1de06328
YO
4784 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4785 if ( (STRLEN)minlen < longest_float_length )
4786 minlen= longest_float_length;
4787 if ( (STRLEN)minlen < longest_fixed_length )
4788 minlen= longest_fixed_length;
4789 */
a0ed51b3
LW
4790 }
4791 else {
c277df42
IZ
4792 /* Several toplevels. Best we can is to set minlen. */
4793 I32 fake;
653099ff 4794 struct regnode_charclass_class ch_class;
cb434fcc 4795 I32 last_close = 0;
c277df42 4796
5d458dd8 4797 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4798
f8fc2ecf 4799 scan = ri->program + 1;
830247a4 4800 cl_init(pRExC_state, &ch_class);
653099ff 4801 data.start_class = &ch_class;
cb434fcc 4802 data.last_closep = &last_close;
07be1b83 4803
de8c5301 4804
1de06328 4805 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4806 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4807
786e8c11 4808 CHECK_RESTUDY_GOTO;
07be1b83 4809
33b8afdf 4810 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4811 = r->float_substr = r->float_utf8 = NULL;
653099ff 4812 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4813 && !cl_is_anything(data.start_class))
4814 {
2eccd3b2 4815 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4816
f8fc2ecf 4817 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4818 struct regnode_charclass_class);
4819 StructCopy(data.start_class,
f8fc2ecf 4820 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4821 struct regnode_charclass_class);
f8fc2ecf 4822 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4823 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4824 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4825 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4826 PerlIO_printf(Perl_debug_log,
a0288114 4827 "synthetic stclass \"%s\".\n",
3f7c398e 4828 SvPVX_const(sv));});
653099ff 4829 }
a0d0e21e
LW
4830 }
4831
1de06328
YO
4832 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4833 the "real" pattern. */
cf9788e3
RGS
4834 DEBUG_OPTIMISE_r({
4835 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4836 (IV)minlen, (IV)r->minlen);
cf9788e3 4837 });
de8c5301 4838 r->minlenret = minlen;
1de06328
YO
4839 if (r->minlen < minlen)
4840 r->minlen = minlen;
4841
b81d288d 4842 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4843 r->extflags |= RXf_GPOS_SEEN;
830247a4 4844 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4845 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4846 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4847 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4848 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4849 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4850 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4851 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4852 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4853 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4854 if (RExC_paren_names)
85fbaab2 4855 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 4856 else
5daac39c 4857 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4858
7bd1e614 4859#ifdef STUPID_PATTERN_CHECKS
5509d87a 4860 if (RX_PRELEN(rx) == 0)
640f820d 4861 r->extflags |= RXf_NULL;
5509d87a 4862 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4863 /* XXX: this should happen BEFORE we compile */
4864 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4865 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4866 r->extflags |= RXf_WHITE;
5509d87a 4867 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4868 r->extflags |= RXf_START_ONLY;
f1b875a0 4869#else
5509d87a 4870 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4871 /* XXX: this should happen BEFORE we compile */
4872 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4873 else {
4874 regnode *first = ri->program + 1;
39aa8307
JH
4875 U8 fop = OP(first);
4876 U8 nop = OP(NEXTOPER(first));
7bd1e614 4877
640f820d
AB
4878 if (PL_regkind[fop] == NOTHING && nop == END)
4879 r->extflags |= RXf_NULL;
4880 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
4881 r->extflags |= RXf_START_ONLY;
4882 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4883 r->extflags |= RXf_WHITE;
4884 }
f1b875a0 4885#endif
1f1031fe
YO
4886#ifdef DEBUGGING
4887 if (RExC_paren_names) {
4888 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4889 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4890 } else
1f1031fe 4891#endif
cde0cee5 4892 ri->name_list_idx = 0;
1f1031fe 4893
40d049e4
YO
4894 if (RExC_recurse_count) {
4895 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4896 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4897 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4898 }
4899 }
f0ab9afb 4900 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
4901 /* assume we don't need to swap parens around before we match */
4902
be8e71aa
YO
4903 DEBUG_DUMP_r({
4904 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4905 regdump(r);
4906 });
7122b237
YO
4907#ifdef RE_TRACK_PATTERN_OFFSETS
4908 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4909 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4910 U32 i;
4911 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4912 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4913 for (i = 1; i <= len; i++) {
7122b237 4914 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4915 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4916 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4917 }
4918 PerlIO_printf(Perl_debug_log, "\n");
4919 });
7122b237 4920#endif
288b8c02 4921 return rx;
a687059c
LW
4922}
4923
f9f4320a 4924#undef RE_ENGINE_PTR
3dab1dad 4925
93b32b6d 4926
81714fb9 4927SV*
192b9cd1
AB
4928Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4929 const U32 flags)
4930{
7918f24d
NC
4931 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4932
192b9cd1
AB
4933 PERL_UNUSED_ARG(value);
4934
f1b875a0 4935 if (flags & RXapif_FETCH) {
192b9cd1 4936 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 4937 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
f1f66076 4938 Perl_croak(aTHX_ "%s", PL_no_modify);
192b9cd1 4939 return NULL;
f1b875a0 4940 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
4941 return reg_named_buff_exists(rx, key, flags)
4942 ? &PL_sv_yes
4943 : &PL_sv_no;
f1b875a0 4944 } else if (flags & RXapif_REGNAMES) {
192b9cd1 4945 return reg_named_buff_all(rx, flags);
f1b875a0 4946 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
4947 return reg_named_buff_scalar(rx, flags);
4948 } else {
4949 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4950 return NULL;
4951 }
4952}
4953
4954SV*
4955Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4956 const U32 flags)
4957{
7918f24d 4958 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
4959 PERL_UNUSED_ARG(lastkey);
4960
f1b875a0 4961 if (flags & RXapif_FIRSTKEY)
192b9cd1 4962 return reg_named_buff_firstkey(rx, flags);
f1b875a0 4963 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
4964 return reg_named_buff_nextkey(rx, flags);
4965 else {
4966 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4967 return NULL;
4968 }
4969}
4970
4971SV*
288b8c02
NC
4972Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4973 const U32 flags)
81714fb9 4974{
44a2ac75
YO
4975 AV *retarray = NULL;
4976 SV *ret;
288b8c02 4977 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
4978
4979 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4980
f1b875a0 4981 if (flags & RXapif_ALL)
44a2ac75 4982 retarray=newAV();
93b32b6d 4983
5daac39c
NC
4984 if (rx && RXp_PAREN_NAMES(rx)) {
4985 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
4986 if (he_str) {
4987 IV i;
4988 SV* sv_dat=HeVAL(he_str);
4989 I32 *nums=(I32*)SvPVX(sv_dat);
4990 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
4991 if ((I32)(rx->nparens) >= nums[i]
4992 && rx->offs[nums[i]].start != -1
4993 && rx->offs[nums[i]].end != -1)
93b32b6d 4994 {
49d7dfbc 4995 ret = newSVpvs("");
288b8c02 4996 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
4997 if (!retarray)
4998 return ret;
4999 } else {
5000 ret = newSVsv(&PL_sv_undef);
5001 }
ec83ea38 5002 if (retarray)
93b32b6d 5003 av_push(retarray, ret);
81714fb9 5004 }
93b32b6d 5005 if (retarray)
ad64d0ec 5006 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5007 }
5008 }
5009 return NULL;
5010}
5011
5012bool
288b8c02 5013Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5014 const U32 flags)
5015{
288b8c02 5016 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5017
5018 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5019
5daac39c 5020 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5021 if (flags & RXapif_ALL) {
5daac39c 5022 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5023 } else {
288b8c02 5024 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5025 if (sv) {
5026 SvREFCNT_dec(sv);
192b9cd1
AB
5027 return TRUE;
5028 } else {
5029 return FALSE;
5030 }
5031 }
5032 } else {
5033 return FALSE;
5034 }
5035}
5036
5037SV*
288b8c02 5038Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5039{
288b8c02 5040 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5041
5042 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5043
5daac39c
NC
5044 if ( rx && RXp_PAREN_NAMES(rx) ) {
5045 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5046
288b8c02 5047 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5048 } else {
5049 return FALSE;
5050 }
192b9cd1
AB
5051}
5052
5053SV*
288b8c02 5054Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5055{
288b8c02 5056 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5057 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5058
5059 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5060
5daac39c
NC
5061 if (rx && RXp_PAREN_NAMES(rx)) {
5062 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5063 HE *temphe;
5064 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5065 IV i;
5066 IV parno = 0;
5067 SV* sv_dat = HeVAL(temphe);
5068 I32 *nums = (I32*)SvPVX(sv_dat);
5069 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5070 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5071 rx->offs[nums[i]].start != -1 &&
5072 rx->offs[nums[i]].end != -1)
5073 {
5074 parno = nums[i];
5075 break;
5076 }
5077 }
f1b875a0 5078 if (parno || flags & RXapif_ALL) {
a663657d 5079 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5080 }
81714fb9
YO
5081 }
5082 }
44a2ac75
YO
5083 return NULL;
5084}
5085
192b9cd1 5086SV*
288b8c02 5087Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5088{
5089 SV *ret;
5090 AV *av;
5091 I32 length;
288b8c02 5092 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5093
7918f24d
NC
5094 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5095
5daac39c 5096 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5097 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5098 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5099 } else if (flags & RXapif_ONE) {
288b8c02 5100 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5101 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5102 length = av_len(av);
ec83ea38 5103 SvREFCNT_dec(ret);
192b9cd1
AB
5104 return newSViv(length + 1);
5105 } else {
5106 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5107 return NULL;
5108 }
5109 }
5110 return &PL_sv_undef;
5111}
5112
5113SV*
288b8c02 5114Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5115{
288b8c02 5116 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5117 AV *av = newAV();
5118
7918f24d
NC
5119 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5120
5daac39c
NC
5121 if (rx && RXp_PAREN_NAMES(rx)) {
5122 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5123 HE *temphe;
5124 (void)hv_iterinit(hv);
5125 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5126 IV i;
5127 IV parno = 0;
5128 SV* sv_dat = HeVAL(temphe);
5129 I32 *nums = (I32*)SvPVX(sv_dat);
5130 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5131 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5132 rx->offs[nums[i]].start != -1 &&
5133 rx->offs[nums[i]].end != -1)
5134 {
5135 parno = nums[i];
5136 break;
5137 }
5138 }
f1b875a0 5139 if (parno || flags & RXapif_ALL) {
a663657d 5140 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5141 }
5142 }
5143 }
5144
ad64d0ec 5145 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5146}
5147
49d7dfbc 5148void
288b8c02
NC
5149Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5150 SV * const sv)
44a2ac75 5151{
288b8c02 5152 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5153 char *s = NULL;
a9d504c3 5154 I32 i = 0;
44a2ac75 5155 I32 s1, t1;
7918f24d
NC
5156
5157 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5158
cde0cee5
YO
5159 if (!rx->subbeg) {
5160 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5161 return;
cde0cee5
YO
5162 }
5163 else
f1b875a0 5164 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5165 /* $` */
f0ab9afb 5166 i = rx->offs[0].start;
cde0cee5 5167 s = rx->subbeg;
44a2ac75
YO
5168 }
5169 else
f1b875a0 5170 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5171 /* $' */
f0ab9afb
NC
5172 s = rx->subbeg + rx->offs[0].end;
5173 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5174 }
5175 else
5176 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5177 (s1 = rx->offs[paren].start) != -1 &&
5178 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5179 {
5180 /* $& $1 ... */
5181 i = t1 - s1;
5182 s = rx->subbeg + s1;
cde0cee5
YO
5183 } else {
5184 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5185 return;
cde0cee5
YO
5186 }
5187 assert(rx->sublen >= (s - rx->subbeg) + i );
5188 if (i >= 0) {
5189 const int oldtainted = PL_tainted;
5190 TAINT_NOT;
5191 sv_setpvn(sv, s, i);
5192 PL_tainted = oldtainted;
5193 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5194 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5195 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5196 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5197 {
5198 SvUTF8_on(sv);
5199 }
5200 else
5201 SvUTF8_off(sv);
5202 if (PL_tainting) {
07bc277f 5203 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5204 if (SvTYPE(sv) >= SVt_PVMG) {
5205 MAGIC* const mg = SvMAGIC(sv);
5206 MAGIC* mgt;
5207 PL_tainted = 1;
5208 SvMAGIC_set(sv, mg->mg_moremagic);
5209 SvTAINT(sv);
5210 if ((mgt = SvMAGIC(sv))) {
5211 mg->mg_moremagic = mgt;
5212 SvMAGIC_set(sv, mg);
44a2ac75 5213 }
cde0cee5
YO
5214 } else {
5215 PL_tainted = 1;
5216 SvTAINT(sv);
5217 }
5218 } else
5219 SvTAINTED_off(sv);
44a2ac75 5220 }
81714fb9 5221 } else {
44a2ac75 5222 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5223 return;
81714fb9
YO
5224 }
5225}
93b32b6d 5226
2fdbfb4d
AB
5227void
5228Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5229 SV const * const value)
5230{
7918f24d
NC
5231 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5232
2fdbfb4d
AB
5233 PERL_UNUSED_ARG(rx);
5234 PERL_UNUSED_ARG(paren);
5235 PERL_UNUSED_ARG(value);
5236
5237 if (!PL_localizing)
f1f66076 5238 Perl_croak(aTHX_ "%s", PL_no_modify);
2fdbfb4d
AB
5239}
5240
5241I32
288b8c02 5242Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5243 const I32 paren)
5244{
288b8c02 5245 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5246 I32 i;
5247 I32 s1, t1;
5248
7918f24d
NC
5249 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5250
2fdbfb4d
AB
5251 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5252 switch (paren) {
192b9cd1 5253 /* $` / ${^PREMATCH} */
f1b875a0 5254 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5255 if (rx->offs[0].start != -1) {
5256 i = rx->offs[0].start;
5257 if (i > 0) {
5258 s1 = 0;
5259 t1 = i;
5260 goto getlen;
5261 }
5262 }
5263 return 0;
192b9cd1 5264 /* $' / ${^POSTMATCH} */
f1b875a0 5265 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5266 if (rx->offs[0].end != -1) {
5267 i = rx->sublen - rx->offs[0].end;
5268 if (i > 0) {
5269 s1 = rx->offs[0].end;
5270 t1 = rx->sublen;
5271 goto getlen;
5272 }
5273 }
5274 return 0;
192b9cd1
AB
5275 /* $& / ${^MATCH}, $1, $2, ... */
5276 default:
2fdbfb4d
AB
5277 if (paren <= (I32)rx->nparens &&
5278 (s1 = rx->offs[paren].start) != -1 &&
5279 (t1 = rx->offs[paren].end) != -1)
5280 {
5281 i = t1 - s1;
5282 goto getlen;
5283 } else {
5284 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5285 report_uninit((const SV *)sv);
2fdbfb4d
AB
5286 return 0;
5287 }
5288 }
5289 getlen:
07bc277f 5290 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5291 const char * const s = rx->subbeg + s1;
5292 const U8 *ep;
5293 STRLEN el;
5294
5295 i = t1 - s1;
5296 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5297 i = el;
5298 }
5299 return i;
5300}
5301
fe578d7f 5302SV*
49d7dfbc 5303Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5304{
7918f24d 5305 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5306 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5307 if (0)
5308 return NULL;
5309 else
5310 return newSVpvs("Regexp");
fe578d7f 5311}
0a4db386 5312
894be9b7 5313/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5314 * If flags is REG_RSN_RETURN_NULL returns null.
5315 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5316 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5317 * to the parsed name as looked up in the RExC_paren_names hash.
5318 * If there is an error throws a vFAIL().. type exception.
894be9b7 5319 */
0a4db386
YO
5320
5321#define REG_RSN_RETURN_NULL 0
5322#define REG_RSN_RETURN_NAME 1
5323#define REG_RSN_RETURN_DATA 2
5324
894be9b7 5325STATIC SV*
7918f24d
NC
5326S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5327{
894be9b7 5328 char *name_start = RExC_parse;
1f1031fe 5329
7918f24d
NC
5330 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5331
1f1031fe
YO
5332 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5333 /* skip IDFIRST by using do...while */
5334 if (UTF)
5335 do {
5336 RExC_parse += UTF8SKIP(RExC_parse);
5337 } while (isALNUM_utf8((U8*)RExC_parse));
5338 else
5339 do {
5340 RExC_parse++;
5341 } while (isALNUM(*RExC_parse));
894be9b7 5342 }
1f1031fe 5343
0a4db386 5344 if ( flags ) {
59cd0e26
NC
5345 SV* sv_name
5346 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5347 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5348 if ( flags == REG_RSN_RETURN_NAME)
5349 return sv_name;
5350 else if (flags==REG_RSN_RETURN_DATA) {
5351 HE *he_str = NULL;
5352 SV *sv_dat = NULL;
5353 if ( ! sv_name ) /* should not happen*/
5354 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5355 if (RExC_paren_names)
5356 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5357 if ( he_str )
5358 sv_dat = HeVAL(he_str);
5359 if ( ! sv_dat )
5360 vFAIL("Reference to nonexistent named group");
5361 return sv_dat;
5362 }
5363 else {
5364 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5365 }
5366 /* NOT REACHED */
894be9b7 5367 }
0a4db386 5368 return NULL;
894be9b7
YO
5369}
5370
3dab1dad
YO
5371#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5372 int rem=(int)(RExC_end - RExC_parse); \
5373 int cut; \
5374 int num; \
5375 int iscut=0; \
5376 if (rem>10) { \
5377 rem=10; \
5378 iscut=1; \
5379 } \
5380 cut=10-rem; \
5381 if (RExC_lastparse!=RExC_parse) \
5382 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5383 rem, RExC_parse, \
5384 cut + 4, \
5385 iscut ? "..." : "<" \
5386 ); \
5387 else \
5388 PerlIO_printf(Perl_debug_log,"%16s",""); \
5389 \
5390 if (SIZE_ONLY) \
3b57cd43 5391 num = RExC_size + 1; \
3dab1dad
YO
5392 else \
5393 num=REG_NODE_NUM(RExC_emit); \
5394 if (RExC_lastnum!=num) \
0a4db386 5395 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5396 else \
0a4db386 5397 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5398 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5399 (int)((depth*2)), "", \
3dab1dad
YO
5400 (funcname) \
5401 ); \
5402 RExC_lastnum=num; \
5403 RExC_lastparse=RExC_parse; \
5404})
5405
07be1b83
YO
5406
5407
3dab1dad
YO
5408#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5409 DEBUG_PARSE_MSG((funcname)); \
5410 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5411})
6bda09f9
YO
5412#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5413 DEBUG_PARSE_MSG((funcname)); \
5414 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5415})
a687059c
LW
5416/*
5417 - reg - regular expression, i.e. main body or parenthesized thing
5418 *
5419 * Caller must absorb opening parenthesis.
5420 *
5421 * Combining parenthesis handling with the base level of regular expression
5422 * is a trifle forced, but the need to tie the tails of the branches to what
5423 * follows makes it hard to avoid.
5424 */
07be1b83
YO
5425#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5426#ifdef DEBUGGING
5427#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5428#else
5429#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5430#endif
3dab1dad 5431
76e3520e 5432STATIC regnode *
3dab1dad 5433S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5434 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5435{
27da23d5 5436 dVAR;
c277df42
IZ
5437 register regnode *ret; /* Will be the head of the group. */
5438 register regnode *br;
5439 register regnode *lastbr;
cbbf8932 5440 register regnode *ender = NULL;
a0d0e21e 5441 register I32 parno = 0;
cbbf8932 5442 I32 flags;
f7819f85 5443 U32 oregflags = RExC_flags;
6136c704
AL
5444 bool have_branch = 0;
5445 bool is_open = 0;
594d7033
YO
5446 I32 freeze_paren = 0;
5447 I32 after_freeze = 0;
9d1d55b5
JP
5448
5449 /* for (?g), (?gc), and (?o) warnings; warning
5450 about (?c) will warn about (?g) -- japhy */
5451
6136c704
AL
5452#define WASTED_O 0x01
5453#define WASTED_G 0x02
5454#define WASTED_C 0x04
5455#define WASTED_GC (0x02|0x04)
cbbf8932 5456 I32 wastedflags = 0x00;
9d1d55b5 5457
fac92740 5458 char * parse_start = RExC_parse; /* MJD */
a28509cc 5459 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5460
3dab1dad 5461 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5462
5463 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5464 DEBUG_PARSE("reg ");
5465
821b33a5 5466 *flagp = 0; /* Tentatively. */
a0d0e21e 5467
9d1d55b5 5468
a0d0e21e
LW
5469 /* Make an OPEN node, if parenthesized. */
5470 if (paren) {
e2e6a0f1
YO
5471 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5472 char *start_verb = RExC_parse;
5473 STRLEN verb_len = 0;
5474 char *start_arg = NULL;
5475 unsigned char op = 0;
5476 int argok = 1;
5477 int internal_argval = 0; /* internal_argval is only useful if !argok */
5478 while ( *RExC_parse && *RExC_parse != ')' ) {
5479 if ( *RExC_parse == ':' ) {
5480 start_arg = RExC_parse + 1;
5481 break;
5482 }
5483 RExC_parse++;
5484 }
5485 ++start_verb;
5486 verb_len = RExC_parse - start_verb;
5487 if ( start_arg ) {
5488 RExC_parse++;
5489 while ( *RExC_parse && *RExC_parse != ')' )
5490 RExC_parse++;
5491 if ( *RExC_parse != ')' )
5492 vFAIL("Unterminated verb pattern argument");
5493 if ( RExC_parse == start_arg )
5494 start_arg = NULL;
5495 } else {
5496 if ( *RExC_parse != ')' )
5497 vFAIL("Unterminated verb pattern");
5498 }
5d458dd8 5499
e2e6a0f1
YO
5500 switch ( *start_verb ) {
5501 case 'A': /* (*ACCEPT) */
568a785a 5502 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5503 op = ACCEPT;
5504 internal_argval = RExC_nestroot;
5505 }
5506 break;
5507 case 'C': /* (*COMMIT) */
568a785a 5508 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5509 op = COMMIT;
e2e6a0f1
YO
5510 break;
5511 case 'F': /* (*FAIL) */
568a785a 5512 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5513 op = OPFAIL;
5514 argok = 0;
5515 }
5516 break;
5d458dd8
YO
5517 case ':': /* (*:NAME) */
5518 case 'M': /* (*MARK:NAME) */
568a785a 5519 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5520 op = MARKPOINT;
5d458dd8
YO
5521 argok = -1;
5522 }
5523 break;
5524 case 'P': /* (*PRUNE) */
568a785a 5525 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5526 op = PRUNE;
e2e6a0f1 5527 break;
5d458dd8 5528 case 'S': /* (*SKIP) */
568a785a 5529 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5530 op = SKIP;
5531 break;
5532 case 'T': /* (*THEN) */
5533 /* [19:06] <TimToady> :: is then */
568a785a 5534 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5535 op = CUTGROUP;
5536 RExC_seen |= REG_SEEN_CUTGROUP;
5537 }
e2e6a0f1
YO
5538 break;
5539 }
5540 if ( ! op ) {
5541 RExC_parse++;
5542 vFAIL3("Unknown verb pattern '%.*s'",
5543 verb_len, start_verb);
5544 }
5545 if ( argok ) {
5546 if ( start_arg && internal_argval ) {
5547 vFAIL3("Verb pattern '%.*s' may not have an argument",
5548 verb_len, start_verb);
5549 } else if ( argok < 0 && !start_arg ) {
5550 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5551 verb_len, start_verb);
5552 } else {
5553 ret = reganode(pRExC_state, op, internal_argval);
5554 if ( ! internal_argval && ! SIZE_ONLY ) {
5555 if (start_arg) {
5556 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5557 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5558 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5559 ret->flags = 0;
5560 } else {
5561 ret->flags = 1;
5562 }
5563 }
5564 }
5565 if (!internal_argval)
5566 RExC_seen |= REG_SEEN_VERBARG;
5567 } else if ( start_arg ) {
5568 vFAIL3("Verb pattern '%.*s' may not have an argument",
5569 verb_len, start_verb);
5570 } else {
5571 ret = reg_node(pRExC_state, op);
5572 }
5573 nextchar(pRExC_state);
5574 return ret;
5575 } else
fac92740 5576 if (*RExC_parse == '?') { /* (?...) */
6136c704 5577 bool is_logical = 0;
a28509cc 5578 const char * const seqstart = RExC_parse;
ca9dfc88 5579
830247a4
IZ
5580 RExC_parse++;
5581 paren = *RExC_parse++;
c277df42 5582 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5583 switch (paren) {
894be9b7 5584
1f1031fe
YO
5585 case 'P': /* (?P...) variants for those used to PCRE/Python */
5586 paren = *RExC_parse++;
5587 if ( paren == '<') /* (?P<...>) named capture */
5588 goto named_capture;
5589 else if (paren == '>') { /* (?P>name) named recursion */
5590 goto named_recursion;
5591 }
5592 else if (paren == '=') { /* (?P=...) named backref */
5593 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5594 you change this make sure you change that */
5595 char* name_start = RExC_parse;
5596 U32 num = 0;
5597 SV *sv_dat = reg_scan_name(pRExC_state,
5598 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5599 if (RExC_parse == name_start || *RExC_parse != ')')
5600 vFAIL2("Sequence %.3s... not terminated",parse_start);
5601
5602 if (!SIZE_ONLY) {
5603 num = add_data( pRExC_state, 1, "S" );
5604 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5605 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5606 }
5607 RExC_sawback = 1;
5608 ret = reganode(pRExC_state,
5609 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5610 num);
5611 *flagp |= HASWIDTH;
5612
5613 Set_Node_Offset(ret, parse_start+1);
5614 Set_Node_Cur_Length(ret); /* MJD */
5615
5616 nextchar(pRExC_state);
5617 return ret;
5618 }
57b84237
YO
5619 RExC_parse++;
5620 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5621 /*NOTREACHED*/
5622 case '<': /* (?<...) */
b81d288d 5623 if (*RExC_parse == '!')
c277df42 5624 paren = ',';
0a4db386 5625 else if (*RExC_parse != '=')
1f1031fe 5626 named_capture:
0a4db386 5627 { /* (?<...>) */
81714fb9 5628 char *name_start;
894be9b7 5629 SV *svname;
81714fb9
YO
5630 paren= '>';
5631 case '\'': /* (?'...') */
5632 name_start= RExC_parse;
0a4db386
YO
5633 svname = reg_scan_name(pRExC_state,
5634 SIZE_ONLY ? /* reverse test from the others */
5635 REG_RSN_RETURN_NAME :
5636 REG_RSN_RETURN_NULL);
57b84237
YO
5637 if (RExC_parse == name_start) {
5638 RExC_parse++;
5639 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5640 /*NOTREACHED*/
5641 }
81714fb9
YO
5642 if (*RExC_parse != paren)
5643 vFAIL2("Sequence (?%c... not terminated",
5644 paren=='>' ? '<' : paren);
5645 if (SIZE_ONLY) {
e62cc96a
YO
5646 HE *he_str;
5647 SV *sv_dat = NULL;
894be9b7
YO
5648 if (!svname) /* shouldnt happen */
5649 Perl_croak(aTHX_
5650 "panic: reg_scan_name returned NULL");
81714fb9
YO
5651 if (!RExC_paren_names) {
5652 RExC_paren_names= newHV();
ad64d0ec 5653 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5654#ifdef DEBUGGING
5655 RExC_paren_name_list= newAV();
ad64d0ec 5656 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5657#endif
81714fb9
YO
5658 }
5659 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5660 if ( he_str )
81714fb9 5661 sv_dat = HeVAL(he_str);
e62cc96a 5662 if ( ! sv_dat ) {
81714fb9 5663 /* croak baby croak */
e62cc96a
YO
5664 Perl_croak(aTHX_
5665 "panic: paren_name hash element allocation failed");
5666 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5667 /* (?|...) can mean we have dupes so scan to check
5668 its already been stored. Maybe a flag indicating
5669 we are inside such a construct would be useful,
5670 but the arrays are likely to be quite small, so
5671 for now we punt -- dmq */
5672 IV count = SvIV(sv_dat);
5673 I32 *pv = (I32*)SvPVX(sv_dat);
5674 IV i;
5675 for ( i = 0 ; i < count ; i++ ) {
5676 if ( pv[i] == RExC_npar ) {
5677 count = 0;
5678 break;
5679 }
5680 }
5681 if ( count ) {
5682 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5683 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5684 pv[count] = RExC_npar;
3a92e6ae 5685 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5686 }
81714fb9
YO
5687 } else {
5688 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5689 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5690 SvIOK_on(sv_dat);
3ec35e0f 5691 SvIV_set(sv_dat, 1);
e62cc96a 5692 }
1f1031fe
YO
5693#ifdef DEBUGGING
5694 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5695 SvREFCNT_dec(svname);
5696#endif
e62cc96a 5697
81714fb9
YO
5698 /*sv_dump(sv_dat);*/
5699 }
5700 nextchar(pRExC_state);
5701 paren = 1;
5702 goto capturing_parens;
5703 }
5704 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5705 RExC_parse++;
fac92740 5706 case '=': /* (?=...) */
89c6a13e
AB
5707 RExC_seen_zerolen++;
5708 break;
fac92740 5709 case '!': /* (?!...) */
830247a4 5710 RExC_seen_zerolen++;
e2e6a0f1
YO
5711 if (*RExC_parse == ')') {
5712 ret=reg_node(pRExC_state, OPFAIL);
5713 nextchar(pRExC_state);
5714 return ret;
5715 }
594d7033
YO
5716 break;
5717 case '|': /* (?|...) */
5718 /* branch reset, behave like a (?:...) except that
5719 buffers in alternations share the same numbers */
5720 paren = ':';
5721 after_freeze = freeze_paren = RExC_npar;
5722 break;
fac92740
MJD
5723 case ':': /* (?:...) */
5724 case '>': /* (?>...) */
a0d0e21e 5725 break;
fac92740
MJD
5726 case '$': /* (?$...) */
5727 case '@': /* (?@...) */
8615cb43 5728 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5729 break;
fac92740 5730 case '#': /* (?#...) */
830247a4
IZ
5731 while (*RExC_parse && *RExC_parse != ')')
5732 RExC_parse++;
5733 if (*RExC_parse != ')')
c277df42 5734 FAIL("Sequence (?#... not terminated");
830247a4 5735 nextchar(pRExC_state);
a0d0e21e
LW
5736 *flagp = TRYAGAIN;
5737 return NULL;
894be9b7
YO
5738 case '0' : /* (?0) */
5739 case 'R' : /* (?R) */
5740 if (*RExC_parse != ')')
6bda09f9 5741 FAIL("Sequence (?R) not terminated");
1a147d38 5742 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5743 *flagp |= POSTPONED;
7f69552c
YO
5744 nextchar(pRExC_state);
5745 return ret;
5746 /*notreached*/
894be9b7
YO
5747 { /* named and numeric backreferences */
5748 I32 num;
894be9b7
YO
5749 case '&': /* (?&NAME) */
5750 parse_start = RExC_parse - 1;
1f1031fe 5751 named_recursion:
894be9b7 5752 {
0a4db386
YO
5753 SV *sv_dat = reg_scan_name(pRExC_state,
5754 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5755 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5756 }
5757 goto gen_recurse_regop;
5758 /* NOT REACHED */
542fa716
YO
5759 case '+':
5760 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5761 RExC_parse++;
5762 vFAIL("Illegal pattern");
5763 }
5764 goto parse_recursion;
5765 /* NOT REACHED*/
5766 case '-': /* (?-1) */
5767 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5768 RExC_parse--; /* rewind to let it be handled later */
5769 goto parse_flags;
5770 }
5771 /*FALLTHROUGH */
6bda09f9
YO
5772 case '1': case '2': case '3': case '4': /* (?1) */
5773 case '5': case '6': case '7': case '8': case '9':
5774 RExC_parse--;
542fa716 5775 parse_recursion:
894be9b7
YO
5776 num = atoi(RExC_parse);
5777 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5778 if (*RExC_parse == '-')
5779 RExC_parse++;
6bda09f9
YO
5780 while (isDIGIT(*RExC_parse))
5781 RExC_parse++;
5782 if (*RExC_parse!=')')
5783 vFAIL("Expecting close bracket");
894be9b7
YO
5784
5785 gen_recurse_regop:
542fa716
YO
5786 if ( paren == '-' ) {
5787 /*
5788 Diagram of capture buffer numbering.
5789 Top line is the normal capture buffer numbers
5790 Botton line is the negative indexing as from
5791 the X (the (?-2))
5792
5793 + 1 2 3 4 5 X 6 7
5794 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5795 - 5 4 3 2 1 X x x
5796
5797 */
5798 num = RExC_npar + num;
5799 if (num < 1) {
5800 RExC_parse++;
5801 vFAIL("Reference to nonexistent group");
5802 }
5803 } else if ( paren == '+' ) {
5804 num = RExC_npar + num - 1;
5805 }
5806
1a147d38 5807 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5808 if (!SIZE_ONLY) {
5809 if (num > (I32)RExC_rx->nparens) {
5810 RExC_parse++;
5811 vFAIL("Reference to nonexistent group");
5812 }
40d049e4 5813 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5814 RExC_emit++;
226de585 5815 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5816 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5817 } else {
6bda09f9 5818 RExC_size++;
6bda09f9 5819 }
0a4db386 5820 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5821 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5822 Set_Node_Offset(ret, parse_start); /* MJD */
5823
a3b492c3 5824 *flagp |= POSTPONED;
6bda09f9
YO
5825 nextchar(pRExC_state);
5826 return ret;
894be9b7
YO
5827 } /* named and numeric backreferences */
5828 /* NOT REACHED */
5829
fac92740 5830 case '?': /* (??...) */
6136c704 5831 is_logical = 1;
57b84237
YO
5832 if (*RExC_parse != '{') {
5833 RExC_parse++;
5834 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5835 /*NOTREACHED*/
5836 }
a3b492c3 5837 *flagp |= POSTPONED;
830247a4 5838 paren = *RExC_parse++;
0f5d15d6 5839 /* FALL THROUGH */
fac92740 5840 case '{': /* (?{...}) */
c277df42 5841 {
2eccd3b2
NC
5842 I32 count = 1;
5843 U32 n = 0;
c277df42 5844 char c;
830247a4 5845 char *s = RExC_parse;
c277df42 5846
830247a4
IZ
5847 RExC_seen_zerolen++;
5848 RExC_seen |= REG_SEEN_EVAL;
5849 while (count && (c = *RExC_parse)) {
6136c704
AL
5850 if (c == '\\') {
5851 if (RExC_parse[1])
5852 RExC_parse++;
5853 }
b81d288d 5854 else if (c == '{')
c277df42 5855 count++;
b81d288d 5856 else if (c == '}')
c277df42 5857 count--;
830247a4 5858 RExC_parse++;
c277df42 5859 }
6136c704 5860 if (*RExC_parse != ')') {
b81d288d 5861 RExC_parse = s;
b45f050a
JF
5862 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5863 }
c277df42 5864 if (!SIZE_ONLY) {
f3548bdc 5865 PAD *pad;
6136c704
AL
5866 OP_4tree *sop, *rop;
5867 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5868
569233ed
SB
5869 ENTER;
5870 Perl_save_re_context(aTHX);
f3548bdc 5871 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5872 sop->op_private |= OPpREFCOUNTED;
5873 /* re_dup will OpREFCNT_inc */
5874 OpREFCNT_set(sop, 1);
569233ed 5875 LEAVE;
c277df42 5876
830247a4 5877 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5878 RExC_rxi->data->data[n] = (void*)rop;
5879 RExC_rxi->data->data[n+1] = (void*)sop;
5880 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5881 SvREFCNT_dec(sv);
a0ed51b3 5882 }
e24b16f9 5883 else { /* First pass */
830247a4 5884 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5885 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5886 /* No compiled RE interpolated, has runtime
5887 components ===> unsafe. */
5888 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5889 if (PL_tainting && PL_tainted)
cc6b7395 5890 FAIL("Eval-group in insecure regular expression");
54df2634 5891#if PERL_VERSION > 8
923e4eb5 5892 if (IN_PERL_COMPILETIME)
b5c19bd7 5893 PL_cv_has_eval = 1;
54df2634 5894#endif
c277df42 5895 }
b5c19bd7 5896
830247a4 5897 nextchar(pRExC_state);
6136c704 5898 if (is_logical) {
830247a4 5899 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5900 if (!SIZE_ONLY)
5901 ret->flags = 2;
3dab1dad 5902 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5903 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5904 return ret;
5905 }
ccb2c380
MP
5906 ret = reganode(pRExC_state, EVAL, n);
5907 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5908 Set_Node_Offset(ret, parse_start);
5909 return ret;
c277df42 5910 }
fac92740 5911 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5912 {
0a4db386 5913 int is_define= 0;
fac92740 5914 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5915 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5916 || RExC_parse[1] == '<'
830247a4 5917 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5918 I32 flag;
5919
830247a4 5920 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5921 if (!SIZE_ONLY)
5922 ret->flags = 1;
3dab1dad 5923 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5924 goto insert_if;
b81d288d 5925 }
a0ed51b3 5926 }
0a4db386
YO
5927 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5928 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5929 {
5930 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5931 char *name_start= RExC_parse++;
2eccd3b2 5932 U32 num = 0;
0a4db386
YO
5933 SV *sv_dat=reg_scan_name(pRExC_state,
5934 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5935 if (RExC_parse == name_start || *RExC_parse != ch)
5936 vFAIL2("Sequence (?(%c... not terminated",
5937 (ch == '>' ? '<' : ch));
5938 RExC_parse++;
5939 if (!SIZE_ONLY) {
5940 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5941 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5942 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
5943 }
5944 ret = reganode(pRExC_state,NGROUPP,num);
5945 goto insert_if_check_paren;
5946 }
5947 else if (RExC_parse[0] == 'D' &&
5948 RExC_parse[1] == 'E' &&
5949 RExC_parse[2] == 'F' &&
5950 RExC_parse[3] == 'I' &&
5951 RExC_parse[4] == 'N' &&
5952 RExC_parse[5] == 'E')
5953 {
5954 ret = reganode(pRExC_state,DEFINEP,0);
5955 RExC_parse +=6 ;
5956 is_define = 1;
5957 goto insert_if_check_paren;
5958 }
5959 else if (RExC_parse[0] == 'R') {
5960 RExC_parse++;
5961 parno = 0;
5962 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5963 parno = atoi(RExC_parse++);
5964 while (isDIGIT(*RExC_parse))
5965 RExC_parse++;
5966 } else if (RExC_parse[0] == '&') {
5967 SV *sv_dat;
5968 RExC_parse++;
5969 sv_dat = reg_scan_name(pRExC_state,
5970 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5971 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5972 }
1a147d38 5973 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5974 goto insert_if_check_paren;
5975 }
830247a4 5976 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5977 /* (?(1)...) */
6136c704 5978 char c;
830247a4 5979 parno = atoi(RExC_parse++);
c277df42 5980
830247a4
IZ
5981 while (isDIGIT(*RExC_parse))
5982 RExC_parse++;
fac92740 5983 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5984
0a4db386 5985 insert_if_check_paren:
830247a4 5986 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5987 vFAIL("Switch condition not recognized");
c277df42 5988 insert_if:
3dab1dad
YO
5989 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5990 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5991 if (br == NULL)
830247a4 5992 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5993 else
3dab1dad 5994 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5995 c = *nextchar(pRExC_state);
d1b80229
IZ
5996 if (flags&HASWIDTH)
5997 *flagp |= HASWIDTH;
c277df42 5998 if (c == '|') {
0a4db386
YO
5999 if (is_define)
6000 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6001 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6002 regbranch(pRExC_state, &flags, 1,depth+1);
6003 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6004 if (flags&HASWIDTH)
6005 *flagp |= HASWIDTH;
830247a4 6006 c = *nextchar(pRExC_state);
a0ed51b3
LW
6007 }
6008 else
c277df42
IZ
6009 lastbr = NULL;
6010 if (c != ')')
8615cb43 6011 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6012 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6013 REGTAIL(pRExC_state, br, ender);
c277df42 6014 if (lastbr) {
3dab1dad
YO
6015 REGTAIL(pRExC_state, lastbr, ender);
6016 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6017 }
6018 else
3dab1dad 6019 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6020 RExC_size++; /* XXX WHY do we need this?!!
6021 For large programs it seems to be required
6022 but I can't figure out why. -- dmq*/
c277df42 6023 return ret;
a0ed51b3
LW
6024 }
6025 else {
830247a4 6026 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6027 }
6028 }
1b1626e4 6029 case 0:
830247a4 6030 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6031 vFAIL("Sequence (? incomplete");
1b1626e4 6032 break;
a0d0e21e 6033 default:
cde0cee5
YO
6034 --RExC_parse;
6035 parse_flags: /* (?i) */
6036 {
6037 U32 posflags = 0, negflags = 0;
6038 U32 *flagsp = &posflags;
6039
6040 while (*RExC_parse) {
6041 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6042 /* (?g), (?gc) and (?o) are useless here
6043 and must be globally applied -- japhy */
cde0cee5
YO
6044 switch (*RExC_parse) {
6045 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
f7819f85
A
6046 case ONCE_PAT_MOD: /* 'o' */
6047 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6048 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6049 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6050 if (! (wastedflags & wflagbit) ) {
6051 wastedflags |= wflagbit;
6052 vWARN5(
6053 RExC_parse + 1,
6054 "Useless (%s%c) - %suse /%c modifier",
6055 flagsp == &negflags ? "?-" : "?",
6056 *RExC_parse,
6057 flagsp == &negflags ? "don't " : "",
6058 *RExC_parse
6059 );
6060 }
6061 }
cde0cee5
YO
6062 break;
6063
f7819f85 6064 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6065 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6066 if (! (wastedflags & WASTED_C) ) {
6067 wastedflags |= WASTED_GC;
9d1d55b5
JP
6068 vWARN3(
6069 RExC_parse + 1,
6070 "Useless (%sc) - %suse /gc modifier",
6071 flagsp == &negflags ? "?-" : "?",
6072 flagsp == &negflags ? "don't " : ""
6073 );
6074 }
6075 }
cde0cee5 6076 break;
f7819f85 6077 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5
YO
6078 if (flagsp == &negflags) {
6079 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
f7819f85 6080 vWARN(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6081 } else {
6082 *flagsp |= RXf_PMf_KEEPCOPY;
6083 }
6084 break;
6085 case '-':
57b84237
YO
6086 if (flagsp == &negflags) {
6087 RExC_parse++;
6088 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6089 /*NOTREACHED*/
6090 }
cde0cee5
YO
6091 flagsp = &negflags;
6092 wastedflags = 0; /* reset so (?g-c) warns twice */
6093 break;
6094 case ':':
6095 paren = ':';
6096 /*FALLTHROUGH*/
6097 case ')':
6098 RExC_flags |= posflags;
6099 RExC_flags &= ~negflags;
f7819f85
A
6100 if (paren != ':') {
6101 oregflags |= posflags;
6102 oregflags &= ~negflags;
6103 }
cde0cee5
YO
6104 nextchar(pRExC_state);
6105 if (paren != ':') {
6106 *flagp = TRYAGAIN;
6107 return NULL;
6108 } else {
6109 ret = NULL;
6110 goto parse_rest;
6111 }
6112 /*NOTREACHED*/
6113 default:
cde0cee5
YO
6114 RExC_parse++;
6115 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6116 /*NOTREACHED*/
6117 }
830247a4 6118 ++RExC_parse;
48c036b1 6119 }
cde0cee5 6120 }} /* one for the default block, one for the switch */
a0d0e21e 6121 }
fac92740 6122 else { /* (...) */
81714fb9 6123 capturing_parens:
830247a4
IZ
6124 parno = RExC_npar;
6125 RExC_npar++;
e2e6a0f1 6126
830247a4 6127 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6128 if (!SIZE_ONLY ){
6129 if (!RExC_nestroot)
6130 RExC_nestroot = parno;
c009da3d
YO
6131 if (RExC_seen & REG_SEEN_RECURSE
6132 && !RExC_open_parens[parno-1])
6133 {
e2e6a0f1 6134 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6135 "Setting open paren #%"IVdf" to %d\n",
6136 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6137 RExC_open_parens[parno-1]= ret;
6138 }
6bda09f9 6139 }
fac92740
MJD
6140 Set_Node_Length(ret, 1); /* MJD */
6141 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6142 is_open = 1;
a0d0e21e 6143 }
a0ed51b3 6144 }
fac92740 6145 else /* ! paren */
a0d0e21e 6146 ret = NULL;
cde0cee5
YO
6147
6148 parse_rest:
a0d0e21e 6149 /* Pick up the branches, linking them together. */
fac92740 6150 parse_start = RExC_parse; /* MJD */
3dab1dad 6151 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6152
6153 if (freeze_paren) {
6154 if (RExC_npar > after_freeze)
6155 after_freeze = RExC_npar;
6156 RExC_npar = freeze_paren;
6157 }
6158
fac92740 6159 /* branch_len = (paren != 0); */
2af232bd 6160
a0d0e21e
LW
6161 if (br == NULL)
6162 return(NULL);
830247a4
IZ
6163 if (*RExC_parse == '|') {
6164 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6165 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6166 }
fac92740 6167 else { /* MJD */
6bda09f9 6168 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6169 Set_Node_Length(br, paren != 0);
6170 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6171 }
c277df42
IZ
6172 have_branch = 1;
6173 if (SIZE_ONLY)
830247a4 6174 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6175 }
6176 else if (paren == ':') {
c277df42
IZ
6177 *flagp |= flags&SIMPLE;
6178 }
6136c704 6179 if (is_open) { /* Starts with OPEN. */
3dab1dad 6180 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6181 }
6182 else if (paren != '?') /* Not Conditional */
a0d0e21e 6183 ret = br;
8ae10a67 6184 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6185 lastbr = br;
830247a4
IZ
6186 while (*RExC_parse == '|') {
6187 if (!SIZE_ONLY && RExC_extralen) {
6188 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6189 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6190 }
6191 if (SIZE_ONLY)
830247a4
IZ
6192 RExC_extralen += 2; /* Account for LONGJMP. */
6193 nextchar(pRExC_state);
594d7033
YO
6194 if (freeze_paren) {
6195 if (RExC_npar > after_freeze)
6196 after_freeze = RExC_npar;
6197 RExC_npar = freeze_paren;
6198 }
3dab1dad 6199 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6200
a687059c 6201 if (br == NULL)
a0d0e21e 6202 return(NULL);
3dab1dad 6203 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6204 lastbr = br;
8ae10a67 6205 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6206 }
6207
c277df42
IZ
6208 if (have_branch || paren != ':') {
6209 /* Make a closing node, and hook it on the end. */
6210 switch (paren) {
6211 case ':':
830247a4 6212 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6213 break;
6214 case 1:
830247a4 6215 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6216 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6217 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6218 "Setting close paren #%"IVdf" to %d\n",
6219 (IV)parno, REG_NODE_NUM(ender)));
6220 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6221 if (RExC_nestroot == parno)
6222 RExC_nestroot = 0;
40d049e4 6223 }
fac92740
MJD
6224 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6225 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6226 break;
6227 case '<':
c277df42
IZ
6228 case ',':
6229 case '=':
6230 case '!':
c277df42 6231 *flagp &= ~HASWIDTH;
821b33a5
IZ
6232 /* FALL THROUGH */
6233 case '>':
830247a4 6234 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6235 break;
6236 case 0:
830247a4 6237 ender = reg_node(pRExC_state, END);
40d049e4
YO
6238 if (!SIZE_ONLY) {
6239 assert(!RExC_opend); /* there can only be one! */
6240 RExC_opend = ender;
6241 }
c277df42
IZ
6242 break;
6243 }
eaf3ca90 6244 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6245
9674d46a 6246 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6247 if (depth==1)
6248 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6249
c277df42 6250 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6251 for (br = ret; br; br = regnext(br)) {
6252 const U8 op = PL_regkind[OP(br)];
6253 if (op == BRANCH) {
07be1b83 6254 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6255 }
6256 else if (op == BRANCHJ) {
07be1b83 6257 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6258 }
c277df42
IZ
6259 }
6260 }
a0d0e21e 6261 }
c277df42
IZ
6262
6263 {
e1ec3a88
AL
6264 const char *p;
6265 static const char parens[] = "=!<,>";
c277df42
IZ
6266
6267 if (paren && (p = strchr(parens, paren))) {
eb160463 6268 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6269 int flag = (p - parens) > 1;
6270
6271 if (paren == '>')
6272 node = SUSPEND, flag = 0;
6bda09f9 6273 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6274 Set_Node_Cur_Length(ret);
6275 Set_Node_Offset(ret, parse_start + 1);
c277df42 6276 ret->flags = flag;
07be1b83 6277 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6278 }
a0d0e21e
LW
6279 }
6280
6281 /* Check for proper termination. */
ce3e6498 6282 if (paren) {
e2509266 6283 RExC_flags = oregflags;
830247a4
IZ
6284 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6285 RExC_parse = oregcomp_parse;
380a0633 6286 vFAIL("Unmatched (");
ce3e6498 6287 }
a0ed51b3 6288 }
830247a4
IZ
6289 else if (!paren && RExC_parse < RExC_end) {
6290 if (*RExC_parse == ')') {
6291 RExC_parse++;
380a0633 6292 vFAIL("Unmatched )");
a0ed51b3
LW
6293 }
6294 else
b45f050a 6295 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6296 /* NOTREACHED */
6297 }
594d7033
YO
6298 if (after_freeze)
6299 RExC_npar = after_freeze;
a0d0e21e 6300 return(ret);
a687059c
LW
6301}
6302
6303/*
6304 - regbranch - one alternative of an | operator
6305 *
6306 * Implements the concatenation operator.
6307 */
76e3520e 6308STATIC regnode *
3dab1dad 6309S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6310{
97aff369 6311 dVAR;
c277df42
IZ
6312 register regnode *ret;
6313 register regnode *chain = NULL;
6314 register regnode *latest;
6315 I32 flags = 0, c = 0;
3dab1dad 6316 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6317
6318 PERL_ARGS_ASSERT_REGBRANCH;
6319
3dab1dad 6320 DEBUG_PARSE("brnc");
02daf0ab 6321
b81d288d 6322 if (first)
c277df42
IZ
6323 ret = NULL;
6324 else {
b81d288d 6325 if (!SIZE_ONLY && RExC_extralen)
830247a4 6326 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6327 else {
830247a4 6328 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6329 Set_Node_Length(ret, 1);
6330 }
c277df42
IZ
6331 }
6332
b81d288d 6333 if (!first && SIZE_ONLY)
830247a4 6334 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6335
c277df42 6336 *flagp = WORST; /* Tentatively. */
a0d0e21e 6337
830247a4
IZ
6338 RExC_parse--;
6339 nextchar(pRExC_state);
6340 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6341 flags &= ~TRYAGAIN;
3dab1dad 6342 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6343 if (latest == NULL) {
6344 if (flags & TRYAGAIN)
6345 continue;
6346 return(NULL);
a0ed51b3
LW
6347 }
6348 else if (ret == NULL)
c277df42 6349 ret = latest;
8ae10a67 6350 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6351 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6352 *flagp |= flags&SPSTART;
6353 else {
830247a4 6354 RExC_naughty++;
3dab1dad 6355 REGTAIL(pRExC_state, chain, latest);
a687059c 6356 }
a0d0e21e 6357 chain = latest;
c277df42
IZ
6358 c++;
6359 }
6360 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6361 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6362 if (ret == NULL)
6363 ret = chain;
6364 }
6365 if (c == 1) {
6366 *flagp |= flags&SIMPLE;
a0d0e21e 6367 }
a687059c 6368
d4c19fe8 6369 return ret;
a687059c
LW
6370}
6371
6372/*
6373 - regpiece - something followed by possible [*+?]
6374 *
6375 * Note that the branching code sequences used for ? and the general cases
6376 * of * and + are somewhat optimized: they use the same NOTHING node as
6377 * both the endmarker for their branch list and the body of the last branch.
6378 * It might seem that this node could be dispensed with entirely, but the
6379 * endmarker role is not redundant.
6380 */
76e3520e 6381STATIC regnode *
3dab1dad 6382S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6383{
97aff369 6384 dVAR;
c277df42 6385 register regnode *ret;
a0d0e21e
LW
6386 register char op;
6387 register char *next;
6388 I32 flags;
1df70142 6389 const char * const origparse = RExC_parse;
a0d0e21e 6390 I32 min;
c277df42 6391 I32 max = REG_INFTY;
fac92740 6392 char *parse_start;
10edeb5d 6393 const char *maxpos = NULL;
3dab1dad 6394 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6395
6396 PERL_ARGS_ASSERT_REGPIECE;
6397
3dab1dad 6398 DEBUG_PARSE("piec");
a0d0e21e 6399
3dab1dad 6400 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6401 if (ret == NULL) {
6402 if (flags & TRYAGAIN)
6403 *flagp |= TRYAGAIN;
6404 return(NULL);
6405 }
6406
830247a4 6407 op = *RExC_parse;
a0d0e21e 6408
830247a4 6409 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6410 maxpos = NULL;
fac92740 6411 parse_start = RExC_parse; /* MJD */
830247a4 6412 next = RExC_parse + 1;
a0d0e21e
LW
6413 while (isDIGIT(*next) || *next == ',') {
6414 if (*next == ',') {
6415 if (maxpos)
6416 break;
6417 else
6418 maxpos = next;
a687059c 6419 }
a0d0e21e
LW
6420 next++;
6421 }
6422 if (*next == '}') { /* got one */
6423 if (!maxpos)
6424 maxpos = next;
830247a4
IZ
6425 RExC_parse++;
6426 min = atoi(RExC_parse);
a0d0e21e
LW
6427 if (*maxpos == ',')
6428 maxpos++;
6429 else
830247a4 6430 maxpos = RExC_parse;
a0d0e21e
LW
6431 max = atoi(maxpos);
6432 if (!max && *maxpos != '0')
c277df42
IZ
6433 max = REG_INFTY; /* meaning "infinity" */
6434 else if (max >= REG_INFTY)
8615cb43 6435 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6436 RExC_parse = next;
6437 nextchar(pRExC_state);
a0d0e21e
LW
6438
6439 do_curly:
6440 if ((flags&SIMPLE)) {
830247a4 6441 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6442 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6443 Set_Node_Offset(ret, parse_start+1); /* MJD */
6444 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6445 }
6446 else {
3dab1dad 6447 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6448
6449 w->flags = 0;
3dab1dad 6450 REGTAIL(pRExC_state, ret, w);
830247a4 6451 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6452 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6453 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6454 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6455 }
6bda09f9 6456 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6457 /* MJD hk */
6458 Set_Node_Offset(ret, parse_start+1);
2af232bd 6459 Set_Node_Length(ret,
fac92740 6460 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6461
830247a4 6462 if (!SIZE_ONLY && RExC_extralen)
c277df42 6463 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6464 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6465 if (SIZE_ONLY)
830247a4
IZ
6466 RExC_whilem_seen++, RExC_extralen += 3;
6467 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6468 }
c277df42 6469 ret->flags = 0;
a0d0e21e
LW
6470
6471 if (min > 0)
821b33a5
IZ
6472 *flagp = WORST;
6473 if (max > 0)
6474 *flagp |= HASWIDTH;
8fa23287 6475 if (max < min)
8615cb43 6476 vFAIL("Can't do {n,m} with n > m");
c277df42 6477 if (!SIZE_ONLY) {
eb160463
GS
6478 ARG1_SET(ret, (U16)min);
6479 ARG2_SET(ret, (U16)max);
a687059c 6480 }
a687059c 6481
a0d0e21e 6482 goto nest_check;
a687059c 6483 }
a0d0e21e 6484 }
a687059c 6485
a0d0e21e
LW
6486 if (!ISMULT1(op)) {
6487 *flagp = flags;
a687059c 6488 return(ret);
a0d0e21e 6489 }
bb20fd44 6490
c277df42 6491#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6492
6493 /* if this is reinstated, don't forget to put this back into perldiag:
6494
6495 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6496
6497 (F) The part of the regexp subject to either the * or + quantifier
6498 could match an empty string. The {#} shows in the regular
6499 expression about where the problem was discovered.
6500
6501 */
6502
bb20fd44 6503 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6504 vFAIL("Regexp *+ operand could be empty");
b81d288d 6505#endif
bb20fd44 6506
fac92740 6507 parse_start = RExC_parse;
830247a4 6508 nextchar(pRExC_state);
a0d0e21e 6509
821b33a5 6510 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6511
6512 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6513 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6514 ret->flags = 0;
830247a4 6515 RExC_naughty += 4;
a0d0e21e
LW
6516 }
6517 else if (op == '*') {
6518 min = 0;
6519 goto do_curly;
a0ed51b3
LW
6520 }
6521 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6522 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6523 ret->flags = 0;
830247a4 6524 RExC_naughty += 3;
a0d0e21e
LW
6525 }
6526 else if (op == '+') {
6527 min = 1;
6528 goto do_curly;
a0ed51b3
LW
6529 }
6530 else if (op == '?') {
a0d0e21e
LW
6531 min = 0; max = 1;
6532 goto do_curly;
6533 }
6534 nest_check:
a3b492c3 6535 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 6536 vWARN3(RExC_parse,
b45f050a 6537 "%.*s matches null string many times",
afd78fd5 6538 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 6539 origparse);
a0d0e21e
LW
6540 }
6541
b9b4dddf 6542 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6543 nextchar(pRExC_state);
6bda09f9 6544 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6545 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6546 }
b9b4dddf
YO
6547#ifndef REG_ALLOW_MINMOD_SUSPEND
6548 else
6549#endif
6550 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6551 regnode *ender;
6552 nextchar(pRExC_state);
6553 ender = reg_node(pRExC_state, SUCCEED);
6554 REGTAIL(pRExC_state, ret, ender);
6555 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6556 ret->flags = 0;
6557 ender = reg_node(pRExC_state, TAIL);
6558 REGTAIL(pRExC_state, ret, ender);
6559 /*ret= ender;*/
6560 }
6561
6562 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6563 RExC_parse++;
b45f050a
JF
6564 vFAIL("Nested quantifiers");
6565 }
a0d0e21e
LW
6566
6567 return(ret);
a687059c
LW
6568}
6569
fc8cd66c
YO
6570
6571/* reg_namedseq(pRExC_state,UVp)
6572
6573 This is expected to be called by a parser routine that has
afefe6bf 6574 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6575 expected to point at the first char following the N at the time
6576 of the call.
6577
6578 If valuep is non-null then it is assumed that we are parsing inside
6579 of a charclass definition and the first codepoint in the resolved
6580 string is returned via *valuep and the routine will return NULL.
6581 In this mode if a multichar string is returned from the charnames
6582 handler a warning will be issued, and only the first char in the
6583 sequence will be examined. If the string returned is zero length
6584 then the value of *valuep is undefined and NON-NULL will
6585 be returned to indicate failure. (This will NOT be a valid pointer
6586 to a regnode.)
6587
afefe6bf 6588 If valuep is null then it is assumed that we are parsing normal text
fc8cd66c
YO
6589 and inserts a new EXACT node into the program containing the resolved
6590 string and returns a pointer to the new node. If the string is
6591 zerolength a NOTHING node is emitted.
afefe6bf 6592
fc8cd66c
YO
6593 On success RExC_parse is set to the char following the endbrace.
6594 Parsing failures will generate a fatal errorvia vFAIL(...)
6595
6596 NOTE: We cache all results from the charnames handler locally in
6597 the RExC_charnames hash (created on first use) to prevent a charnames
6598 handler from playing silly-buggers and returning a short string and
6599 then a long string for a given pattern. Since the regexp program
6600 size is calculated during an initial parse this would result
6601 in a buffer overrun so we cache to prevent the charname result from
6602 changing during the course of the parse.
6603
6604 */
6605STATIC regnode *
afefe6bf 6606S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c
YO
6607{
6608 char * name; /* start of the content of the name */
6609 char * endbrace; /* endbrace following the name */
6610 SV *sv_str = NULL;
6611 SV *sv_name = NULL;
6612 STRLEN len; /* this has various purposes throughout the code */
6613 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6614 regnode *ret = NULL;
7918f24d
NC
6615
6616 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6617
afefe6bf
RGS
6618 if (*RExC_parse != '{' ||
6619 (*RExC_parse == '{' && RExC_parse[1]
6620 && strchr("0123456789", RExC_parse[1])))
6621 {
6622 GET_RE_DEBUG_FLAGS_DECL;
6623 if (valuep)
6624 /* no bare \N in a charclass */
6625 vFAIL("Missing braces on \\N{}");
6626 GET_RE_DEBUG_FLAGS;
6627 nextchar(pRExC_state);
6628 ret = reg_node(pRExC_state, REG_ANY);
6629 *flagp |= HASWIDTH|SIMPLE;
6630 RExC_naughty++;
6631 RExC_parse--;
6632 Set_Node_Length(ret, 1); /* MJD */
6633 return ret;
fc8cd66c
YO
6634 }
6635 name = RExC_parse+1;
6636 endbrace = strchr(RExC_parse, '}');
6637 if ( ! endbrace ) {
6638 RExC_parse++;
6639 vFAIL("Missing right brace on \\N{}");
6640 }
6641 RExC_parse = endbrace + 1;
6642
6643
6644 /* RExC_parse points at the beginning brace,
6645 endbrace points at the last */
6646 if ( name[0]=='U' && name[1]=='+' ) {
38a44b82 6647 /* its a "Unicode hex" notation {U+89AB} */
fc8cd66c
YO
6648 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6649 | PERL_SCAN_DISALLOW_PREFIX
6650 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6651 UV cp;
196f1508 6652 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6653 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6654 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6655 cp = 0xFFFD;
6656 }
fc8cd66c 6657 if ( valuep ) {
a4893424 6658 if (cp > 0xff) RExC_utf8 = 1;
fc8cd66c
YO
6659 *valuep = cp;
6660 return NULL;
6661 }
a4893424
KW
6662
6663 /* Need to convert to utf8 if either: won't fit into a byte, or the re
6664 * is going to be in utf8 and the representation changes under utf8. */
6665 if (cp > 0xff || (RExC_utf8 && ! UNI_IS_INVARIANT(cp))) {
6666 U8 string[UTF8_MAXBYTES+1];
6667 U8 *tmps;
6668 RExC_utf8 = 1;
6669 tmps = uvuni_to_utf8(string, cp);
6670 sv_str = newSVpvn_utf8((char*)string, tmps - string, TRUE);
6671 } else { /* Otherwise, no need for utf8, can skip that step */
6672 char string;
6673 string = (char)cp;
6674 sv_str= newSVpvn(&string, 1);
6675 }
fc8cd66c
YO
6676 } else {
6677 /* fetch the charnames handler for this scope */
6678 HV * const table = GvHV(PL_hintgv);
6679 SV **cvp= table ?
6680 hv_fetchs(table, "charnames", FALSE) :
6681 NULL;
6682 SV *cv= cvp ? *cvp : NULL;
6683 HE *he_str;
6684 int count;
6685 /* create an SV with the name as argument */
6686 sv_name = newSVpvn(name, endbrace - name);
6687
6688 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6689 vFAIL2("Constant(\\N{%s}) unknown: "
6690 "(possibly a missing \"use charnames ...\")",
6691 SvPVX(sv_name));
6692 }
6693 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6694 vFAIL2("Constant(\\N{%s}): "
6695 "$^H{charnames} is not defined",SvPVX(sv_name));
6696 }
6697
6698
6699
6700 if (!RExC_charnames) {
6701 /* make sure our cache is allocated */
6702 RExC_charnames = newHV();
ad64d0ec 6703 sv_2mortal(MUTABLE_SV(RExC_charnames));
fc8cd66c
YO
6704 }
6705 /* see if we have looked this one up before */
6706 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6707 if ( he_str ) {
6708 sv_str = HeVAL(he_str);
6709 cached = 1;
6710 } else {
6711 dSP ;
6712
6713 ENTER ;
6714 SAVETMPS ;
6715 PUSHMARK(SP) ;
6716
6717 XPUSHs(sv_name);
6718
6719 PUTBACK ;
6720
6721 count= call_sv(cv, G_SCALAR);
6722
6723 if (count == 1) { /* XXXX is this right? dmq */
6724 sv_str = POPs;
6725 SvREFCNT_inc_simple_void(sv_str);
6726 }
6727
6728 SPAGAIN ;
6729 PUTBACK ;
6730 FREETMPS ;
6731 LEAVE ;
6732
6733 if ( !sv_str || !SvOK(sv_str) ) {
6734 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6735 "did not return a defined value",SvPVX(sv_name));
6736 }
6737 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6738 cached = 1;
6739 }
6740 }
6741 if (valuep) {
6742 char *p = SvPV(sv_str, len);
6743 if (len) {
6744 STRLEN numlen = 1;
6745 if ( SvUTF8(sv_str) ) {
196f1508 6746 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6747 if (*valuep > 0x7F)
6748 RExC_utf8 = 1;
6749 /* XXXX
6750 We have to turn on utf8 for high bit chars otherwise
6751 we get failures with
6752
6753 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6754 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6755
6756 This is different from what \x{} would do with the same
6757 codepoint, where the condition is > 0xFF.
6758 - dmq
6759 */
6760
6761
6762 } else {
6763 *valuep = (UV)*p;
6764 /* warn if we havent used the whole string? */
6765 }
6766 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6767 vWARN2(RExC_parse,
6768 "Ignoring excess chars from \\N{%s} in character class",
6769 SvPVX(sv_name)
6770 );
6771 }
6772 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6773 vWARN2(RExC_parse,
6774 "Ignoring zero length \\N{%s} in character class",
6775 SvPVX(sv_name)
6776 );
6777 }
6778 if (sv_name)
6779 SvREFCNT_dec(sv_name);
6780 if (!cached)
6781 SvREFCNT_dec(sv_str);
6782 return len ? NULL : (regnode *)&len;
6783 } else if(SvCUR(sv_str)) {
6784
6785 char *s;
6786 char *p, *pend;
6787 STRLEN charlen = 1;
d008bc60 6788#ifdef DEBUGGING
fc8cd66c 6789 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6790#endif
fc8cd66c
YO
6791 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6792
6793 ret = reg_node(pRExC_state,
6794 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6795 s= STRING(ret);
6796
6797 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6798 sv_utf8_upgrade(sv_str);
6799 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6800 RExC_utf8= 1;
6801 }
6802
6803 p = SvPV(sv_str, len);
6804 pend = p + len;
6805 /* len is the length written, charlen is the size the char read */
6806 for ( len = 0; p < pend; p += charlen ) {
6807 if (UTF) {
196f1508 6808 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6809 if (FOLD) {
6810 STRLEN foldlen,numlen;
6811 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6812 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6813 /* Emit all the Unicode characters. */
6814
6815 for (foldbuf = tmpbuf;
6816 foldlen;
6817 foldlen -= numlen)
6818 {
6819 uvc = utf8_to_uvchr(foldbuf, &numlen);
6820 if (numlen > 0) {
6821 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6822 s += unilen;
6823 len += unilen;
6824 /* In EBCDIC the numlen
6825 * and unilen can differ. */
6826 foldbuf += numlen;
6827 if (numlen >= foldlen)
6828 break;
6829 }
6830 else
6831 break; /* "Can't happen." */
6832 }
6833 } else {
6834 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6835 if (unilen > 0) {
6836 s += unilen;
6837 len += unilen;
6838 }
6839 }
6840 } else {
6841 len++;
6842 REGC(*p, s++);
6843 }
6844 }
6845 if (SIZE_ONLY) {
6846 RExC_size += STR_SZ(len);
6847 } else {
6848 STR_LEN(ret) = len;
6849 RExC_emit += STR_SZ(len);
6850 }
6851 Set_Node_Cur_Length(ret); /* MJD */
6852 RExC_parse--;
6853 nextchar(pRExC_state);
a4893424 6854 } else { /* zero length */
fc8cd66c
YO
6855 ret = reg_node(pRExC_state,NOTHING);
6856 }
6857 if (!cached) {
6858 SvREFCNT_dec(sv_str);
6859 }
6860 if (sv_name) {
6861 SvREFCNT_dec(sv_name);
6862 }
6863 return ret;
6864
6865}
6866
6867
9e08bc66
TS
6868/*
6869 * reg_recode
6870 *
6871 * It returns the code point in utf8 for the value in *encp.
6872 * value: a code value in the source encoding
6873 * encp: a pointer to an Encode object
6874 *
6875 * If the result from Encode is not a single character,
6876 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6877 */
6878STATIC UV
6879S_reg_recode(pTHX_ const char value, SV **encp)
6880{
6881 STRLEN numlen = 1;
59cd0e26 6882 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 6883 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
6884 const STRLEN newlen = SvCUR(sv);
6885 UV uv = UNICODE_REPLACEMENT;
6886
7918f24d
NC
6887 PERL_ARGS_ASSERT_REG_RECODE;
6888
9e08bc66
TS
6889 if (newlen)
6890 uv = SvUTF8(sv)
6891 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6892 : *(U8*)s;
6893
6894 if (!newlen || numlen != newlen) {
6895 uv = UNICODE_REPLACEMENT;
c86f7df5 6896 *encp = NULL;
9e08bc66
TS
6897 }
6898 return uv;
6899}
6900
fc8cd66c 6901
a687059c
LW
6902/*
6903 - regatom - the lowest level
ee9b8eae
YO
6904
6905 Try to identify anything special at the start of the pattern. If there
6906 is, then handle it as required. This may involve generating a single regop,
6907 such as for an assertion; or it may involve recursing, such as to
6908 handle a () structure.
6909
6910 If the string doesn't start with something special then we gobble up
6911 as much literal text as we can.
6912
6913 Once we have been able to handle whatever type of thing started the
6914 sequence, we return.
6915
6916 Note: we have to be careful with escapes, as they can be both literal
6917 and special, and in the case of \10 and friends can either, depending
6918 on context. Specifically there are two seperate switches for handling
6919 escape sequences, with the one for handling literal escapes requiring
6920 a dummy entry for all of the special escapes that are actually handled
6921 by the other.
6922*/
6923
76e3520e 6924STATIC regnode *
3dab1dad 6925S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6926{
97aff369 6927 dVAR;
cbbf8932 6928 register regnode *ret = NULL;
a0d0e21e 6929 I32 flags;
45948336 6930 char *parse_start = RExC_parse;
3dab1dad
YO
6931 GET_RE_DEBUG_FLAGS_DECL;
6932 DEBUG_PARSE("atom");
a0d0e21e
LW
6933 *flagp = WORST; /* Tentatively. */
6934
7918f24d 6935 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 6936
a0d0e21e 6937tryagain:
f9a79580 6938 switch ((U8)*RExC_parse) {
a0d0e21e 6939 case '^':
830247a4
IZ
6940 RExC_seen_zerolen++;
6941 nextchar(pRExC_state);
bbe252da 6942 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6943 ret = reg_node(pRExC_state, MBOL);
bbe252da 6944 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6945 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6946 else
830247a4 6947 ret = reg_node(pRExC_state, BOL);
fac92740 6948 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6949 break;
6950 case '$':
830247a4 6951 nextchar(pRExC_state);
b81d288d 6952 if (*RExC_parse)
830247a4 6953 RExC_seen_zerolen++;
bbe252da 6954 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6955 ret = reg_node(pRExC_state, MEOL);
bbe252da 6956 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6957 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6958 else
830247a4 6959 ret = reg_node(pRExC_state, EOL);
fac92740 6960 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6961 break;
6962 case '.':
830247a4 6963 nextchar(pRExC_state);
bbe252da 6964 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6965 ret = reg_node(pRExC_state, SANY);
6966 else
6967 ret = reg_node(pRExC_state, REG_ANY);
6968 *flagp |= HASWIDTH|SIMPLE;
830247a4 6969 RExC_naughty++;
fac92740 6970 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6971 break;
6972 case '[':
b45f050a 6973 {
3dab1dad
YO
6974 char * const oregcomp_parse = ++RExC_parse;
6975 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6976 if (*RExC_parse != ']') {
6977 RExC_parse = oregcomp_parse;
b45f050a
JF
6978 vFAIL("Unmatched [");
6979 }
830247a4 6980 nextchar(pRExC_state);
a0d0e21e 6981 *flagp |= HASWIDTH|SIMPLE;
fac92740 6982 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6983 break;
b45f050a 6984 }
a0d0e21e 6985 case '(':
830247a4 6986 nextchar(pRExC_state);
3dab1dad 6987 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6988 if (ret == NULL) {
bf93d4cc 6989 if (flags & TRYAGAIN) {
830247a4 6990 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6991 /* Make parent create an empty node if needed. */
6992 *flagp |= TRYAGAIN;
6993 return(NULL);
6994 }
a0d0e21e 6995 goto tryagain;
bf93d4cc 6996 }
a0d0e21e
LW
6997 return(NULL);
6998 }
a3b492c3 6999 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
7000 break;
7001 case '|':
7002 case ')':
7003 if (flags & TRYAGAIN) {
7004 *flagp |= TRYAGAIN;
7005 return NULL;
7006 }
b45f050a 7007 vFAIL("Internal urp");
a0d0e21e
LW
7008 /* Supposed to be caught earlier. */
7009 break;
85afd4ae 7010 case '{':
830247a4
IZ
7011 if (!regcurly(RExC_parse)) {
7012 RExC_parse++;
85afd4ae
CS
7013 goto defchar;
7014 }
7015 /* FALL THROUGH */
a0d0e21e
LW
7016 case '?':
7017 case '+':
7018 case '*':
830247a4 7019 RExC_parse++;
b45f050a 7020 vFAIL("Quantifier follows nothing");
a0d0e21e 7021 break;
f9a79580
RGS
7022 case 0xDF:
7023 case 0xC3:
7024 case 0xCE:
a0a388a1 7025 do_foldchar:
56d400ed 7026 if (!LOC && FOLD) {
e64b1bd1 7027 U32 len,cp;
7cf3a6a3 7028 len=0; /* silence a spurious compiler warning */
56d400ed 7029 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7030 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7031 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7032 ret = reganode(pRExC_state, FOLDCHAR, cp);
7033 Set_Node_Length(ret, 1); /* MJD */
7034 nextchar(pRExC_state); /* kill whitespace under /x */
7035 return ret;
7036 }
7037 }
7038 goto outer_default;
a0d0e21e 7039 case '\\':
ee9b8eae
YO
7040 /* Special Escapes
7041
7042 This switch handles escape sequences that resolve to some kind
7043 of special regop and not to literal text. Escape sequnces that
7044 resolve to literal text are handled below in the switch marked
7045 "Literal Escapes".
7046
7047 Every entry in this switch *must* have a corresponding entry
7048 in the literal escape switch. However, the opposite is not
7049 required, as the default for this switch is to jump to the
7050 literal text handling code.
7051 */
a0a388a1
YO
7052 switch ((U8)*++RExC_parse) {
7053 case 0xDF:
7054 case 0xC3:
7055 case 0xCE:
7056 goto do_foldchar;
ee9b8eae 7057 /* Special Escapes */
a0d0e21e 7058 case 'A':
830247a4
IZ
7059 RExC_seen_zerolen++;
7060 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7061 *flagp |= SIMPLE;
ee9b8eae 7062 goto finish_meta_pat;
a0d0e21e 7063 case 'G':
830247a4
IZ
7064 ret = reg_node(pRExC_state, GPOS);
7065 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7066 *flagp |= SIMPLE;
ee9b8eae
YO
7067 goto finish_meta_pat;
7068 case 'K':
7069 RExC_seen_zerolen++;
7070 ret = reg_node(pRExC_state, KEEPS);
7071 *flagp |= SIMPLE;
37923168
RGS
7072 /* XXX:dmq : disabling in-place substitution seems to
7073 * be necessary here to avoid cases of memory corruption, as
7074 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7075 */
7076 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7077 goto finish_meta_pat;
a0d0e21e 7078 case 'Z':
830247a4 7079 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7080 *flagp |= SIMPLE;
a1917ab9 7081 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7082 goto finish_meta_pat;
b85d18e9 7083 case 'z':
830247a4 7084 ret = reg_node(pRExC_state, EOS);
b85d18e9 7085 *flagp |= SIMPLE;
830247a4 7086 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7087 goto finish_meta_pat;
4a2d328f 7088 case 'C':
f33976b4
DB
7089 ret = reg_node(pRExC_state, CANY);
7090 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7091 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7092 goto finish_meta_pat;
a0ed51b3 7093 case 'X':
830247a4 7094 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7095 *flagp |= HASWIDTH;
ee9b8eae 7096 goto finish_meta_pat;
a0d0e21e 7097 case 'w':
eb160463 7098 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 7099 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7100 goto finish_meta_pat;
a0d0e21e 7101 case 'W':
eb160463 7102 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 7103 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7104 goto finish_meta_pat;
a0d0e21e 7105 case 'b':
830247a4
IZ
7106 RExC_seen_zerolen++;
7107 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7108 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 7109 *flagp |= SIMPLE;
ee9b8eae 7110 goto finish_meta_pat;
a0d0e21e 7111 case 'B':
830247a4
IZ
7112 RExC_seen_zerolen++;
7113 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7114 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 7115 *flagp |= SIMPLE;
ee9b8eae 7116 goto finish_meta_pat;
a0d0e21e 7117 case 's':
eb160463 7118 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 7119 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7120 goto finish_meta_pat;
a0d0e21e 7121 case 'S':
eb160463 7122 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 7123 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7124 goto finish_meta_pat;
a0d0e21e 7125 case 'd':
ffc61ed2 7126 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7127 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7128 goto finish_meta_pat;
a0d0e21e 7129 case 'D':
ffc61ed2 7130 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7131 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7132 goto finish_meta_pat;
e1d1eefb
YO
7133 case 'R':
7134 ret = reg_node(pRExC_state, LNBREAK);
7135 *flagp |= HASWIDTH|SIMPLE;
7136 goto finish_meta_pat;
7137 case 'h':
7138 ret = reg_node(pRExC_state, HORIZWS);
7139 *flagp |= HASWIDTH|SIMPLE;
7140 goto finish_meta_pat;
7141 case 'H':
7142 ret = reg_node(pRExC_state, NHORIZWS);
7143 *flagp |= HASWIDTH|SIMPLE;
7144 goto finish_meta_pat;
ee9b8eae 7145 case 'v':
e1d1eefb
YO
7146 ret = reg_node(pRExC_state, VERTWS);
7147 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7148 goto finish_meta_pat;
7149 case 'V':
e1d1eefb
YO
7150 ret = reg_node(pRExC_state, NVERTWS);
7151 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7152 finish_meta_pat:
830247a4 7153 nextchar(pRExC_state);
fac92740 7154 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7155 break;
a14b48bc
LW
7156 case 'p':
7157 case 'P':
3568d838 7158 {
3dab1dad 7159 char* const oldregxend = RExC_end;
d008bc60 7160#ifdef DEBUGGING
ccb2c380 7161 char* parse_start = RExC_parse - 2;
d008bc60 7162#endif
a14b48bc 7163
830247a4 7164 if (RExC_parse[1] == '{') {
3568d838 7165 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7166 RExC_end = strchr(RExC_parse, '}');
7167 if (!RExC_end) {
3dab1dad 7168 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7169 RExC_parse += 2;
7170 RExC_end = oldregxend;
0da60cf5 7171 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7172 }
830247a4 7173 RExC_end++;
a14b48bc 7174 }
af6f566e 7175 else {
830247a4 7176 RExC_end = RExC_parse + 2;
af6f566e
HS
7177 if (RExC_end > oldregxend)
7178 RExC_end = oldregxend;
7179 }
830247a4 7180 RExC_parse--;
a14b48bc 7181
3dab1dad 7182 ret = regclass(pRExC_state,depth+1);
a14b48bc 7183
830247a4
IZ
7184 RExC_end = oldregxend;
7185 RExC_parse--;
ccb2c380
MP
7186
7187 Set_Node_Offset(ret, parse_start + 2);
7188 Set_Node_Cur_Length(ret);
830247a4 7189 nextchar(pRExC_state);
a14b48bc
LW
7190 *flagp |= HASWIDTH|SIMPLE;
7191 }
7192 break;
fc8cd66c 7193 case 'N':
afefe6bf 7194 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7195 multicharacter. join_exact() will join them up later on.
7196 Also this makes sure that things like /\N{BLAH}+/ and
7197 \N{BLAH} being multi char Just Happen. dmq*/
7198 ++RExC_parse;
afefe6bf 7199 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7200 break;
0a4db386 7201 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7202 parse_named_seq:
81714fb9
YO
7203 {
7204 char ch= RExC_parse[1];
1f1031fe
YO
7205 if (ch != '<' && ch != '\'' && ch != '{') {
7206 RExC_parse++;
7207 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7208 } else {
1f1031fe
YO
7209 /* this pretty much dupes the code for (?P=...) in reg(), if
7210 you change this make sure you change that */
81714fb9 7211 char* name_start = (RExC_parse += 2);
2eccd3b2 7212 U32 num = 0;
0a4db386
YO
7213 SV *sv_dat = reg_scan_name(pRExC_state,
7214 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7215 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7216 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7217 vFAIL2("Sequence %.3s... not terminated",parse_start);
7218
7219 if (!SIZE_ONLY) {
7220 num = add_data( pRExC_state, 1, "S" );
7221 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7222 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7223 }
7224
81714fb9
YO
7225 RExC_sawback = 1;
7226 ret = reganode(pRExC_state,
7227 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7228 num);
7229 *flagp |= HASWIDTH;
1f1031fe 7230
81714fb9
YO
7231 /* override incorrect value set in reganode MJD */
7232 Set_Node_Offset(ret, parse_start+1);
7233 Set_Node_Cur_Length(ret); /* MJD */
7234 nextchar(pRExC_state);
1f1031fe 7235
81714fb9
YO
7236 }
7237 break;
1f1031fe 7238 }
2bf803e2 7239 case 'g':
a0d0e21e
LW
7240 case '1': case '2': case '3': case '4':
7241 case '5': case '6': case '7': case '8': case '9':
7242 {
c74340f9 7243 I32 num;
2bf803e2
YO
7244 bool isg = *RExC_parse == 'g';
7245 bool isrel = 0;
7246 bool hasbrace = 0;
7247 if (isg) {
c74340f9 7248 RExC_parse++;
2bf803e2
YO
7249 if (*RExC_parse == '{') {
7250 RExC_parse++;
7251 hasbrace = 1;
7252 }
7253 if (*RExC_parse == '-') {
7254 RExC_parse++;
7255 isrel = 1;
7256 }
1f1031fe
YO
7257 if (hasbrace && !isDIGIT(*RExC_parse)) {
7258 if (isrel) RExC_parse--;
7259 RExC_parse -= 2;
7260 goto parse_named_seq;
7261 } }
c74340f9 7262 num = atoi(RExC_parse);
b72d83b2
RGS
7263 if (isg && num == 0)
7264 vFAIL("Reference to invalid group 0");
c74340f9 7265 if (isrel) {
5624f11d 7266 num = RExC_npar - num;
c74340f9
YO
7267 if (num < 1)
7268 vFAIL("Reference to nonexistent or unclosed group");
7269 }
2bf803e2 7270 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7271 goto defchar;
7272 else {
3dab1dad 7273 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7274 while (isDIGIT(*RExC_parse))
7275 RExC_parse++;
1f1031fe
YO
7276 if (parse_start == RExC_parse - 1)
7277 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7278 if (hasbrace) {
7279 if (*RExC_parse != '}')
7280 vFAIL("Unterminated \\g{...} pattern");
7281 RExC_parse++;
7282 }
c74340f9
YO
7283 if (!SIZE_ONLY) {
7284 if (num > (I32)RExC_rx->nparens)
7285 vFAIL("Reference to nonexistent group");
c74340f9 7286 }
830247a4 7287 RExC_sawback = 1;
eb160463
GS
7288 ret = reganode(pRExC_state,
7289 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7290 num);
a0d0e21e 7291 *flagp |= HASWIDTH;
2af232bd 7292
fac92740 7293 /* override incorrect value set in reganode MJD */
2af232bd 7294 Set_Node_Offset(ret, parse_start+1);
fac92740 7295 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7296 RExC_parse--;
7297 nextchar(pRExC_state);
a0d0e21e
LW
7298 }
7299 }
7300 break;
7301 case '\0':
830247a4 7302 if (RExC_parse >= RExC_end)
b45f050a 7303 FAIL("Trailing \\");
a0d0e21e
LW
7304 /* FALL THROUGH */
7305 default:
a0288114 7306 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7307 back into the quick-grab loop below */
45948336 7308 parse_start--;
a0d0e21e
LW
7309 goto defchar;
7310 }
7311 break;
4633a7c4
LW
7312
7313 case '#':
bbe252da 7314 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7315 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7316 goto tryagain;
7317 }
7318 /* FALL THROUGH */
7319
f9a79580
RGS
7320 default:
7321 outer_default:{
ba210ebe 7322 register STRLEN len;
58ae7d3f 7323 register UV ender;
a0d0e21e 7324 register char *p;
3dab1dad 7325 char *s;
80aecb99 7326 STRLEN foldlen;
89ebb4a3 7327 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7328
7329 parse_start = RExC_parse - 1;
a0d0e21e 7330
830247a4 7331 RExC_parse++;
a0d0e21e
LW
7332
7333 defchar:
58ae7d3f 7334 ender = 0;
eb160463
GS
7335 ret = reg_node(pRExC_state,
7336 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7337 s = STRING(ret);
830247a4
IZ
7338 for (len = 0, p = RExC_parse - 1;
7339 len < 127 && p < RExC_end;
a0d0e21e
LW
7340 len++)
7341 {
3dab1dad 7342 char * const oldp = p;
5b5a24f7 7343
bbe252da 7344 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7345 p = regwhite( pRExC_state, p );
f9a79580
RGS
7346 switch ((U8)*p) {
7347 case 0xDF:
7348 case 0xC3:
7349 case 0xCE:
56d400ed 7350 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7351 goto normal_default;
a0d0e21e
LW
7352 case '^':
7353 case '$':
7354 case '.':
7355 case '[':
7356 case '(':
7357 case ')':
7358 case '|':
7359 goto loopdone;
7360 case '\\':
ee9b8eae
YO
7361 /* Literal Escapes Switch
7362
7363 This switch is meant to handle escape sequences that
7364 resolve to a literal character.
7365
7366 Every escape sequence that represents something
7367 else, like an assertion or a char class, is handled
7368 in the switch marked 'Special Escapes' above in this
7369 routine, but also has an entry here as anything that
7370 isn't explicitly mentioned here will be treated as
7371 an unescaped equivalent literal.
7372 */
7373
a0a388a1 7374 switch ((U8)*++p) {
ee9b8eae 7375 /* These are all the special escapes. */
a0a388a1
YO
7376 case 0xDF:
7377 case 0xC3:
7378 case 0xCE:
7379 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7380 goto normal_default;
ee9b8eae
YO
7381 case 'A': /* Start assertion */
7382 case 'b': case 'B': /* Word-boundary assertion*/
7383 case 'C': /* Single char !DANGEROUS! */
7384 case 'd': case 'D': /* digit class */
7385 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7386 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7387 case 'k': case 'K': /* named backref, keep marker */
7388 case 'N': /* named char sequence */
38a44b82 7389 case 'p': case 'P': /* Unicode property */
e1d1eefb 7390 case 'R': /* LNBREAK */
ee9b8eae 7391 case 's': case 'S': /* space class */
e1d1eefb 7392 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7393 case 'w': case 'W': /* word class */
7394 case 'X': /* eXtended Unicode "combining character sequence" */
7395 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7396 --p;
7397 goto loopdone;
ee9b8eae
YO
7398
7399 /* Anything after here is an escape that resolves to a
7400 literal. (Except digits, which may or may not)
7401 */
a0d0e21e
LW
7402 case 'n':
7403 ender = '\n';
7404 p++;
a687059c 7405 break;
a0d0e21e
LW
7406 case 'r':
7407 ender = '\r';
7408 p++;
a687059c 7409 break;
a0d0e21e
LW
7410 case 't':
7411 ender = '\t';
7412 p++;
a687059c 7413 break;
a0d0e21e
LW
7414 case 'f':
7415 ender = '\f';
7416 p++;
a687059c 7417 break;
a0d0e21e 7418 case 'e':
c7f1f016 7419 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7420 p++;
a687059c 7421 break;
a0d0e21e 7422 case 'a':
c7f1f016 7423 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7424 p++;
a687059c 7425 break;
a0d0e21e 7426 case 'x':
a0ed51b3 7427 if (*++p == '{') {
1df70142 7428 char* const e = strchr(p, '}');
b81d288d 7429
b45f050a 7430 if (!e) {
830247a4 7431 RExC_parse = p + 1;
b45f050a
JF
7432 vFAIL("Missing right brace on \\x{}");
7433 }
de5f0749 7434 else {
a4c04bdc
NC
7435 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7436 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7437 STRLEN numlen = e - p - 1;
53305cf1 7438 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
7439 if (ender > 0xff)
7440 RExC_utf8 = 1;
a0ed51b3
LW
7441 p = e + 1;
7442 }
a0ed51b3
LW
7443 }
7444 else {
a4c04bdc 7445 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7446 STRLEN numlen = 2;
53305cf1 7447 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7448 p += numlen;
7449 }
9e08bc66
TS
7450 if (PL_encoding && ender < 0x100)
7451 goto recode_encoding;
a687059c 7452 break;
a0d0e21e
LW
7453 case 'c':
7454 p++;
bbce6d69 7455 ender = UCHARAT(p++);
7456 ender = toCTRL(ender);
a687059c 7457 break;
a0d0e21e
LW
7458 case '0': case '1': case '2': case '3':case '4':
7459 case '5': case '6': case '7': case '8':case '9':
7460 if (*p == '0' ||
830247a4 7461 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 7462 I32 flags = 0;
1df70142 7463 STRLEN numlen = 3;
53305cf1 7464 ender = grok_oct(p, &numlen, &flags, NULL);
609122bd
KW
7465
7466 /* An octal above 0xff is interpreted differently
7467 * depending on if the re is in utf8 or not. If it
7468 * is in utf8, the value will be itself, otherwise
7469 * it is interpreted as modulo 0x100. It has been
7470 * decided to discourage the use of octal above the
7471 * single-byte range. For now, warn only when
7472 * it ends up modulo */
7473 if (SIZE_ONLY && ender >= 0x100
7474 && ! UTF && ! PL_encoding
7475 && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) {
7476 vWARNdep(p, "Use of octal value above 377 is deprecated");
7477 }
a0d0e21e
LW
7478 p += numlen;
7479 }
7480 else {
7481 --p;
7482 goto loopdone;
a687059c 7483 }
9e08bc66
TS
7484 if (PL_encoding && ender < 0x100)
7485 goto recode_encoding;
7486 break;
7487 recode_encoding:
7488 {
7489 SV* enc = PL_encoding;
7490 ender = reg_recode((const char)(U8)ender, &enc);
7491 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7492 vWARN(p, "Invalid escape in the specified encoding");
7493 RExC_utf8 = 1;
7494 }
a687059c 7495 break;
a0d0e21e 7496 case '\0':
830247a4 7497 if (p >= RExC_end)
b45f050a 7498 FAIL("Trailing \\");
a687059c 7499 /* FALL THROUGH */
a0d0e21e 7500 default:
041457d9 7501 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 7502 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7503 goto normal_default;
a0d0e21e
LW
7504 }
7505 break;
a687059c 7506 default:
a0ed51b3 7507 normal_default:
fd400ab9 7508 if (UTF8_IS_START(*p) && UTF) {
1df70142 7509 STRLEN numlen;
5e12f4fb 7510 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7511 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7512 p += numlen;
7513 }
7514 else
7515 ender = *p++;
a0d0e21e 7516 break;
a687059c 7517 }
bcdf7404
YO
7518 if ( RExC_flags & RXf_PMf_EXTENDED)
7519 p = regwhite( pRExC_state, p );
60a8b682
JH
7520 if (UTF && FOLD) {
7521 /* Prime the casefolded buffer. */
ac7e0132 7522 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7523 }
bcdf7404 7524 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7525 if (len)
7526 p = oldp;
16ea2a2e 7527 else if (UTF) {
80aecb99 7528 if (FOLD) {
60a8b682 7529 /* Emit all the Unicode characters. */
1df70142 7530 STRLEN numlen;
80aecb99
JH
7531 for (foldbuf = tmpbuf;
7532 foldlen;
7533 foldlen -= numlen) {
7534 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7535 if (numlen > 0) {
71207a34 7536 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7537 s += unilen;
7538 len += unilen;
7539 /* In EBCDIC the numlen
7540 * and unilen can differ. */
9dc45d57 7541 foldbuf += numlen;
47654450
JH
7542 if (numlen >= foldlen)
7543 break;
9dc45d57
JH
7544 }
7545 else
7546 break; /* "Can't happen." */
80aecb99
JH
7547 }
7548 }
7549 else {
71207a34 7550 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7551 if (unilen > 0) {
0ebc6274
JH
7552 s += unilen;
7553 len += unilen;
9dc45d57 7554 }
80aecb99 7555 }
a0ed51b3 7556 }
a0d0e21e
LW
7557 else {
7558 len++;
eb160463 7559 REGC((char)ender, s++);
a0d0e21e
LW
7560 }
7561 break;
a687059c 7562 }
16ea2a2e 7563 if (UTF) {
80aecb99 7564 if (FOLD) {
60a8b682 7565 /* Emit all the Unicode characters. */
1df70142 7566 STRLEN numlen;
80aecb99
JH
7567 for (foldbuf = tmpbuf;
7568 foldlen;
7569 foldlen -= numlen) {
7570 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7571 if (numlen > 0) {
71207a34 7572 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7573 len += unilen;
7574 s += unilen;
7575 /* In EBCDIC the numlen
7576 * and unilen can differ. */
9dc45d57 7577 foldbuf += numlen;
47654450
JH
7578 if (numlen >= foldlen)
7579 break;
9dc45d57
JH
7580 }
7581 else
7582 break;
80aecb99
JH
7583 }
7584 }
7585 else {
71207a34 7586 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7587 if (unilen > 0) {
0ebc6274
JH
7588 s += unilen;
7589 len += unilen;
9dc45d57 7590 }
80aecb99
JH
7591 }
7592 len--;
a0ed51b3
LW
7593 }
7594 else
eb160463 7595 REGC((char)ender, s++);
a0d0e21e
LW
7596 }
7597 loopdone:
830247a4 7598 RExC_parse = p - 1;
fac92740 7599 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7600 nextchar(pRExC_state);
793db0cb
JH
7601 {
7602 /* len is STRLEN which is unsigned, need to copy to signed */
7603 IV iv = len;
7604 if (iv < 0)
7605 vFAIL("Internal disaster");
7606 }
a0d0e21e
LW
7607 if (len > 0)
7608 *flagp |= HASWIDTH;
090f7165 7609 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7610 *flagp |= SIMPLE;
3dab1dad 7611
cd439c50 7612 if (SIZE_ONLY)
830247a4 7613 RExC_size += STR_SZ(len);
3dab1dad
YO
7614 else {
7615 STR_LEN(ret) = len;
830247a4 7616 RExC_emit += STR_SZ(len);
07be1b83 7617 }
3dab1dad 7618 }
a0d0e21e
LW
7619 break;
7620 }
a687059c 7621
a0d0e21e 7622 return(ret);
a687059c
LW
7623}
7624
873ef191 7625STATIC char *
bcdf7404 7626S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7627{
bcdf7404 7628 const char *e = RExC_end;
7918f24d
NC
7629
7630 PERL_ARGS_ASSERT_REGWHITE;
7631
5b5a24f7
CS
7632 while (p < e) {
7633 if (isSPACE(*p))
7634 ++p;
7635 else if (*p == '#') {
bcdf7404 7636 bool ended = 0;
5b5a24f7 7637 do {
bcdf7404
YO
7638 if (*p++ == '\n') {
7639 ended = 1;
7640 break;
7641 }
7642 } while (p < e);
7643 if (!ended)
7644 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7645 }
7646 else
7647 break;
7648 }
7649 return p;
7650}
7651
b8c5462f
JH
7652/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7653 Character classes ([:foo:]) can also be negated ([:^foo:]).
7654 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7655 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7656 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7657
7658#define POSIXCC_DONE(c) ((c) == ':')
7659#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7660#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7661
b8c5462f 7662STATIC I32
830247a4 7663S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7664{
97aff369 7665 dVAR;
936ed897 7666 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7667
7918f24d
NC
7668 PERL_ARGS_ASSERT_REGPPOSIXCC;
7669
830247a4 7670 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7671 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7672 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7673 const char c = UCHARAT(RExC_parse);
097eb12c 7674 char* const s = RExC_parse++;
b81d288d 7675
9a86a77b 7676 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7677 RExC_parse++;
7678 if (RExC_parse == RExC_end)
620e46c5 7679 /* Grandfather lone [:, [=, [. */
830247a4 7680 RExC_parse = s;
620e46c5 7681 else {
3dab1dad 7682 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7683 assert(*t == c);
7684
9a86a77b 7685 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7686 const char *posixcc = s + 1;
830247a4 7687 RExC_parse++; /* skip over the ending ] */
3dab1dad 7688
b8c5462f 7689 if (*s == ':') {
1df70142
AL
7690 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7691 const I32 skip = t - posixcc;
80916619
NC
7692
7693 /* Initially switch on the length of the name. */
7694 switch (skip) {
7695 case 4:
3dab1dad
YO
7696 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7697 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7698 break;
80916619
NC
7699 case 5:
7700 /* Names all of length 5. */
7701 /* alnum alpha ascii blank cntrl digit graph lower
7702 print punct space upper */
7703 /* Offset 4 gives the best switch position. */
7704 switch (posixcc[4]) {
7705 case 'a':
3dab1dad
YO
7706 if (memEQ(posixcc, "alph", 4)) /* alpha */
7707 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7708 break;
7709 case 'e':
3dab1dad
YO
7710 if (memEQ(posixcc, "spac", 4)) /* space */
7711 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7712 break;
7713 case 'h':
3dab1dad
YO
7714 if (memEQ(posixcc, "grap", 4)) /* graph */
7715 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7716 break;
7717 case 'i':
3dab1dad
YO
7718 if (memEQ(posixcc, "asci", 4)) /* ascii */
7719 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7720 break;
7721 case 'k':
3dab1dad
YO
7722 if (memEQ(posixcc, "blan", 4)) /* blank */
7723 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7724 break;
7725 case 'l':
3dab1dad
YO
7726 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7727 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7728 break;
7729 case 'm':
3dab1dad
YO
7730 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7731 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7732 break;
7733 case 'r':
3dab1dad
YO
7734 if (memEQ(posixcc, "lowe", 4)) /* lower */
7735 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7736 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7737 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7738 break;
7739 case 't':
3dab1dad
YO
7740 if (memEQ(posixcc, "digi", 4)) /* digit */
7741 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7742 else if (memEQ(posixcc, "prin", 4)) /* print */
7743 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7744 else if (memEQ(posixcc, "punc", 4)) /* punct */
7745 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7746 break;
b8c5462f
JH
7747 }
7748 break;
80916619 7749 case 6:
3dab1dad
YO
7750 if (memEQ(posixcc, "xdigit", 6))
7751 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7752 break;
7753 }
80916619
NC
7754
7755 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7756 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7757 t - s - 1, s + 1);
80916619
NC
7758 assert (posixcc[skip] == ':');
7759 assert (posixcc[skip+1] == ']');
b45f050a 7760 } else if (!SIZE_ONLY) {
b8c5462f 7761 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7762
830247a4 7763 /* adjust RExC_parse so the warning shows after
b45f050a 7764 the class closes */
9a86a77b 7765 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7766 RExC_parse++;
b45f050a
JF
7767 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7768 }
b8c5462f
JH
7769 } else {
7770 /* Maternal grandfather:
7771 * "[:" ending in ":" but not in ":]" */
830247a4 7772 RExC_parse = s;
767d463e 7773 }
620e46c5
JH
7774 }
7775 }
7776
b8c5462f
JH
7777 return namedclass;
7778}
7779
7780STATIC void
830247a4 7781S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7782{
97aff369 7783 dVAR;
7918f24d
NC
7784
7785 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7786
3dab1dad 7787 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7788 const char *s = RExC_parse;
7789 const char c = *s++;
b8c5462f 7790
3dab1dad 7791 while (isALNUM(*s))
b8c5462f
JH
7792 s++;
7793 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7794 if (ckWARN(WARN_REGEXP))
7795 vWARN3(s+2,
7796 "POSIX syntax [%c %c] belongs inside character classes",
7797 c, c);
b45f050a
JF
7798
7799 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7800 if (POSIXCC_NOTYET(c)) {
830247a4 7801 /* adjust RExC_parse so the error shows after
b45f050a 7802 the class closes */
9a86a77b 7803 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7804 NOOP;
b45f050a
JF
7805 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7806 }
b8c5462f
JH
7807 }
7808 }
620e46c5
JH
7809}
7810
7f6f358c 7811
89836f1f
YO
7812#define _C_C_T_(NAME,TEST,WORD) \
7813ANYOF_##NAME: \
7814 if (LOC) \
7815 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7816 else { \
7817 for (value = 0; value < 256; value++) \
7818 if (TEST) \
7819 ANYOF_BITMAP_SET(ret, value); \
7820 } \
7821 yesno = '+'; \
7822 what = WORD; \
7823 break; \
7824case ANYOF_N##NAME: \
7825 if (LOC) \
7826 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7827 else { \
7828 for (value = 0; value < 256; value++) \
7829 if (!TEST) \
7830 ANYOF_BITMAP_SET(ret, value); \
7831 } \
7832 yesno = '!'; \
7833 what = WORD; \
7834 break
7835
e1d1eefb
YO
7836#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7837ANYOF_##NAME: \
7838 for (value = 0; value < 256; value++) \
7839 if (TEST) \
7840 ANYOF_BITMAP_SET(ret, value); \
7841 yesno = '+'; \
7842 what = WORD; \
7843 break; \
7844case ANYOF_N##NAME: \
7845 for (value = 0; value < 256; value++) \
7846 if (!TEST) \
7847 ANYOF_BITMAP_SET(ret, value); \
7848 yesno = '!'; \
7849 what = WORD; \
7850 break
89836f1f 7851
da7fcca4
YO
7852/*
7853 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7854 so that it is possible to override the option here without having to
7855 rebuild the entire core. as we are required to do if we change regcomp.h
7856 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7857*/
7858#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7859#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7860#endif
7861
7862#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7863#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7864#else
7865#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7866#endif
7867
7f6f358c
YO
7868/*
7869 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7870 matches the pattern or if the pattern matches a single char only and
7871 that char is < 256 and we are case insensitive then we produce an
7872 EXACT node instead.
7f6f358c 7873*/
89836f1f 7874
76e3520e 7875STATIC regnode *
3dab1dad 7876S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7877{
97aff369 7878 dVAR;
9a86a77b 7879 register UV nextvalue;
3568d838 7880 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7881 register IV range = 0;
e1d1eefb 7882 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 7883 register regnode *ret;
ba210ebe 7884 STRLEN numlen;
ffc61ed2 7885 IV namedclass;
cbbf8932 7886 char *rangebegin = NULL;
936ed897 7887 bool need_class = 0;
c445ea15 7888 SV *listsv = NULL;
ffc61ed2 7889 UV n;
9e55ce06 7890 bool optimize_invert = TRUE;
cbbf8932 7891 AV* unicode_alternate = NULL;
1b2d223b
JH
7892#ifdef EBCDIC
7893 UV literal_endpoint = 0;
7894#endif
7f6f358c 7895 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7896
3dab1dad 7897 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7898 case we need to change the emitted regop to an EXACT. */
07be1b83 7899 const char * orig_parse = RExC_parse;
72f13be8 7900 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7901
7902 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
7903#ifndef DEBUGGING
7904 PERL_UNUSED_ARG(depth);
7905#endif
72f13be8 7906
3dab1dad 7907 DEBUG_PARSE("clas");
7f6f358c
YO
7908
7909 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7910 ret = reganode(pRExC_state, ANYOF, 0);
7911
7912 if (!SIZE_ONLY)
7913 ANYOF_FLAGS(ret) = 0;
7914
9a86a77b 7915 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7916 RExC_naughty++;
7917 RExC_parse++;
7918 if (!SIZE_ONLY)
7919 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7920 }
a0d0e21e 7921
73060fc4 7922 if (SIZE_ONLY) {
830247a4 7923 RExC_size += ANYOF_SKIP;
73060fc4
JH
7924 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7925 }
936ed897 7926 else {
830247a4 7927 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7928 if (FOLD)
7929 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7930 if (LOC)
7931 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7932 ANYOF_BITMAP_ZERO(ret);
396482e1 7933 listsv = newSVpvs("# comment\n");
a0d0e21e 7934 }
b8c5462f 7935
9a86a77b
JH
7936 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7937
b938889d 7938 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7939 checkposixcc(pRExC_state);
b8c5462f 7940
f064b6ad
HS
7941 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7942 if (UCHARAT(RExC_parse) == ']')
7943 goto charclassloop;
ffc61ed2 7944
fc8cd66c 7945parseit:
9a86a77b 7946 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7947
7948 charclassloop:
7949
7950 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7951
73b437c8 7952 if (!range)
830247a4 7953 rangebegin = RExC_parse;
ffc61ed2 7954 if (UTF) {
5e12f4fb 7955 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7956 RExC_end - RExC_parse,
9f7f3913 7957 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7958 RExC_parse += numlen;
7959 }
7960 else
7961 value = UCHARAT(RExC_parse++);
7f6f358c 7962
9a86a77b
JH
7963 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7964 if (value == '[' && POSIXCC(nextvalue))
830247a4 7965 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7966 else if (value == '\\') {
ffc61ed2 7967 if (UTF) {
5e12f4fb 7968 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7969 RExC_end - RExC_parse,
9f7f3913 7970 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7971 RExC_parse += numlen;
7972 }
7973 else
7974 value = UCHARAT(RExC_parse++);
470c3474 7975 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7976 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7977 * be a problem later if we want switch on Unicode.
7978 * A similar issue a little bit later when switching on
7979 * namedclass. --jhi */
ffc61ed2 7980 switch ((I32)value) {
b8c5462f
JH
7981 case 'w': namedclass = ANYOF_ALNUM; break;
7982 case 'W': namedclass = ANYOF_NALNUM; break;
7983 case 's': namedclass = ANYOF_SPACE; break;
7984 case 'S': namedclass = ANYOF_NSPACE; break;
7985 case 'd': namedclass = ANYOF_DIGIT; break;
7986 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
7987 case 'v': namedclass = ANYOF_VERTWS; break;
7988 case 'V': namedclass = ANYOF_NVERTWS; break;
7989 case 'h': namedclass = ANYOF_HORIZWS; break;
7990 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
7991 case 'N': /* Handle \N{NAME} in class */
7992 {
7993 /* We only pay attention to the first char of
7994 multichar strings being returned. I kinda wonder
7995 if this makes sense as it does change the behaviour
7996 from earlier versions, OTOH that behaviour was broken
7997 as well. */
7998 UV v; /* value is register so we cant & it /grrr */
afefe6bf 7999 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
8000 goto parseit;
8001 }
8002 value= v;
8003 }
8004 break;
ffc61ed2
JH
8005 case 'p':
8006 case 'P':
3dab1dad
YO
8007 {
8008 char *e;
af6f566e 8009 if (RExC_parse >= RExC_end)
2a4859cd 8010 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8011 if (*RExC_parse == '{') {
1df70142 8012 const U8 c = (U8)value;
ffc61ed2
JH
8013 e = strchr(RExC_parse++, '}');
8014 if (!e)
0da60cf5 8015 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8016 while (isSPACE(UCHARAT(RExC_parse)))
8017 RExC_parse++;
8018 if (e == RExC_parse)
0da60cf5 8019 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8020 n = e - RExC_parse;
ab13f0c7
JH
8021 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8022 n--;
ffc61ed2
JH
8023 }
8024 else {
8025 e = RExC_parse;
8026 n = 1;
8027 }
8028 if (!SIZE_ONLY) {
ab13f0c7
JH
8029 if (UCHARAT(RExC_parse) == '^') {
8030 RExC_parse++;
8031 n--;
8032 value = value == 'p' ? 'P' : 'p'; /* toggle */
8033 while (isSPACE(UCHARAT(RExC_parse))) {
8034 RExC_parse++;
8035 n--;
8036 }
8037 }
097eb12c
AL
8038 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8039 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8040 }
8041 RExC_parse = e + 1;
8042 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8043 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8044 }
f81125e2 8045 break;
b8c5462f
JH
8046 case 'n': value = '\n'; break;
8047 case 'r': value = '\r'; break;
8048 case 't': value = '\t'; break;
8049 case 'f': value = '\f'; break;
8050 case 'b': value = '\b'; break;
c7f1f016
NIS
8051 case 'e': value = ASCII_TO_NATIVE('\033');break;
8052 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 8053 case 'x':
ffc61ed2 8054 if (*RExC_parse == '{') {
a4c04bdc
NC
8055 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8056 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8057 char * const e = strchr(RExC_parse++, '}');
b81d288d 8058 if (!e)
ffc61ed2 8059 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8060
8061 numlen = e - RExC_parse;
8062 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8063 RExC_parse = e + 1;
8064 }
8065 else {
a4c04bdc 8066 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8067 numlen = 2;
8068 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8069 RExC_parse += numlen;
8070 }
9e08bc66
TS
8071 if (PL_encoding && value < 0x100)
8072 goto recode_encoding;
b8c5462f
JH
8073 break;
8074 case 'c':
830247a4 8075 value = UCHARAT(RExC_parse++);
b8c5462f
JH
8076 value = toCTRL(value);
8077 break;
8078 case '0': case '1': case '2': case '3': case '4':
8079 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
8080 {
8081 I32 flags = 0;
8082 numlen = 3;
8083 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8084 RExC_parse += numlen;
8085 if (PL_encoding && value < 0x100)
8086 goto recode_encoding;
8087 break;
8088 }
8089 recode_encoding:
8090 {
8091 SV* enc = PL_encoding;
8092 value = reg_recode((const char)(U8)value, &enc);
8093 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
8094 vWARN(RExC_parse,
8095 "Invalid escape in the specified encoding");
8096 break;
8097 }
1028017a 8098 default:
041457d9 8099 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
8100 vWARN2(RExC_parse,
8101 "Unrecognized escape \\%c in character class passed through",
8102 (int)value);
1028017a 8103 break;
b8c5462f 8104 }
ffc61ed2 8105 } /* end of \blah */
1b2d223b
JH
8106#ifdef EBCDIC
8107 else
8108 literal_endpoint++;
8109#endif
ffc61ed2
JH
8110
8111 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8112
8113 if (!SIZE_ONLY && !need_class)
936ed897 8114 ANYOF_CLASS_ZERO(ret);
ffc61ed2 8115
936ed897 8116 need_class = 1;
ffc61ed2
JH
8117
8118 /* a bad range like a-\d, a-[:digit:] ? */
8119 if (range) {
73b437c8 8120 if (!SIZE_ONLY) {
afd78fd5 8121 if (ckWARN(WARN_REGEXP)) {
097eb12c 8122 const int w =
afd78fd5
JH
8123 RExC_parse >= rangebegin ?
8124 RExC_parse - rangebegin : 0;
830247a4 8125 vWARN4(RExC_parse,
b45f050a 8126 "False [] range \"%*.*s\"",
097eb12c 8127 w, w, rangebegin);
afd78fd5 8128 }
3568d838
JH
8129 if (prevvalue < 256) {
8130 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8131 ANYOF_BITMAP_SET(ret, '-');
8132 }
8133 else {
8134 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8135 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8136 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8137 }
b8c5462f 8138 }
ffc61ed2
JH
8139
8140 range = 0; /* this was not a true range */
73b437c8 8141 }
ffc61ed2 8142
89836f1f
YO
8143
8144
73b437c8 8145 if (!SIZE_ONLY) {
c49a72a9
NC
8146 const char *what = NULL;
8147 char yesno = 0;
8148
3568d838
JH
8149 if (namedclass > OOB_NAMEDCLASS)
8150 optimize_invert = FALSE;
e2962f66
JH
8151 /* Possible truncation here but in some 64-bit environments
8152 * the compiler gets heartburn about switch on 64-bit values.
8153 * A similar issue a little earlier when switching on value.
98f323fa 8154 * --jhi */
e2962f66 8155 switch ((I32)namedclass) {
da7fcca4
YO
8156
8157 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8158 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8159 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8160 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8161 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8162 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8163 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8164 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8165 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8166 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8167#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
89836f1f 8168 case _C_C_T_(ALNUM, isALNUM(value), "Word");
89836f1f 8169 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
da7fcca4
YO
8170#else
8171 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8172 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8173#endif
89836f1f 8174 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8175 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8176 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8177 case ANYOF_ASCII:
8178 if (LOC)
936ed897 8179 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8180 else {
c7f1f016 8181#ifndef EBCDIC
1ba5c669
JH
8182 for (value = 0; value < 128; value++)
8183 ANYOF_BITMAP_SET(ret, value);
8184#else /* EBCDIC */
ffbc6a93 8185 for (value = 0; value < 256; value++) {
3a3c4447
JH
8186 if (isASCII(value))
8187 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8188 }
1ba5c669 8189#endif /* EBCDIC */
73b437c8 8190 }
c49a72a9
NC
8191 yesno = '+';
8192 what = "ASCII";
73b437c8
JH
8193 break;
8194 case ANYOF_NASCII:
8195 if (LOC)
936ed897 8196 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8197 else {
c7f1f016 8198#ifndef EBCDIC
1ba5c669
JH
8199 for (value = 128; value < 256; value++)
8200 ANYOF_BITMAP_SET(ret, value);
8201#else /* EBCDIC */
ffbc6a93 8202 for (value = 0; value < 256; value++) {
3a3c4447
JH
8203 if (!isASCII(value))
8204 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8205 }
1ba5c669 8206#endif /* EBCDIC */
73b437c8 8207 }
c49a72a9
NC
8208 yesno = '!';
8209 what = "ASCII";
89836f1f 8210 break;
ffc61ed2
JH
8211 case ANYOF_DIGIT:
8212 if (LOC)
8213 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8214 else {
8215 /* consecutive digits assumed */
8216 for (value = '0'; value <= '9'; value++)
8217 ANYOF_BITMAP_SET(ret, value);
8218 }
c49a72a9 8219 yesno = '+';
da7fcca4 8220 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8221 break;
8222 case ANYOF_NDIGIT:
8223 if (LOC)
8224 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8225 else {
8226 /* consecutive digits assumed */
8227 for (value = 0; value < '0'; value++)
8228 ANYOF_BITMAP_SET(ret, value);
8229 for (value = '9' + 1; value < 256; value++)
8230 ANYOF_BITMAP_SET(ret, value);
8231 }
c49a72a9 8232 yesno = '!';
da7fcca4 8233 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8234 break;
f81125e2
JP
8235 case ANYOF_MAX:
8236 /* this is to handle \p and \P */
8237 break;
73b437c8 8238 default:
b45f050a 8239 vFAIL("Invalid [::] class");
73b437c8 8240 break;
b8c5462f 8241 }
c49a72a9
NC
8242 if (what) {
8243 /* Strings such as "+utf8::isWord\n" */
8244 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8245 }
b8c5462f 8246 if (LOC)
936ed897 8247 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8248 continue;
a0d0e21e 8249 }
ffc61ed2
JH
8250 } /* end of namedclass \blah */
8251
a0d0e21e 8252 if (range) {
eb160463 8253 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8254 const int w = RExC_parse - rangebegin;
8255 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8256 range = 0; /* not a valid range */
73b437c8 8257 }
a0d0e21e
LW
8258 }
8259 else {
3568d838 8260 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8261 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8262 RExC_parse[1] != ']') {
8263 RExC_parse++;
ffc61ed2
JH
8264
8265 /* a bad range like \w-, [:word:]- ? */
8266 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8267 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8268 const int w =
afd78fd5
JH
8269 RExC_parse >= rangebegin ?
8270 RExC_parse - rangebegin : 0;
830247a4 8271 vWARN4(RExC_parse,
b45f050a 8272 "False [] range \"%*.*s\"",
097eb12c 8273 w, w, rangebegin);
afd78fd5 8274 }
73b437c8 8275 if (!SIZE_ONLY)
936ed897 8276 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8277 } else
ffc61ed2
JH
8278 range = 1; /* yeah, it's a range! */
8279 continue; /* but do it the next time */
a0d0e21e 8280 }
a687059c 8281 }
ffc61ed2 8282
93733859 8283 /* now is the next time */
07be1b83 8284 /*stored += (value - prevvalue + 1);*/
ae5c130c 8285 if (!SIZE_ONLY) {
3568d838 8286 if (prevvalue < 256) {
1df70142 8287 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8288 IV i;
3568d838 8289#ifdef EBCDIC
1b2d223b
JH
8290 /* In EBCDIC [\x89-\x91] should include
8291 * the \x8e but [i-j] should not. */
8292 if (literal_endpoint == 2 &&
8293 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8294 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8295 {
3568d838
JH
8296 if (isLOWER(prevvalue)) {
8297 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8298 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8299 stored++;
ffc61ed2 8300 ANYOF_BITMAP_SET(ret, i);
2670d666 8301 }
ffc61ed2 8302 } else {
3568d838 8303 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8304 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8305 stored++;
ffc61ed2 8306 ANYOF_BITMAP_SET(ret, i);
2670d666 8307 }
ffc61ed2 8308 }
8ada0baa 8309 }
ffc61ed2 8310 else
8ada0baa 8311#endif
07be1b83
YO
8312 for (i = prevvalue; i <= ceilvalue; i++) {
8313 if (!ANYOF_BITMAP_TEST(ret,i)) {
8314 stored++;
8315 ANYOF_BITMAP_SET(ret, i);
8316 }
8317 }
3568d838 8318 }
a5961de5 8319 if (value > 255 || UTF) {
1df70142
AL
8320 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8321 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8322 stored+=2; /* can't optimize this class */
ffc61ed2 8323 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8324 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8325 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8326 prevnatvalue, natvalue);
8327 }
8328 else if (prevnatvalue == natvalue) {
8329 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8330 if (FOLD) {
89ebb4a3 8331 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8332 STRLEN foldlen;
1df70142 8333 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8334
e294cc5d
JH
8335#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8336 if (RExC_precomp[0] == ':' &&
8337 RExC_precomp[1] == '[' &&
8338 (f == 0xDF || f == 0x92)) {
8339 f = NATIVE_TO_UNI(f);
8340 }
8341#endif
c840d2a2
JH
8342 /* If folding and foldable and a single
8343 * character, insert also the folded version
8344 * to the charclass. */
9e55ce06 8345 if (f != value) {
e294cc5d
JH
8346#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8347 if ((RExC_precomp[0] == ':' &&
8348 RExC_precomp[1] == '[' &&
8349 (f == 0xA2 &&
8350 (value == 0xFB05 || value == 0xFB06))) ?
8351 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8352 foldlen == (STRLEN)UNISKIP(f) )
8353#else
eb160463 8354 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8355#endif
9e55ce06
JH
8356 Perl_sv_catpvf(aTHX_ listsv,
8357 "%04"UVxf"\n", f);
8358 else {
8359 /* Any multicharacter foldings
8360 * require the following transform:
8361 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8362 * where E folds into "pq" and F folds
8363 * into "rst", all other characters
8364 * fold to single characters. We save
8365 * away these multicharacter foldings,
8366 * to be later saved as part of the
8367 * additional "s" data. */
8368 SV *sv;
8369
8370 if (!unicode_alternate)
8371 unicode_alternate = newAV();
740cce10
NC
8372 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8373 TRUE);
9e55ce06
JH
8374 av_push(unicode_alternate, sv);
8375 }
8376 }
254ba52a 8377
60a8b682
JH
8378 /* If folding and the value is one of the Greek
8379 * sigmas insert a few more sigmas to make the
8380 * folding rules of the sigmas to work right.
8381 * Note that not all the possible combinations
8382 * are handled here: some of them are handled
9e55ce06
JH
8383 * by the standard folding rules, and some of
8384 * them (literal or EXACTF cases) are handled
8385 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8386 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8387 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8388 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8389 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8390 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8391 }
8392 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8393 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8394 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8395 }
8396 }
ffc61ed2 8397 }
1b2d223b
JH
8398#ifdef EBCDIC
8399 literal_endpoint = 0;
8400#endif
8ada0baa 8401 }
ffc61ed2
JH
8402
8403 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8404 }
ffc61ed2 8405
936ed897 8406 if (need_class) {
4f66b38d 8407 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8408 if (SIZE_ONLY)
830247a4 8409 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8410 else
830247a4 8411 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8412 }
ffc61ed2 8413
7f6f358c
YO
8414
8415 if (SIZE_ONLY)
8416 return ret;
8417 /****** !SIZE_ONLY AFTER HERE *********/
8418
3b57cd43 8419 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8420 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8421 ) {
8422 /* optimize single char class to an EXACT node
8423 but *only* when its not a UTF/high char */
07be1b83
YO
8424 const char * cur_parse= RExC_parse;
8425 RExC_emit = (regnode *)orig_emit;
8426 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8427 ret = reg_node(pRExC_state,
8428 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8429 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8430 *STRING(ret)= (char)value;
8431 STR_LEN(ret)= 1;
8432 RExC_emit += STR_SZ(1);
b023a8ee
MHM
8433 if (listsv) {
8434 SvREFCNT_dec(listsv);
8435 }
7f6f358c
YO
8436 return ret;
8437 }
ae5c130c 8438 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8439 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8440 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8441 ) {
a0ed51b3 8442 for (value = 0; value < 256; ++value) {
936ed897 8443 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8444 UV fold = PL_fold[value];
ffc61ed2
JH
8445
8446 if (fold != value)
8447 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8448 }
8449 }
936ed897 8450 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8451 }
ffc61ed2 8452
ae5c130c 8453 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8454 if (optimize_invert &&
ffc61ed2
JH
8455 /* If the only flag is inversion. */
8456 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8457 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8458 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8459 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8460 }
7f6f358c 8461 {
097eb12c 8462 AV * const av = newAV();
ffc61ed2 8463 SV *rv;
9e55ce06 8464 /* The 0th element stores the character class description
6a0407ee 8465 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8466 * to initialize the appropriate swash (which gets stored in
8467 * the 1st element), and also useful for dumping the regnode.
8468 * The 2nd element stores the multicharacter foldings,
6a0407ee 8469 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8470 av_store(av, 0, listsv);
8471 av_store(av, 1, NULL);
ad64d0ec
NC
8472 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8473 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8474 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8475 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8476 ARG_SET(ret, n);
a0ed51b3 8477 }
a0ed51b3
LW
8478 return ret;
8479}
89836f1f
YO
8480#undef _C_C_T_
8481
a0ed51b3 8482
bcdf7404
YO
8483/* reg_skipcomment()
8484
8485 Absorbs an /x style # comments from the input stream.
8486 Returns true if there is more text remaining in the stream.
8487 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8488 terminates the pattern without including a newline.
8489
8490 Note its the callers responsibility to ensure that we are
8491 actually in /x mode
8492
8493*/
8494
8495STATIC bool
8496S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8497{
8498 bool ended = 0;
7918f24d
NC
8499
8500 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8501
bcdf7404
YO
8502 while (RExC_parse < RExC_end)
8503 if (*RExC_parse++ == '\n') {
8504 ended = 1;
8505 break;
8506 }
8507 if (!ended) {
8508 /* we ran off the end of the pattern without ending
8509 the comment, so we have to add an \n when wrapping */
8510 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8511 return 0;
8512 } else
8513 return 1;
8514}
8515
8516/* nextchar()
8517
8518 Advance that parse position, and optionall absorbs
8519 "whitespace" from the inputstream.
8520
8521 Without /x "whitespace" means (?#...) style comments only,
8522 with /x this means (?#...) and # comments and whitespace proper.
8523
8524 Returns the RExC_parse point from BEFORE the scan occurs.
8525
8526 This is the /x friendly way of saying RExC_parse++.
8527*/
8528
76e3520e 8529STATIC char*
830247a4 8530S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8531{
097eb12c 8532 char* const retval = RExC_parse++;
a0d0e21e 8533
7918f24d
NC
8534 PERL_ARGS_ASSERT_NEXTCHAR;
8535
4633a7c4 8536 for (;;) {
830247a4
IZ
8537 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8538 RExC_parse[2] == '#') {
e994fd66
AE
8539 while (*RExC_parse != ')') {
8540 if (RExC_parse == RExC_end)
8541 FAIL("Sequence (?#... not terminated");
830247a4 8542 RExC_parse++;
e994fd66 8543 }
830247a4 8544 RExC_parse++;
4633a7c4
LW
8545 continue;
8546 }
bbe252da 8547 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8548 if (isSPACE(*RExC_parse)) {
8549 RExC_parse++;
748a9306
LW
8550 continue;
8551 }
830247a4 8552 else if (*RExC_parse == '#') {
bcdf7404
YO
8553 if ( reg_skipcomment( pRExC_state ) )
8554 continue;
748a9306 8555 }
748a9306 8556 }
4633a7c4 8557 return retval;
a0d0e21e 8558 }
a687059c
LW
8559}
8560
8561/*
c277df42 8562- reg_node - emit a node
a0d0e21e 8563*/
76e3520e 8564STATIC regnode * /* Location. */
830247a4 8565S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8566{
97aff369 8567 dVAR;
c277df42 8568 register regnode *ptr;
504618e9 8569 regnode * const ret = RExC_emit;
07be1b83 8570 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8571
7918f24d
NC
8572 PERL_ARGS_ASSERT_REG_NODE;
8573
c277df42 8574 if (SIZE_ONLY) {
830247a4
IZ
8575 SIZE_ALIGN(RExC_size);
8576 RExC_size += 1;
a0d0e21e
LW
8577 return(ret);
8578 }
3b57cd43
YO
8579 if (RExC_emit >= RExC_emit_bound)
8580 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8581
c277df42 8582 NODE_ALIGN_FILL(ret);
a0d0e21e 8583 ptr = ret;
c277df42 8584 FILL_ADVANCE_NODE(ptr, op);
7122b237 8585#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8586 if (RExC_offsets) { /* MJD */
07be1b83 8587 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8588 "reg_node", __LINE__,
13d6edb4 8589 PL_reg_name[op],
07be1b83
YO
8590 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8591 ? "Overwriting end of array!\n" : "OK",
8592 (UV)(RExC_emit - RExC_emit_start),
8593 (UV)(RExC_parse - RExC_start),
8594 (UV)RExC_offsets[0]));
ccb2c380 8595 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8596 }
7122b237 8597#endif
830247a4 8598 RExC_emit = ptr;
a0d0e21e 8599 return(ret);
a687059c
LW
8600}
8601
8602/*
a0d0e21e
LW
8603- reganode - emit a node with an argument
8604*/
76e3520e 8605STATIC regnode * /* Location. */
830247a4 8606S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8607{
97aff369 8608 dVAR;
c277df42 8609 register regnode *ptr;
504618e9 8610 regnode * const ret = RExC_emit;
07be1b83 8611 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8612
7918f24d
NC
8613 PERL_ARGS_ASSERT_REGANODE;
8614
c277df42 8615 if (SIZE_ONLY) {
830247a4
IZ
8616 SIZE_ALIGN(RExC_size);
8617 RExC_size += 2;
6bda09f9
YO
8618 /*
8619 We can't do this:
8620
8621 assert(2==regarglen[op]+1);
8622
8623 Anything larger than this has to allocate the extra amount.
8624 If we changed this to be:
8625
8626 RExC_size += (1 + regarglen[op]);
8627
8628 then it wouldn't matter. Its not clear what side effect
8629 might come from that so its not done so far.
8630 -- dmq
8631 */
a0d0e21e
LW
8632 return(ret);
8633 }
3b57cd43
YO
8634 if (RExC_emit >= RExC_emit_bound)
8635 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8636
c277df42 8637 NODE_ALIGN_FILL(ret);
a0d0e21e 8638 ptr = ret;
c277df42 8639 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8640#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8641 if (RExC_offsets) { /* MJD */
07be1b83 8642 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8643 "reganode",
ccb2c380 8644 __LINE__,
13d6edb4 8645 PL_reg_name[op],
07be1b83 8646 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8647 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8648 (UV)(RExC_emit - RExC_emit_start),
8649 (UV)(RExC_parse - RExC_start),
8650 (UV)RExC_offsets[0]));
ccb2c380 8651 Set_Cur_Node_Offset;
fac92740 8652 }
7122b237 8653#endif
830247a4 8654 RExC_emit = ptr;
a0d0e21e 8655 return(ret);
fe14fcc3
LW
8656}
8657
8658/*
cd439c50 8659- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8660*/
71207a34
AL
8661STATIC STRLEN
8662S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8663{
97aff369 8664 dVAR;
7918f24d
NC
8665
8666 PERL_ARGS_ASSERT_REGUNI;
8667
71207a34 8668 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8669}
8670
8671/*
a0d0e21e
LW
8672- reginsert - insert an operator in front of already-emitted operand
8673*
8674* Means relocating the operand.
8675*/
76e3520e 8676STATIC void
6bda09f9 8677S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8678{
97aff369 8679 dVAR;
c277df42
IZ
8680 register regnode *src;
8681 register regnode *dst;
8682 register regnode *place;
504618e9 8683 const int offset = regarglen[(U8)op];
6bda09f9 8684 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8685 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8686
8687 PERL_ARGS_ASSERT_REGINSERT;
def51078 8688 PERL_UNUSED_ARG(depth);
22c35a8c 8689/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8690 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8691 if (SIZE_ONLY) {
6bda09f9 8692 RExC_size += size;
a0d0e21e
LW
8693 return;
8694 }
a687059c 8695
830247a4 8696 src = RExC_emit;
6bda09f9 8697 RExC_emit += size;
830247a4 8698 dst = RExC_emit;
40d049e4 8699 if (RExC_open_parens) {
6bda09f9 8700 int paren;
3b57cd43 8701 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8702 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8703 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8704 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8705 RExC_open_parens[paren] += size;
8706 } else {
3b57cd43 8707 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8708 }
8709 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8710 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8711 RExC_close_parens[paren] += size;
8712 } else {
3b57cd43 8713 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8714 }
8715 }
6bda09f9 8716 }
40d049e4 8717
fac92740 8718 while (src > opnd) {
c277df42 8719 StructCopy(--src, --dst, regnode);
7122b237 8720#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8721 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8722 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8723 "reg_insert",
ccb2c380 8724 __LINE__,
13d6edb4 8725 PL_reg_name[op],
07be1b83
YO
8726 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8727 ? "Overwriting end of array!\n" : "OK",
8728 (UV)(src - RExC_emit_start),
8729 (UV)(dst - RExC_emit_start),
8730 (UV)RExC_offsets[0]));
ccb2c380
MP
8731 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8732 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8733 }
7122b237 8734#endif
fac92740
MJD
8735 }
8736
a0d0e21e
LW
8737
8738 place = opnd; /* Op node, where operand used to be. */
7122b237 8739#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8740 if (RExC_offsets) { /* MJD */
07be1b83 8741 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8742 "reginsert",
ccb2c380 8743 __LINE__,
13d6edb4 8744 PL_reg_name[op],
07be1b83 8745 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8746 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8747 (UV)(place - RExC_emit_start),
8748 (UV)(RExC_parse - RExC_start),
786e8c11 8749 (UV)RExC_offsets[0]));
ccb2c380 8750 Set_Node_Offset(place, RExC_parse);
45948336 8751 Set_Node_Length(place, 1);
fac92740 8752 }
7122b237 8753#endif
c277df42
IZ
8754 src = NEXTOPER(place);
8755 FILL_ADVANCE_NODE(place, op);
8756 Zero(src, offset, regnode);
a687059c
LW
8757}
8758
8759/*
c277df42 8760- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8761- SEE ALSO: regtail_study
a0d0e21e 8762*/
097eb12c 8763/* TODO: All three parms should be const */
76e3520e 8764STATIC void
3dab1dad 8765S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8766{
97aff369 8767 dVAR;
c277df42 8768 register regnode *scan;
72f13be8 8769 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8770
8771 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
8772#ifndef DEBUGGING
8773 PERL_UNUSED_ARG(depth);
8774#endif
a0d0e21e 8775
c277df42 8776 if (SIZE_ONLY)
a0d0e21e
LW
8777 return;
8778
8779 /* Find last node. */
8780 scan = p;
8781 for (;;) {
504618e9 8782 regnode * const temp = regnext(scan);
3dab1dad
YO
8783 DEBUG_PARSE_r({
8784 SV * const mysv=sv_newmortal();
8785 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8786 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8787 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8788 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8789 (temp == NULL ? "->" : ""),
13d6edb4 8790 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8791 );
3dab1dad
YO
8792 });
8793 if (temp == NULL)
8794 break;
8795 scan = temp;
8796 }
8797
8798 if (reg_off_by_arg[OP(scan)]) {
8799 ARG_SET(scan, val - scan);
8800 }
8801 else {
8802 NEXT_OFF(scan) = val - scan;
8803 }
8804}
8805
07be1b83 8806#ifdef DEBUGGING
3dab1dad
YO
8807/*
8808- regtail_study - set the next-pointer at the end of a node chain of p to val.
8809- Look for optimizable sequences at the same time.
8810- currently only looks for EXACT chains.
07be1b83
YO
8811
8812This is expermental code. The idea is to use this routine to perform
8813in place optimizations on branches and groups as they are constructed,
8814with the long term intention of removing optimization from study_chunk so
8815that it is purely analytical.
8816
8817Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8818to control which is which.
8819
3dab1dad
YO
8820*/
8821/* TODO: All four parms should be const */
07be1b83 8822
3dab1dad
YO
8823STATIC U8
8824S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8825{
8826 dVAR;
8827 register regnode *scan;
07be1b83
YO
8828 U8 exact = PSEUDO;
8829#ifdef EXPERIMENTAL_INPLACESCAN
8830 I32 min = 0;
8831#endif
3dab1dad
YO
8832 GET_RE_DEBUG_FLAGS_DECL;
8833
7918f24d
NC
8834 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8835
07be1b83 8836
3dab1dad
YO
8837 if (SIZE_ONLY)
8838 return exact;
8839
8840 /* Find last node. */
8841
8842 scan = p;
8843 for (;;) {
8844 regnode * const temp = regnext(scan);
07be1b83
YO
8845#ifdef EXPERIMENTAL_INPLACESCAN
8846 if (PL_regkind[OP(scan)] == EXACT)
8847 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8848 return EXACT;
8849#endif
3dab1dad
YO
8850 if ( exact ) {
8851 switch (OP(scan)) {
8852 case EXACT:
8853 case EXACTF:
8854 case EXACTFL:
8855 if( exact == PSEUDO )
8856 exact= OP(scan);
07be1b83
YO
8857 else if ( exact != OP(scan) )
8858 exact= 0;
3dab1dad
YO
8859 case NOTHING:
8860 break;
8861 default:
8862 exact= 0;
8863 }
8864 }
8865 DEBUG_PARSE_r({
8866 SV * const mysv=sv_newmortal();
8867 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8868 regprop(RExC_rx, mysv, scan);
eaf3ca90 8869 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8870 SvPV_nolen_const(mysv),
eaf3ca90 8871 REG_NODE_NUM(scan),
13d6edb4 8872 PL_reg_name[exact]);
3dab1dad 8873 });
a0d0e21e
LW
8874 if (temp == NULL)
8875 break;
8876 scan = temp;
8877 }
07be1b83
YO
8878 DEBUG_PARSE_r({
8879 SV * const mysv_val=sv_newmortal();
8880 DEBUG_PARSE_MSG("");
8881 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8882 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8883 SvPV_nolen_const(mysv_val),
8884 (IV)REG_NODE_NUM(val),
8885 (IV)(val - scan)
07be1b83
YO
8886 );
8887 });
c277df42
IZ
8888 if (reg_off_by_arg[OP(scan)]) {
8889 ARG_SET(scan, val - scan);
a0ed51b3
LW
8890 }
8891 else {
c277df42
IZ
8892 NEXT_OFF(scan) = val - scan;
8893 }
3dab1dad
YO
8894
8895 return exact;
a687059c 8896}
07be1b83 8897#endif
a687059c
LW
8898
8899/*
a687059c
LW
8900 - regcurly - a little FSA that accepts {\d+,?\d*}
8901 */
79072805 8902STATIC I32
5f66b61c 8903S_regcurly(register const char *s)
a687059c 8904{
7918f24d
NC
8905 PERL_ARGS_ASSERT_REGCURLY;
8906
a687059c
LW
8907 if (*s++ != '{')
8908 return FALSE;
f0fcb552 8909 if (!isDIGIT(*s))
a687059c 8910 return FALSE;
f0fcb552 8911 while (isDIGIT(*s))
a687059c
LW
8912 s++;
8913 if (*s == ',')
8914 s++;
f0fcb552 8915 while (isDIGIT(*s))
a687059c
LW
8916 s++;
8917 if (*s != '}')
8918 return FALSE;
8919 return TRUE;
8920}
8921
a687059c
LW
8922
8923/*
fd181c75 8924 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 8925 */
f7819f85 8926#ifdef DEBUGGING
c33269f7 8927static void
7918f24d
NC
8928S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8929{
f7819f85
A
8930 int bit;
8931 int set=0;
7918f24d 8932
f7819f85
A
8933 for (bit=0; bit<32; bit++) {
8934 if (flags & (1<<bit)) {
8935 if (!set++ && lead)
8936 PerlIO_printf(Perl_debug_log, "%s",lead);
8937 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8938 }
8939 }
8940 if (lead) {
8941 if (set)
8942 PerlIO_printf(Perl_debug_log, "\n");
8943 else
8944 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8945 }
8946}
8947#endif
8948
a687059c 8949void
097eb12c 8950Perl_regdump(pTHX_ const regexp *r)
a687059c 8951{
35ff7856 8952#ifdef DEBUGGING
97aff369 8953 dVAR;
c445ea15 8954 SV * const sv = sv_newmortal();
ab3bbdeb 8955 SV *dsv= sv_newmortal();
f8fc2ecf 8956 RXi_GET_DECL(r,ri);
f7819f85 8957 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8958
7918f24d
NC
8959 PERL_ARGS_ASSERT_REGDUMP;
8960
f8fc2ecf 8961 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8962
8963 /* Header fields of interest. */
ab3bbdeb
YO
8964 if (r->anchored_substr) {
8965 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8966 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8967 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8968 "anchored %s%s at %"IVdf" ",
8969 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8970 (IV)r->anchored_offset);
ab3bbdeb
YO
8971 } else if (r->anchored_utf8) {
8972 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8973 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8974 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8975 "anchored utf8 %s%s at %"IVdf" ",
8976 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8977 (IV)r->anchored_offset);
ab3bbdeb
YO
8978 }
8979 if (r->float_substr) {
8980 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8981 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8982 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8983 "floating %s%s at %"IVdf"..%"UVuf" ",
8984 s, RE_SV_TAIL(r->float_substr),
7b0972df 8985 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8986 } else if (r->float_utf8) {
8987 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8988 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8989 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8990 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8991 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8992 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8993 }
33b8afdf 8994 if (r->check_substr || r->check_utf8)
b81d288d 8995 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8996 (const char *)
8997 (r->check_substr == r->float_substr
8998 && r->check_utf8 == r->float_utf8
8999 ? "(checking floating" : "(checking anchored"));
bbe252da 9000 if (r->extflags & RXf_NOSCAN)
c277df42 9001 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 9002 if (r->extflags & RXf_CHECK_ALL)
c277df42 9003 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 9004 if (r->check_substr || r->check_utf8)
c277df42
IZ
9005 PerlIO_printf(Perl_debug_log, ") ");
9006
f8fc2ecf
YO
9007 if (ri->regstclass) {
9008 regprop(r, sv, ri->regstclass);
1de06328 9009 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 9010 }
bbe252da 9011 if (r->extflags & RXf_ANCH) {
774d564b 9012 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9013 if (r->extflags & RXf_ANCH_BOL)
774d564b 9014 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9015 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9016 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9017 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9018 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9019 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9020 PerlIO_printf(Perl_debug_log, "(GPOS)");
9021 PerlIO_putc(Perl_debug_log, ' ');
9022 }
bbe252da 9023 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9024 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9025 if (r->intflags & PREGf_SKIP)
760ac839 9026 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9027 if (r->intflags & PREGf_IMPLICIT)
760ac839 9028 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9029 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9030 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9031 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9032 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9033 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9034#else
7918f24d 9035 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9036 PERL_UNUSED_CONTEXT;
65e66c80 9037 PERL_UNUSED_ARG(r);
17c3b450 9038#endif /* DEBUGGING */
a687059c
LW
9039}
9040
9041/*
a0d0e21e
LW
9042- regprop - printable representation of opcode
9043*/
3339dfd8
YO
9044#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9045STMT_START { \
9046 if (do_sep) { \
9047 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9048 if (flags & ANYOF_INVERT) \
9049 /*make sure the invert info is in each */ \
9050 sv_catpvs(sv, "^"); \
9051 do_sep = 0; \
9052 } \
9053} STMT_END
9054
46fc3d4c 9055void
32fc9b6a 9056Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9057{
35ff7856 9058#ifdef DEBUGGING
97aff369 9059 dVAR;
9b155405 9060 register int k;
f8fc2ecf 9061 RXi_GET_DECL(prog,progi);
1de06328 9062 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9063
7918f24d 9064 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9065
76f68e9b 9066 sv_setpvs(sv, "");
8aa23a47 9067
03363afd 9068 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9069 /* It would be nice to FAIL() here, but this may be called from
9070 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9071 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9072 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9073
3dab1dad 9074 k = PL_regkind[OP(o)];
9b155405 9075
2a782b5b 9076 if (k == EXACT) {
f92a2122 9077 sv_catpvs(sv, " ");
ab3bbdeb
YO
9078 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9079 * is a crude hack but it may be the best for now since
9080 * we have no flag "this EXACTish node was UTF-8"
9081 * --jhi */
f92a2122
NC
9082 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9083 PERL_PV_ESCAPE_UNI_DETECT |
9084 PERL_PV_PRETTY_ELLIPSES |
9085 PERL_PV_PRETTY_LTGT |
9086 PERL_PV_PRETTY_NOCLEAR
9087 );
bb263b4e 9088 } else if (k == TRIE) {
3dab1dad 9089 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9090 * progi->data isn't available here */
1de06328 9091 const char op = OP(o);
647f639f 9092 const U32 n = ARG(o);
1de06328 9093 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9094 (reg_ac_data *)progi->data->data[n] :
1de06328 9095 NULL;
3251b653
NC
9096 const reg_trie_data * const trie
9097 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9098
13d6edb4 9099 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9100 DEBUG_TRIE_COMPILE_r(
9101 Perl_sv_catpvf(aTHX_ sv,
9102 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9103 (UV)trie->startstate,
1e2e3d02 9104 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9105 (UV)trie->wordcount,
9106 (UV)trie->minlen,
9107 (UV)trie->maxlen,
9108 (UV)TRIE_CHARCOUNT(trie),
9109 (UV)trie->uniquecharcount
9110 )
9111 );
9112 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9113 int i;
9114 int rangestart = -1;
f46cb337 9115 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9116 sv_catpvs(sv, "[");
1de06328
YO
9117 for (i = 0; i <= 256; i++) {
9118 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9119 if (rangestart == -1)
9120 rangestart = i;
9121 } else if (rangestart != -1) {
9122 if (i <= rangestart + 3)
9123 for (; rangestart < i; rangestart++)
9124 put_byte(sv, rangestart);
9125 else {
9126 put_byte(sv, rangestart);
9127 sv_catpvs(sv, "-");
9128 put_byte(sv, i - 1);
9129 }
9130 rangestart = -1;
9131 }
9132 }
f3a2811a 9133 sv_catpvs(sv, "]");
1de06328
YO
9134 }
9135
a3621e74 9136 } else if (k == CURLY) {
cb434fcc 9137 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9138 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9139 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9140 }
2c2d71f5
JH
9141 else if (k == WHILEM && o->flags) /* Ordinal/of */
9142 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9143 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9144 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9145 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9146 if ( k != REF || OP(o) < NREF) {
502c6561 9147 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9148 SV **name= av_fetch(list, ARG(o), 0 );
9149 if (name)
9150 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9151 }
9152 else {
502c6561 9153 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9154 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9155 I32 *nums=(I32*)SvPVX(sv_dat);
9156 SV **name= av_fetch(list, nums[0], 0 );
9157 I32 n;
9158 if (name) {
9159 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9160 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9161 (n ? "," : ""), (IV)nums[n]);
9162 }
9163 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9164 }
1f1031fe 9165 }
ee9b8eae 9166 }
1f1031fe 9167 } else if (k == GOSUB)
6bda09f9 9168 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9169 else if (k == VERB) {
9170 if (!o->flags)
9171 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9172 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9173 } else if (k == LOGICAL)
04ebc1ab 9174 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9175 else if (k == FOLDCHAR)
df44d732 9176 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9177 else if (k == ANYOF) {
9178 int i, rangestart = -1;
2d03de9c 9179 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9180 int do_sep = 0;
0bd48802
AL
9181
9182 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9183 static const char * const anyofs[] = {
653099ff
GS
9184 "\\w",
9185 "\\W",
9186 "\\s",
9187 "\\S",
9188 "\\d",
9189 "\\D",
9190 "[:alnum:]",
9191 "[:^alnum:]",
9192 "[:alpha:]",
9193 "[:^alpha:]",
9194 "[:ascii:]",
9195 "[:^ascii:]",
24d786f4
YO
9196 "[:cntrl:]",
9197 "[:^cntrl:]",
653099ff
GS
9198 "[:graph:]",
9199 "[:^graph:]",
9200 "[:lower:]",
9201 "[:^lower:]",
9202 "[:print:]",
9203 "[:^print:]",
9204 "[:punct:]",
9205 "[:^punct:]",
9206 "[:upper:]",
aaa51d5e 9207 "[:^upper:]",
653099ff 9208 "[:xdigit:]",
aaa51d5e
JF
9209 "[:^xdigit:]",
9210 "[:space:]",
9211 "[:^space:]",
9212 "[:blank:]",
9213 "[:^blank:]"
653099ff
GS
9214 };
9215
19860706 9216 if (flags & ANYOF_LOCALE)
396482e1 9217 sv_catpvs(sv, "{loc}");
19860706 9218 if (flags & ANYOF_FOLD)
396482e1 9219 sv_catpvs(sv, "{i}");
653099ff 9220 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9221 if (flags & ANYOF_INVERT)
396482e1 9222 sv_catpvs(sv, "^");
3339dfd8
YO
9223
9224 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9225 for (i = 0; i <= 256; i++) {
9226 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9227 if (rangestart == -1)
9228 rangestart = i;
9229 } else if (rangestart != -1) {
9230 if (i <= rangestart + 3)
9231 for (; rangestart < i; rangestart++)
653099ff 9232 put_byte(sv, rangestart);
ffc61ed2
JH
9233 else {
9234 put_byte(sv, rangestart);
396482e1 9235 sv_catpvs(sv, "-");
ffc61ed2 9236 put_byte(sv, i - 1);
653099ff 9237 }
24d786f4 9238 do_sep = 1;
ffc61ed2 9239 rangestart = -1;
653099ff 9240 }
847a199f 9241 }
3339dfd8
YO
9242
9243 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9244 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9245 if (o->flags & ANYOF_CLASS)
bb7a0f54 9246 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9247 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9248 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9249 do_sep = 1;
9250 }
9251
3339dfd8
YO
9252 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9253
9254 /* output information about the unicode matching */
ffc61ed2 9255 if (flags & ANYOF_UNICODE)
396482e1 9256 sv_catpvs(sv, "{unicode}");
1aa99e6b 9257 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9258 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9259
9260 {
9261 SV *lv;
32fc9b6a 9262 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9263
ffc61ed2
JH
9264 if (lv) {
9265 if (sw) {
89ebb4a3 9266 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9267
ffc61ed2 9268 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9269 uvchr_to_utf8(s, i);
ffc61ed2 9270
3568d838 9271 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9272 if (rangestart == -1)
9273 rangestart = i;
9274 } else if (rangestart != -1) {
ffc61ed2
JH
9275 if (i <= rangestart + 3)
9276 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9277 const U8 * const e = uvchr_to_utf8(s,rangestart);
9278 U8 *p;
9279 for(p = s; p < e; p++)
ffc61ed2
JH
9280 put_byte(sv, *p);
9281 }
9282 else {
2d03de9c
AL
9283 const U8 *e = uvchr_to_utf8(s,rangestart);
9284 U8 *p;
9285 for (p = s; p < e; p++)
ffc61ed2 9286 put_byte(sv, *p);
396482e1 9287 sv_catpvs(sv, "-");
2d03de9c
AL
9288 e = uvchr_to_utf8(s, i-1);
9289 for (p = s; p < e; p++)
1df70142 9290 put_byte(sv, *p);
ffc61ed2
JH
9291 }
9292 rangestart = -1;
9293 }
19860706 9294 }
ffc61ed2 9295
396482e1 9296 sv_catpvs(sv, "..."); /* et cetera */
19860706 9297 }
fde631ed 9298
ffc61ed2 9299 {
2e0de35c 9300 char *s = savesvpv(lv);
c445ea15 9301 char * const origs = s;
b81d288d 9302
3dab1dad
YO
9303 while (*s && *s != '\n')
9304 s++;
b81d288d 9305
ffc61ed2 9306 if (*s == '\n') {
2d03de9c 9307 const char * const t = ++s;
ffc61ed2
JH
9308
9309 while (*s) {
9310 if (*s == '\n')
9311 *s = ' ';
9312 s++;
9313 }
9314 if (s[-1] == ' ')
9315 s[-1] = 0;
9316
9317 sv_catpv(sv, t);
fde631ed 9318 }
b81d288d 9319
ffc61ed2 9320 Safefree(origs);
fde631ed
JH
9321 }
9322 }
653099ff 9323 }
ffc61ed2 9324
653099ff
GS
9325 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9326 }
9b155405 9327 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9328 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9329#else
96a5add6 9330 PERL_UNUSED_CONTEXT;
65e66c80
SP
9331 PERL_UNUSED_ARG(sv);
9332 PERL_UNUSED_ARG(o);
f9049ba1 9333 PERL_UNUSED_ARG(prog);
17c3b450 9334#endif /* DEBUGGING */
35ff7856 9335}
a687059c 9336
cad2e5aa 9337SV *
288b8c02 9338Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9339{ /* Assume that RE_INTUIT is set */
97aff369 9340 dVAR;
288b8c02 9341 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9342 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9343
9344 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9345 PERL_UNUSED_CONTEXT;
9346
a3621e74 9347 DEBUG_COMPILE_r(
cfd0369c 9348 {
2d03de9c 9349 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9350 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9351
9352 if (!PL_colorset) reginitcolors();
9353 PerlIO_printf(Perl_debug_log,
a0288114 9354 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9355 PL_colors[4],
9356 prog->check_substr ? "" : "utf8 ",
9357 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9358 s,
9359 PL_colors[1],
9360 (strlen(s) > 60 ? "..." : ""));
9361 } );
9362
33b8afdf 9363 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9364}
9365
84da74a7 9366/*
f8149455 9367 pregfree()
84da74a7 9368
f8149455
YO
9369 handles refcounting and freeing the perl core regexp structure. When
9370 it is necessary to actually free the structure the first thing it
9371 does is call the 'free' method of the regexp_engine associated to to
9372 the regexp, allowing the handling of the void *pprivate; member
9373 first. (This routine is not overridable by extensions, which is why
9374 the extensions free is called first.)
9375
9376 See regdupe and regdupe_internal if you change anything here.
84da74a7 9377*/
f8149455 9378#ifndef PERL_IN_XSUB_RE
2b69d0c2 9379void
84679df5 9380Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9381{
288b8c02
NC
9382 SvREFCNT_dec(r);
9383}
9384
9385void
9386Perl_pregfree2(pTHX_ REGEXP *rx)
9387{
27da23d5 9388 dVAR;
288b8c02 9389 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9390 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9391
7918f24d
NC
9392 PERL_ARGS_ASSERT_PREGFREE2;
9393
28d8d7f4
YO
9394 if (r->mother_re) {
9395 ReREFCNT_dec(r->mother_re);
9396 } else {
288b8c02 9397 CALLREGFREE_PVT(rx); /* free the private data */
5daac39c
NC
9398 if (RXp_PAREN_NAMES(r))
9399 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9400 }
9401 if (r->substrs) {
9402 if (r->anchored_substr)
9403 SvREFCNT_dec(r->anchored_substr);
9404 if (r->anchored_utf8)
9405 SvREFCNT_dec(r->anchored_utf8);
9406 if (r->float_substr)
9407 SvREFCNT_dec(r->float_substr);
9408 if (r->float_utf8)
9409 SvREFCNT_dec(r->float_utf8);
9410 Safefree(r->substrs);
9411 }
288b8c02 9412 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9413#ifdef PERL_OLD_COPY_ON_WRITE
ed252734 9414 if (r->saved_copy)
28d8d7f4 9415 SvREFCNT_dec(r->saved_copy);
ed252734 9416#endif
f0ab9afb
NC
9417 Safefree(r->swap);
9418 Safefree(r->offs);
f8149455 9419}
28d8d7f4
YO
9420
9421/* reg_temp_copy()
9422
9423 This is a hacky workaround to the structural issue of match results
9424 being stored in the regexp structure which is in turn stored in
9425 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9426 could be PL_curpm in multiple contexts, and could require multiple
9427 result sets being associated with the pattern simultaneously, such
9428 as when doing a recursive match with (??{$qr})
9429
9430 The solution is to make a lightweight copy of the regexp structure
9431 when a qr// is returned from the code executed by (??{$qr}) this
9432 lightweight copy doesnt actually own any of its data except for
9433 the starp/end and the actual regexp structure itself.
9434
9435*/
9436
9437
84679df5 9438REGEXP *
7918f24d
NC
9439Perl_reg_temp_copy (pTHX_ REGEXP *rx)
9440{
d2f13c59 9441 REGEXP *ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02
NC
9442 struct regexp *ret = (struct regexp *)SvANY(ret_x);
9443 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9444 register const I32 npar = r->nparens+1;
7918f24d
NC
9445
9446 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9447
288b8c02 9448 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9449 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9450 by pointing directly at the buffer, but flagging that the allocated
9451 space in the copy is zero. As we've just done a struct copy, it's now
9452 a case of zero-ing that, rather than copying the current length. */
9453 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9454 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
08e44740 9455 StructCopy(&(r->xpv_cur), &(ret->xpv_cur), struct regexp_allocated);
f7c278bf 9456 SvLEN_set(ret_x, 0);
f0ab9afb
NC
9457 Newx(ret->offs, npar, regexp_paren_pair);
9458 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9459 if (r->substrs) {
28d8d7f4 9460 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9461 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9462
9463 SvREFCNT_inc_void(ret->anchored_substr);
9464 SvREFCNT_inc_void(ret->anchored_utf8);
9465 SvREFCNT_inc_void(ret->float_substr);
9466 SvREFCNT_inc_void(ret->float_utf8);
9467
9468 /* check_substr and check_utf8, if non-NULL, point to either their
9469 anchored or float namesakes, and don't hold a second reference. */
486913e4 9470 }
288b8c02 9471 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9472#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9473 ret->saved_copy = NULL;
28d8d7f4 9474#endif
288b8c02 9475 ret->mother_re = rx;
28d8d7f4
YO
9476 ret->swap = NULL;
9477
288b8c02 9478 return ret_x;
28d8d7f4 9479}
f8149455
YO
9480#endif
9481
9482/* regfree_internal()
9483
9484 Free the private data in a regexp. This is overloadable by
9485 extensions. Perl takes care of the regexp structure in pregfree(),
9486 this covers the *pprivate pointer which technically perldoesnt
9487 know about, however of course we have to handle the
9488 regexp_internal structure when no extension is in use.
9489
9490 Note this is called before freeing anything in the regexp
9491 structure.
9492 */
9493
9494void
288b8c02 9495Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9496{
9497 dVAR;
288b8c02 9498 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9499 RXi_GET_DECL(r,ri);
9500 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9501
9502 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9503
f8149455
YO
9504 DEBUG_COMPILE_r({
9505 if (!PL_colorset)
9506 reginitcolors();
9507 {
9508 SV *dsv= sv_newmortal();
3c8556c3 9509 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9510 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9511 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9512 PL_colors[4],PL_colors[5],s);
9513 }
9514 });
7122b237
YO
9515#ifdef RE_TRACK_PATTERN_OFFSETS
9516 if (ri->u.offsets)
9517 Safefree(ri->u.offsets); /* 20010421 MJD */
9518#endif
f8fc2ecf
YO
9519 if (ri->data) {
9520 int n = ri->data->count;
f3548bdc
DM
9521 PAD* new_comppad = NULL;
9522 PAD* old_comppad;
4026c95a 9523 PADOFFSET refcnt;
dfad63ad 9524
c277df42 9525 while (--n >= 0) {
261faec3 9526 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9527 switch (ri->data->what[n]) {
c277df42 9528 case 's':
81714fb9 9529 case 'S':
55eed653 9530 case 'u':
ad64d0ec 9531 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9532 break;
653099ff 9533 case 'f':
f8fc2ecf 9534 Safefree(ri->data->data[n]);
653099ff 9535 break;
dfad63ad 9536 case 'p':
502c6561 9537 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9538 break;
c277df42 9539 case 'o':
dfad63ad 9540 if (new_comppad == NULL)
cea2e8a9 9541 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9542 PAD_SAVE_LOCAL(old_comppad,
9543 /* Watch out for global destruction's random ordering. */
c445ea15 9544 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9545 );
b34c0dd4 9546 OP_REFCNT_LOCK;
f8fc2ecf 9547 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9548 OP_REFCNT_UNLOCK;
9549 if (!refcnt)
f8fc2ecf 9550 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9551
f3548bdc 9552 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9553 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9554 new_comppad = NULL;
c277df42
IZ
9555 break;
9556 case 'n':
9e55ce06 9557 break;
07be1b83 9558 case 'T':
be8e71aa
YO
9559 { /* Aho Corasick add-on structure for a trie node.
9560 Used in stclass optimization only */
07be1b83 9561 U32 refcount;
f8fc2ecf 9562 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9563 OP_REFCNT_LOCK;
9564 refcount = --aho->refcount;
9565 OP_REFCNT_UNLOCK;
9566 if ( !refcount ) {
446bd890
NC
9567 PerlMemShared_free(aho->states);
9568 PerlMemShared_free(aho->fail);
446bd890
NC
9569 /* do this last!!!! */
9570 PerlMemShared_free(ri->data->data[n]);
9571 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9572 }
9573 }
9574 break;
a3621e74 9575 case 't':
07be1b83 9576 {
be8e71aa 9577 /* trie structure. */
07be1b83 9578 U32 refcount;
f8fc2ecf 9579 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9580 OP_REFCNT_LOCK;
9581 refcount = --trie->refcount;
9582 OP_REFCNT_UNLOCK;
9583 if ( !refcount ) {
446bd890 9584 PerlMemShared_free(trie->charmap);
446bd890
NC
9585 PerlMemShared_free(trie->states);
9586 PerlMemShared_free(trie->trans);
07be1b83 9587 if (trie->bitmap)
446bd890 9588 PerlMemShared_free(trie->bitmap);
07be1b83 9589 if (trie->wordlen)
446bd890 9590 PerlMemShared_free(trie->wordlen);
786e8c11 9591 if (trie->jump)
446bd890 9592 PerlMemShared_free(trie->jump);
786e8c11 9593 if (trie->nextword)
446bd890 9594 PerlMemShared_free(trie->nextword);
446bd890
NC
9595 /* do this last!!!! */
9596 PerlMemShared_free(ri->data->data[n]);
a3621e74 9597 }
07be1b83
YO
9598 }
9599 break;
c277df42 9600 default:
f8fc2ecf 9601 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9602 }
9603 }
f8fc2ecf
YO
9604 Safefree(ri->data->what);
9605 Safefree(ri->data);
a0d0e21e 9606 }
28d8d7f4 9607
f8fc2ecf 9608 Safefree(ri);
a687059c 9609}
c277df42 9610
84da74a7 9611#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
502c6561 9612#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
85fbaab2 9613#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
84da74a7
YO
9614#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9615
9616/*
32cd70f6 9617 re_dup - duplicate a regexp.
84da74a7 9618
8233f606
DM
9619 This routine is expected to clone a given regexp structure. It is only
9620 compiled under USE_ITHREADS.
32cd70f6 9621
f8149455
YO
9622 After all of the core data stored in struct regexp is duplicated
9623 the regexp_engine.dupe method is used to copy any private data
9624 stored in the *pprivate pointer. This allows extensions to handle
9625 any duplication it needs to do.
9626
9627 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9628*/
a3c0e9ca 9629#if defined(USE_ITHREADS)
f8149455 9630#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9631void
9632Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9633{
84da74a7 9634 dVAR;
a86a1ca7 9635 I32 npar;
288b8c02
NC
9636 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9637 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9638
7918f24d
NC
9639 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9640
84da74a7 9641 npar = r->nparens+1;
f0ab9afb
NC
9642 Newx(ret->offs, npar, regexp_paren_pair);
9643 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9644 if(ret->swap) {
28d8d7f4 9645 /* no need to copy these */
f0ab9afb 9646 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9647 }
84da74a7 9648
6057429f 9649 if (ret->substrs) {
32cd70f6
NC
9650 /* Do it this way to avoid reading from *r after the StructCopy().
9651 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9652 cache, it doesn't matter. */
66b1de87
NC
9653 const bool anchored = r->check_substr
9654 ? r->check_substr == r->anchored_substr
9655 : r->check_utf8 == r->anchored_utf8;
785a26d5 9656 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9657 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9658
32cd70f6
NC
9659 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9660 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9661 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9662 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9663
32cd70f6
NC
9664 /* check_substr and check_utf8, if non-NULL, point to either their
9665 anchored or float namesakes, and don't hold a second reference. */
9666
9667 if (ret->check_substr) {
9668 if (anchored) {
9669 assert(r->check_utf8 == r->anchored_utf8);
9670 ret->check_substr = ret->anchored_substr;
9671 ret->check_utf8 = ret->anchored_utf8;
9672 } else {
9673 assert(r->check_substr == r->float_substr);
9674 assert(r->check_utf8 == r->float_utf8);
9675 ret->check_substr = ret->float_substr;
9676 ret->check_utf8 = ret->float_utf8;
9677 }
66b1de87
NC
9678 } else if (ret->check_utf8) {
9679 if (anchored) {
9680 ret->check_utf8 = ret->anchored_utf8;
9681 } else {
9682 ret->check_utf8 = ret->float_utf8;
9683 }
32cd70f6 9684 }
6057429f 9685 }
f8149455 9686
5daac39c 9687 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9688
6057429f 9689 if (ret->pprivate)
288b8c02 9690 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9691
288b8c02 9692 if (RX_MATCH_COPIED(dstr))
6057429f 9693 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9694 else
9695 ret->subbeg = NULL;
9696#ifdef PERL_OLD_COPY_ON_WRITE
9697 ret->saved_copy = NULL;
9698#endif
6057429f
NC
9699
9700 ret->mother_re = NULL;
9701 ret->gofs = 0;
f8149455
YO
9702}
9703#endif /* PERL_IN_XSUB_RE */
9704
9705/*
9706 regdupe_internal()
9707
9708 This is the internal complement to regdupe() which is used to copy
9709 the structure pointed to by the *pprivate pointer in the regexp.
9710 This is the core version of the extension overridable cloning hook.
9711 The regexp structure being duplicated will be copied by perl prior
9712 to this and will be provided as the regexp *r argument, however
9713 with the /old/ structures pprivate pointer value. Thus this routine
9714 may override any copying normally done by perl.
9715
9716 It returns a pointer to the new regexp_internal structure.
9717*/
9718
9719void *
288b8c02 9720Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9721{
9722 dVAR;
288b8c02 9723 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9724 regexp_internal *reti;
9725 int len, npar;
9726 RXi_GET_DECL(r,ri);
7918f24d
NC
9727
9728 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
9729
9730 npar = r->nparens+1;
7122b237 9731 len = ProgLen(ri);
f8149455 9732
45cf4570 9733 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
9734 Copy(ri->program, reti->program, len+1, regnode);
9735
f8149455 9736
f8fc2ecf 9737 reti->regstclass = NULL;
bcdf7404 9738
f8fc2ecf 9739 if (ri->data) {
84da74a7 9740 struct reg_data *d;
f8fc2ecf 9741 const int count = ri->data->count;
84da74a7
YO
9742 int i;
9743
9744 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9745 char, struct reg_data);
9746 Newx(d->what, count, U8);
9747
9748 d->count = count;
9749 for (i = 0; i < count; i++) {
f8fc2ecf 9750 d->what[i] = ri->data->what[i];
84da74a7 9751 switch (d->what[i]) {
55eed653 9752 /* legal options are one of: sSfpontTu
84da74a7
YO
9753 see also regcomp.h and pregfree() */
9754 case 's':
81714fb9 9755 case 'S':
0536c0a7 9756 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9757 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 9758 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 9759 break;
84da74a7
YO
9760 case 'f':
9761 /* This is cheating. */
9762 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9763 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9764 struct regnode_charclass_class);
f8fc2ecf 9765 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9766 break;
9767 case 'o':
bbe252da
YO
9768 /* Compiled op trees are readonly and in shared memory,
9769 and can thus be shared without duplication. */
84da74a7 9770 OP_REFCNT_LOCK;
f8fc2ecf 9771 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9772 OP_REFCNT_UNLOCK;
9773 break;
23eab42c
NC
9774 case 'T':
9775 /* Trie stclasses are readonly and can thus be shared
9776 * without duplication. We free the stclass in pregfree
9777 * when the corresponding reg_ac_data struct is freed.
9778 */
9779 reti->regstclass= ri->regstclass;
9780 /* Fall through */
84da74a7 9781 case 't':
84da74a7 9782 OP_REFCNT_LOCK;
0536c0a7 9783 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9784 OP_REFCNT_UNLOCK;
0536c0a7
NC
9785 /* Fall through */
9786 case 'n':
9787 d->data[i] = ri->data->data[i];
84da74a7 9788 break;
84da74a7 9789 default:
f8fc2ecf 9790 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9791 }
9792 }
9793
f8fc2ecf 9794 reti->data = d;
84da74a7
YO
9795 }
9796 else
f8fc2ecf 9797 reti->data = NULL;
84da74a7 9798
cde0cee5
YO
9799 reti->name_list_idx = ri->name_list_idx;
9800
7122b237
YO
9801#ifdef RE_TRACK_PATTERN_OFFSETS
9802 if (ri->u.offsets) {
9803 Newx(reti->u.offsets, 2*len+1, U32);
9804 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9805 }
9806#else
9807 SetProgLen(reti,len);
9808#endif
9809
f8149455 9810 return (void*)reti;
84da74a7 9811}
f8149455
YO
9812
9813#endif /* USE_ITHREADS */
84da74a7 9814
f8149455 9815#ifndef PERL_IN_XSUB_RE
bcdf7404 9816
c277df42
IZ
9817/*
9818 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9819 */
9820regnode *
864dbfa3 9821Perl_regnext(pTHX_ register regnode *p)
c277df42 9822{
97aff369 9823 dVAR;
c277df42
IZ
9824 register I32 offset;
9825
f8fc2ecf 9826 if (!p)
c277df42
IZ
9827 return(NULL);
9828
9829 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9830 if (offset == 0)
9831 return(NULL);
9832
c277df42 9833 return(p+offset);
c277df42 9834}
76234dfb 9835#endif
c277df42 9836
01f988be 9837STATIC void
cea2e8a9 9838S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9839{
9840 va_list args;
9841 STRLEN l1 = strlen(pat1);
9842 STRLEN l2 = strlen(pat2);
9843 char buf[512];
06bf62c7 9844 SV *msv;
73d840c0 9845 const char *message;
c277df42 9846
7918f24d
NC
9847 PERL_ARGS_ASSERT_RE_CROAK2;
9848
c277df42
IZ
9849 if (l1 > 510)
9850 l1 = 510;
9851 if (l1 + l2 > 510)
9852 l2 = 510 - l1;
9853 Copy(pat1, buf, l1 , char);
9854 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9855 buf[l1 + l2] = '\n';
9856 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9857#ifdef I_STDARG
9858 /* ANSI variant takes additional second argument */
c277df42 9859 va_start(args, pat2);
8736538c
AS
9860#else
9861 va_start(args);
9862#endif
5a844595 9863 msv = vmess(buf, &args);
c277df42 9864 va_end(args);
cfd0369c 9865 message = SvPV_const(msv,l1);
c277df42
IZ
9866 if (l1 > 512)
9867 l1 = 512;
9868 Copy(message, buf, l1 , char);
197cf9b9 9869 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9870 Perl_croak(aTHX_ "%s", buf);
c277df42 9871}
a0ed51b3
LW
9872
9873/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9874
76234dfb 9875#ifndef PERL_IN_XSUB_RE
a0ed51b3 9876void
864dbfa3 9877Perl_save_re_context(pTHX)
b81d288d 9878{
97aff369 9879 dVAR;
1ade1aa1
NC
9880
9881 struct re_save_state *state;
9882
9883 SAVEVPTR(PL_curcop);
9884 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9885
9886 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9887 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9888 SSPUSHINT(SAVEt_RE_STATE);
9889
46ab3289 9890 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9891
a0ed51b3 9892 PL_reg_start_tmp = 0;
a0ed51b3 9893 PL_reg_start_tmpl = 0;
c445ea15 9894 PL_reg_oldsaved = NULL;
a5db57d6 9895 PL_reg_oldsavedlen = 0;
a5db57d6 9896 PL_reg_maxiter = 0;
a5db57d6 9897 PL_reg_leftiter = 0;
c445ea15 9898 PL_reg_poscache = NULL;
a5db57d6 9899 PL_reg_poscache_size = 0;
1ade1aa1
NC
9900#ifdef PERL_OLD_COPY_ON_WRITE
9901 PL_nrs = NULL;
9902#endif
ada6e8a9 9903
c445ea15
AL
9904 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9905 if (PL_curpm) {
9906 const REGEXP * const rx = PM_GETRE(PL_curpm);
9907 if (rx) {
1df70142 9908 U32 i;
07bc277f 9909 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 9910 char digits[TYPE_CHARS(long)];
d9fad198 9911 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9912 GV *const *const gvp
9913 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9914
b37c2d43
AL
9915 if (gvp) {
9916 GV * const gv = *gvp;
9917 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9918 save_scalar(gv);
49f27e4b 9919 }
ada6e8a9
AMS
9920 }
9921 }
9922 }
a0ed51b3 9923}
76234dfb 9924#endif
51371543 9925
51371543 9926static void
acfe0abc 9927clear_re(pTHX_ void *r)
51371543 9928{
97aff369 9929 dVAR;
84679df5 9930 ReREFCNT_dec((REGEXP *)r);
51371543 9931}
ffbc6a93 9932
a28509cc
AL
9933#ifdef DEBUGGING
9934
9935STATIC void
9936S_put_byte(pTHX_ SV *sv, int c)
9937{
7918f24d
NC
9938 PERL_ARGS_ASSERT_PUT_BYTE;
9939
7fddd944
NC
9940 /* Our definition of isPRINT() ignores locales, so only bytes that are
9941 not part of UTF-8 are considered printable. I assume that the same
9942 holds for UTF-EBCDIC.
9943 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9944 which Wikipedia says:
9945
9946 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9947 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9948 identical, to the ASCII delete (DEL) or rubout control character.
9949 ) So the old condition can be simplified to !isPRINT(c) */
9950 if (!isPRINT(c))
a28509cc 9951 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 9952 else {
88c9ea1e 9953 const char string = c;
5e7aa789
NC
9954 if (c == '-' || c == ']' || c == '\\' || c == '^')
9955 sv_catpvs(sv, "\\");
9956 sv_catpvn(sv, &string, 1);
9957 }
a28509cc
AL
9958}
9959
786e8c11 9960
3dab1dad
YO
9961#define CLEAR_OPTSTART \
9962 if (optstart) STMT_START { \
70685ca0 9963 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9964 optstart=NULL; \
9965 } STMT_END
9966
786e8c11 9967#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9968
b5a2f8d8
NC
9969STATIC const regnode *
9970S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9971 const regnode *last, const regnode *plast,
9972 SV* sv, I32 indent, U32 depth)
a28509cc 9973{
97aff369 9974 dVAR;
786e8c11 9975 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9976 register const regnode *next;
3dab1dad 9977 const regnode *optstart= NULL;
1f1031fe 9978
f8fc2ecf 9979 RXi_GET_DECL(r,ri);
3dab1dad 9980 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9981
9982 PERL_ARGS_ASSERT_DUMPUNTIL;
9983
786e8c11
YO
9984#ifdef DEBUG_DUMPUNTIL
9985 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9986 last ? last-start : 0,plast ? plast-start : 0);
9987#endif
9988
9989 if (plast && plast < last)
9990 last= plast;
9991
9992 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9993 /* While that wasn't END last time... */
a28509cc
AL
9994 NODE_ALIGN(node);
9995 op = OP(node);
de734bd5 9996 if (op == CLOSE || op == WHILEM)
786e8c11 9997 indent--;
b5a2f8d8 9998 next = regnext((regnode *)node);
1f1031fe 9999
a28509cc 10000 /* Where, what. */
8e11feef 10001 if (OP(node) == OPTIMIZED) {
e68ec53f 10002 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10003 optstart = node;
3dab1dad 10004 else
8e11feef 10005 goto after_print;
3dab1dad
YO
10006 } else
10007 CLEAR_OPTSTART;
1f1031fe 10008
32fc9b6a 10009 regprop(r, sv, node);
a28509cc 10010 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10011 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10012
10013 if (OP(node) != OPTIMIZED) {
10014 if (next == NULL) /* Next ptr. */
10015 PerlIO_printf(Perl_debug_log, " (0)");
10016 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10017 PerlIO_printf(Perl_debug_log, " (FAIL)");
10018 else
10019 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10020 (void)PerlIO_putc(Perl_debug_log, '\n');
10021 }
10022
a28509cc
AL
10023 after_print:
10024 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10025 assert(next);
10026 {
10027 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10028 ? regnext((regnode *)next)
10029 : next);
be8e71aa
YO
10030 if (last && nnode > last)
10031 nnode = last;
786e8c11 10032 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10033 }
a28509cc
AL
10034 }
10035 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10036 assert(next);
786e8c11 10037 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10038 }
10039 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10040 const regnode *this_trie = node;
1de06328 10041 const char op = OP(node);
647f639f 10042 const U32 n = ARG(node);
1de06328 10043 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10044 (reg_ac_data *)ri->data->data[n] :
1de06328 10045 NULL;
3251b653
NC
10046 const reg_trie_data * const trie =
10047 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10048#ifdef DEBUGGING
502c6561 10049 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10050#endif
786e8c11 10051 const regnode *nextbranch= NULL;
a28509cc 10052 I32 word_idx;
76f68e9b 10053 sv_setpvs(sv, "");
786e8c11 10054 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10055 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10056
10057 PerlIO_printf(Perl_debug_log, "%*s%s ",
10058 (int)(2*(indent+3)), "",
10059 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10060 PL_colors[0], PL_colors[1],
10061 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10062 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10063 PERL_PV_PRETTY_LTGT
786e8c11
YO
10064 )
10065 : "???"
10066 );
10067 if (trie->jump) {
40d049e4 10068 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10069 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10070 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10071 if (dist) {
10072 if (!nextbranch)
24b23f37 10073 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10074 DUMPUNTIL(this_trie + dist, nextbranch);
10075 }
786e8c11
YO
10076 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10077 nextbranch= regnext((regnode *)nextbranch);
10078 } else {
10079 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10080 }
786e8c11
YO
10081 }
10082 if (last && next > last)
10083 node= last;
10084 else
10085 node= next;
a28509cc 10086 }
786e8c11
YO
10087 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10088 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10089 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10090 }
10091 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10092 assert(next);
786e8c11 10093 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10094 }
10095 else if ( op == PLUS || op == STAR) {
786e8c11 10096 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10097 }
10098 else if (op == ANYOF) {
10099 /* arglen 1 + class block */
10100 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10101 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10102 node = NEXTOPER(node);
10103 }
10104 else if (PL_regkind[(U8)op] == EXACT) {
10105 /* Literal string, where present. */
10106 node += NODE_SZ_STR(node) - 1;
10107 node = NEXTOPER(node);
10108 }
10109 else {
10110 node = NEXTOPER(node);
10111 node += regarglen[(U8)op];
10112 }
10113 if (op == CURLYX || op == OPEN)
786e8c11 10114 indent++;
a28509cc 10115 }
3dab1dad 10116 CLEAR_OPTSTART;
786e8c11 10117#ifdef DEBUG_DUMPUNTIL
70685ca0 10118 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10119#endif
1de06328 10120 return node;
a28509cc
AL
10121}
10122
10123#endif /* DEBUGGING */
10124
241d1a3b
NC
10125/*
10126 * Local variables:
10127 * c-indentation-style: bsd
10128 * c-basic-offset: 4
10129 * indent-tabs-mode: t
10130 * End:
10131 *
37442d52
RGS
10132 * ex: set ts=8 sts=4 sw=4 noet:
10133 */