This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'blead' of ssh://perl5.git.perl.org/gitroot/perl into blead
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73
AD
29/* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
d4cce5f1 88#ifdef op
11343788 89#undef op
d4cce5f1 90#endif /* op */
11343788 91
fe14fcc3 92#ifdef MSDOS
7e4e8c89 93# if defined(BUGGY_MSC6)
fe14fcc3 94 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 95# pragma optimize("a",off)
fe14fcc3 96 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
97# pragma optimize("w",on )
98# endif /* BUGGY_MSC6 */
fe14fcc3
LW
99#endif /* MSDOS */
100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
830247a4 105typedef struct RExC_state_t {
e2509266 106 U32 flags; /* are we folding, multilining? */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 111 char *start; /* Start of input for compile */
830247a4
IZ
112 char *end; /* End of input for compile */
113 char *parse; /* Input-scan pointer. */
114 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 115 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 116 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 117 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
118 I32 naughty; /* How bad is this pattern? */
119 I32 sawback; /* Did we see \1, ...? */
120 U32 seen;
121 I32 size; /* Code size. */
c74340f9
YO
122 I32 npar; /* Capture buffer count, (OPEN). */
123 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 124 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
125 I32 extralen;
126 I32 seen_zerolen;
127 I32 seen_evals;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
81714fb9 135 HV *paren_names; /* Paren names */
1f1031fe 136
40d049e4
YO
137 regnode **recurse; /* Recurse regops */
138 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
139#if ADD_TO_REGEXEC
140 char *starttry; /* -Dr: where regtry was called. */
141#define RExC_starttry (pRExC_state->starttry)
142#endif
3dab1dad 143#ifdef DEBUGGING
be8e71aa 144 const char *lastparse;
3dab1dad 145 I32 lastnum;
1f1031fe 146 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
147#define RExC_lastparse (pRExC_state->lastparse)
148#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 149#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 150#endif
830247a4
IZ
151} RExC_state_t;
152
e2509266 153#define RExC_flags (pRExC_state->flags)
830247a4 154#define RExC_precomp (pRExC_state->precomp)
288b8c02 155#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 156#define RExC_rx (pRExC_state->rx)
f8fc2ecf 157#define RExC_rxi (pRExC_state->rxi)
fac92740 158#define RExC_start (pRExC_state->start)
830247a4
IZ
159#define RExC_end (pRExC_state->end)
160#define RExC_parse (pRExC_state->parse)
161#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
162#ifdef RE_TRACK_PATTERN_OFFSETS
163#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
164#endif
830247a4 165#define RExC_emit (pRExC_state->emit)
fac92740 166#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 167#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
168#define RExC_naughty (pRExC_state->naughty)
169#define RExC_sawback (pRExC_state->sawback)
170#define RExC_seen (pRExC_state->seen)
171#define RExC_size (pRExC_state->size)
172#define RExC_npar (pRExC_state->npar)
e2e6a0f1 173#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
174#define RExC_extralen (pRExC_state->extralen)
175#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
176#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 177#define RExC_utf8 (pRExC_state->utf8)
02daf0ab 178#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
179#define RExC_open_parens (pRExC_state->open_parens)
180#define RExC_close_parens (pRExC_state->close_parens)
181#define RExC_opend (pRExC_state->opend)
81714fb9 182#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
183#define RExC_recurse (pRExC_state->recurse)
184#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 185
cde0cee5 186
a687059c
LW
187#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
188#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
189 ((*s) == '{' && regcurly(s)))
a687059c 190
35c8bce7
LW
191#ifdef SPSTART
192#undef SPSTART /* dratted cpp namespace... */
193#endif
a687059c
LW
194/*
195 * Flags to be passed up and down.
196 */
a687059c 197#define WORST 0 /* Worst case. */
a3b492c3
YO
198#define HASWIDTH 0x01 /* Known to match non-null strings. */
199#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
200#define SPSTART 0x04 /* Starts with * or +. */
201#define TRYAGAIN 0x08 /* Weeded out a declaration. */
202#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 203
3dab1dad
YO
204#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
205
07be1b83
YO
206/* whether trie related optimizations are enabled */
207#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
208#define TRIE_STUDY_OPT
786e8c11 209#define FULL_TRIE_STUDY
07be1b83
YO
210#define TRIE_STCLASS
211#endif
1de06328
YO
212
213
40d049e4
YO
214
215#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
216#define PBITVAL(paren) (1 << ((paren) & 7))
217#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
218#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
219#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
220
221
1de06328
YO
222/* About scan_data_t.
223
224 During optimisation we recurse through the regexp program performing
225 various inplace (keyhole style) optimisations. In addition study_chunk
226 and scan_commit populate this data structure with information about
227 what strings MUST appear in the pattern. We look for the longest
228 string that must appear for at a fixed location, and we look for the
229 longest string that may appear at a floating location. So for instance
230 in the pattern:
231
232 /FOO[xX]A.*B[xX]BAR/
233
234 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
235 strings (because they follow a .* construct). study_chunk will identify
236 both FOO and BAR as being the longest fixed and floating strings respectively.
237
238 The strings can be composites, for instance
239
240 /(f)(o)(o)/
241
242 will result in a composite fixed substring 'foo'.
243
244 For each string some basic information is maintained:
245
246 - offset or min_offset
247 This is the position the string must appear at, or not before.
248 It also implicitly (when combined with minlenp) tells us how many
249 character must match before the string we are searching.
250 Likewise when combined with minlenp and the length of the string
251 tells us how many characters must appear after the string we have
252 found.
253
254 - max_offset
255 Only used for floating strings. This is the rightmost point that
256 the string can appear at. Ifset to I32 max it indicates that the
257 string can occur infinitely far to the right.
258
259 - minlenp
260 A pointer to the minimum length of the pattern that the string
261 was found inside. This is important as in the case of positive
262 lookahead or positive lookbehind we can have multiple patterns
263 involved. Consider
264
265 /(?=FOO).*F/
266
267 The minimum length of the pattern overall is 3, the minimum length
268 of the lookahead part is 3, but the minimum length of the part that
269 will actually match is 1. So 'FOO's minimum length is 3, but the
270 minimum length for the F is 1. This is important as the minimum length
271 is used to determine offsets in front of and behind the string being
272 looked for. Since strings can be composites this is the length of the
273 pattern at the time it was commited with a scan_commit. Note that
274 the length is calculated by study_chunk, so that the minimum lengths
275 are not known until the full pattern has been compiled, thus the
276 pointer to the value.
277
278 - lookbehind
279
280 In the case of lookbehind the string being searched for can be
281 offset past the start point of the final matching string.
282 If this value was just blithely removed from the min_offset it would
283 invalidate some of the calculations for how many chars must match
284 before or after (as they are derived from min_offset and minlen and
285 the length of the string being searched for).
286 When the final pattern is compiled and the data is moved from the
287 scan_data_t structure into the regexp structure the information
288 about lookbehind is factored in, with the information that would
289 have been lost precalculated in the end_shift field for the
290 associated string.
291
292 The fields pos_min and pos_delta are used to store the minimum offset
293 and the delta to the maximum offset at the current point in the pattern.
294
295*/
2c2d71f5
JH
296
297typedef struct scan_data_t {
1de06328
YO
298 /*I32 len_min; unused */
299 /*I32 len_delta; unused */
2c2d71f5
JH
300 I32 pos_min;
301 I32 pos_delta;
302 SV *last_found;
1de06328 303 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
304 I32 last_start_min;
305 I32 last_start_max;
1de06328
YO
306 SV **longest; /* Either &l_fixed, or &l_float. */
307 SV *longest_fixed; /* longest fixed string found in pattern */
308 I32 offset_fixed; /* offset where it starts */
309 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
310 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
311 SV *longest_float; /* longest floating string found in pattern */
312 I32 offset_float_min; /* earliest point in string it can appear */
313 I32 offset_float_max; /* latest point in string it can appear */
314 I32 *minlen_float; /* pointer to the minlen relevent to the string */
315 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
316 I32 flags;
317 I32 whilem_c;
cb434fcc 318 I32 *last_closep;
653099ff 319 struct regnode_charclass_class *start_class;
2c2d71f5
JH
320} scan_data_t;
321
a687059c 322/*
e50aee73 323 * Forward declarations for pregcomp()'s friends.
a687059c 324 */
a0d0e21e 325
27da23d5 326static const scan_data_t zero_scan_data =
1de06328 327 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
328
329#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
330#define SF_BEFORE_SEOL 0x0001
331#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
332#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
333#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
334
09b7f37c
CB
335#ifdef NO_UNARY_PLUS
336# define SF_FIX_SHIFT_EOL (0+2)
337# define SF_FL_SHIFT_EOL (0+4)
338#else
339# define SF_FIX_SHIFT_EOL (+2)
340# define SF_FL_SHIFT_EOL (+4)
341#endif
c277df42
IZ
342
343#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
344#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
345
346#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
347#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
348#define SF_IS_INF 0x0040
349#define SF_HAS_PAR 0x0080
350#define SF_IN_PAR 0x0100
351#define SF_HAS_EVAL 0x0200
352#define SCF_DO_SUBSTR 0x0400
653099ff
GS
353#define SCF_DO_STCLASS_AND 0x0800
354#define SCF_DO_STCLASS_OR 0x1000
355#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 356#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 357
786e8c11 358#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 359#define SCF_SEEN_ACCEPT 0x8000
07be1b83 360
eb160463 361#define UTF (RExC_utf8 != 0)
bbe252da
YO
362#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
363#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 364
ffc61ed2 365#define OOB_UNICODE 12345678
93733859 366#define OOB_NAMEDCLASS -1
b8c5462f 367
a0ed51b3
LW
368#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
369#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
370
8615cb43 371
b45f050a
JF
372/* length of regex to show in messages that don't mark a position within */
373#define RegexLengthToShowInErrorMessages 127
374
375/*
376 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
377 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
378 * op/pragma/warn/regcomp.
379 */
7253e4e3
RK
380#define MARKER1 "<-- HERE" /* marker as it appears in the description */
381#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 382
7253e4e3 383#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
384
385/*
386 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
387 * arg. Show regex, up to a maximum length. If it's too long, chop and add
388 * "...".
389 */
58e23c8d 390#define _FAIL(code) STMT_START { \
bfed75c6 391 const char *ellipses = ""; \
ccb2c380
MP
392 IV len = RExC_end - RExC_precomp; \
393 \
394 if (!SIZE_ONLY) \
288b8c02 395 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
396 if (len > RegexLengthToShowInErrorMessages) { \
397 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
398 len = RegexLengthToShowInErrorMessages - 10; \
399 ellipses = "..."; \
400 } \
58e23c8d 401 code; \
ccb2c380 402} STMT_END
8615cb43 403
58e23c8d
YO
404#define FAIL(msg) _FAIL( \
405 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
406 msg, (int)len, RExC_precomp, ellipses))
407
408#define FAIL2(msg,arg) _FAIL( \
409 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
410 arg, (int)len, RExC_precomp, ellipses))
411
b45f050a 412/*
b45f050a
JF
413 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
414 */
ccb2c380 415#define Simple_vFAIL(m) STMT_START { \
a28509cc 416 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
417 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
418 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
423 */
ccb2c380
MP
424#define vFAIL(m) STMT_START { \
425 if (!SIZE_ONLY) \
288b8c02 426 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
427 Simple_vFAIL(m); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Like Simple_vFAIL(), but accepts two arguments.
432 */
ccb2c380 433#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 434 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
435 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
436 (int)offset, RExC_precomp, RExC_precomp + offset); \
437} STMT_END
b45f050a
JF
438
439/*
440 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
441 */
ccb2c380
MP
442#define vFAIL2(m,a1) STMT_START { \
443 if (!SIZE_ONLY) \
288b8c02 444 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
445 Simple_vFAIL2(m, a1); \
446} STMT_END
b45f050a
JF
447
448
449/*
450 * Like Simple_vFAIL(), but accepts three arguments.
451 */
ccb2c380 452#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 453 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
454 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
455 (int)offset, RExC_precomp, RExC_precomp + offset); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
460 */
ccb2c380
MP
461#define vFAIL3(m,a1,a2) STMT_START { \
462 if (!SIZE_ONLY) \
288b8c02 463 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
464 Simple_vFAIL3(m, a1, a2); \
465} STMT_END
b45f050a
JF
466
467/*
468 * Like Simple_vFAIL(), but accepts four arguments.
469 */
ccb2c380 470#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 471 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
472 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
473 (int)offset, RExC_precomp, RExC_precomp + offset); \
474} STMT_END
b45f050a 475
668c081a 476#define ckWARNreg(loc,m) STMT_START { \
a28509cc 477 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
478 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
479 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
480} STMT_END
481
668c081a 482#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 483 const IV offset = loc - RExC_precomp; \
d1d15184 484 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
485 m REPORT_LOCATION, \
486 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
487} STMT_END
488
668c081a 489#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 490 const IV offset = loc - RExC_precomp; \
668c081a 491 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
492 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
493} STMT_END
494
495#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 496 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
497 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
498 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
499} STMT_END
500
668c081a
NC
501#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
502 const IV offset = loc - RExC_precomp; \
503 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
504 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
505} STMT_END
506
ccb2c380 507#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 508 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
509 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
510 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
511} STMT_END
512
668c081a
NC
513#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
514 const IV offset = loc - RExC_precomp; \
515 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
516 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
517} STMT_END
518
ccb2c380 519#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 520 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
521 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
522 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
523} STMT_END
9d1d55b5 524
8615cb43 525
cd439c50 526/* Allow for side effects in s */
ccb2c380
MP
527#define REGC(c,s) STMT_START { \
528 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
529} STMT_END
cd439c50 530
fac92740
MJD
531/* Macros for recording node offsets. 20001227 mjd@plover.com
532 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
533 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
534 * Element 0 holds the number n.
07be1b83 535 * Position is 1 indexed.
fac92740 536 */
7122b237
YO
537#ifndef RE_TRACK_PATTERN_OFFSETS
538#define Set_Node_Offset_To_R(node,byte)
539#define Set_Node_Offset(node,byte)
540#define Set_Cur_Node_Offset
541#define Set_Node_Length_To_R(node,len)
542#define Set_Node_Length(node,len)
543#define Set_Node_Cur_Length(node)
544#define Node_Offset(n)
545#define Node_Length(n)
546#define Set_Node_Offset_Length(node,offset,len)
547#define ProgLen(ri) ri->u.proglen
548#define SetProgLen(ri,x) ri->u.proglen = x
549#else
550#define ProgLen(ri) ri->u.offsets[0]
551#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
552#define Set_Node_Offset_To_R(node,byte) STMT_START { \
553 if (! SIZE_ONLY) { \
554 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 555 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 556 if((node) < 0) { \
551405c4 557 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
558 } else { \
559 RExC_offsets[2*(node)-1] = (byte); \
560 } \
561 } \
562} STMT_END
563
564#define Set_Node_Offset(node,byte) \
565 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
566#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
567
568#define Set_Node_Length_To_R(node,len) STMT_START { \
569 if (! SIZE_ONLY) { \
570 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 571 __LINE__, (int)(node), (int)(len))); \
ccb2c380 572 if((node) < 0) { \
551405c4 573 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
574 } else { \
575 RExC_offsets[2*(node)] = (len); \
576 } \
577 } \
578} STMT_END
579
580#define Set_Node_Length(node,len) \
581 Set_Node_Length_To_R((node)-RExC_emit_start, len)
582#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
583#define Set_Node_Cur_Length(node) \
584 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
585
586/* Get offsets and lengths */
587#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
588#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
589
07be1b83
YO
590#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
591 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
592 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
593} STMT_END
7122b237 594#endif
07be1b83
YO
595
596#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
597#define EXPERIMENTAL_INPLACESCAN
7122b237 598#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 599
304ee84b
YO
600#define DEBUG_STUDYDATA(str,data,depth) \
601DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 602 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
603 "%*s" str "Pos:%"IVdf"/%"IVdf \
604 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
605 (int)(depth)*2, "", \
606 (IV)((data)->pos_min), \
607 (IV)((data)->pos_delta), \
304ee84b 608 (UV)((data)->flags), \
1de06328 609 (IV)((data)->whilem_c), \
304ee84b
YO
610 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
611 is_inf ? "INF " : "" \
1de06328
YO
612 ); \
613 if ((data)->last_found) \
614 PerlIO_printf(Perl_debug_log, \
615 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
616 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
617 SvPVX_const((data)->last_found), \
618 (IV)((data)->last_end), \
619 (IV)((data)->last_start_min), \
620 (IV)((data)->last_start_max), \
621 ((data)->longest && \
622 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
623 SvPVX_const((data)->longest_fixed), \
624 (IV)((data)->offset_fixed), \
625 ((data)->longest && \
626 (data)->longest==&((data)->longest_float)) ? "*" : "", \
627 SvPVX_const((data)->longest_float), \
628 (IV)((data)->offset_float_min), \
629 (IV)((data)->offset_float_max) \
630 ); \
631 PerlIO_printf(Perl_debug_log,"\n"); \
632});
633
acfe0abc 634static void clear_re(pTHX_ void *r);
4327152a 635
653099ff 636/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 637 Update the longest found anchored substring and the longest found
653099ff
GS
638 floating substrings if needed. */
639
4327152a 640STATIC void
304ee84b 641S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 642{
e1ec3a88
AL
643 const STRLEN l = CHR_SVLEN(data->last_found);
644 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 645 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 646
7918f24d
NC
647 PERL_ARGS_ASSERT_SCAN_COMMIT;
648
c277df42 649 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 650 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
651 if (*data->longest == data->longest_fixed) {
652 data->offset_fixed = l ? data->last_start_min : data->pos_min;
653 if (data->flags & SF_BEFORE_EOL)
b81d288d 654 data->flags
c277df42
IZ
655 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
656 else
657 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
658 data->minlen_fixed=minlenp;
659 data->lookbehind_fixed=0;
a0ed51b3 660 }
304ee84b 661 else { /* *data->longest == data->longest_float */
c277df42 662 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
663 data->offset_float_max = (l
664 ? data->last_start_max
c277df42 665 : data->pos_min + data->pos_delta);
304ee84b 666 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 667 data->offset_float_max = I32_MAX;
c277df42 668 if (data->flags & SF_BEFORE_EOL)
b81d288d 669 data->flags
c277df42
IZ
670 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
671 else
672 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
673 data->minlen_float=minlenp;
674 data->lookbehind_float=0;
c277df42
IZ
675 }
676 }
677 SvCUR_set(data->last_found, 0);
0eda9292 678 {
a28509cc 679 SV * const sv = data->last_found;
097eb12c
AL
680 if (SvUTF8(sv) && SvMAGICAL(sv)) {
681 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
682 if (mg)
683 mg->mg_len = 0;
684 }
0eda9292 685 }
c277df42
IZ
686 data->last_end = -1;
687 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 688 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
689}
690
653099ff
GS
691/* Can match anything (initialization) */
692STATIC void
097eb12c 693S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 694{
7918f24d
NC
695 PERL_ARGS_ASSERT_CL_ANYTHING;
696
653099ff 697 ANYOF_CLASS_ZERO(cl);
f8bef550 698 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 699 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
700 if (LOC)
701 cl->flags |= ANYOF_LOCALE;
702}
703
704/* Can match anything (initialization) */
705STATIC int
5f66b61c 706S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
707{
708 int value;
709
7918f24d
NC
710 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
711
aaa51d5e 712 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
713 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
714 return 1;
1aa99e6b
IH
715 if (!(cl->flags & ANYOF_UNICODE_ALL))
716 return 0;
10edeb5d 717 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 718 return 0;
653099ff
GS
719 return 1;
720}
721
722/* Can match anything (initialization) */
723STATIC void
097eb12c 724S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 725{
7918f24d
NC
726 PERL_ARGS_ASSERT_CL_INIT;
727
8ecf7187 728 Zero(cl, 1, struct regnode_charclass_class);
653099ff 729 cl->type = ANYOF;
830247a4 730 cl_anything(pRExC_state, cl);
653099ff
GS
731}
732
733STATIC void
097eb12c 734S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 735{
7918f24d
NC
736 PERL_ARGS_ASSERT_CL_INIT_ZERO;
737
8ecf7187 738 Zero(cl, 1, struct regnode_charclass_class);
653099ff 739 cl->type = ANYOF;
830247a4 740 cl_anything(pRExC_state, cl);
653099ff
GS
741 if (LOC)
742 cl->flags |= ANYOF_LOCALE;
743}
744
745/* 'And' a given class with another one. Can create false positives */
746/* We assume that cl is not inverted */
747STATIC void
5f66b61c 748S_cl_and(struct regnode_charclass_class *cl,
a28509cc 749 const struct regnode_charclass_class *and_with)
653099ff 750{
7918f24d 751 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
752
753 assert(and_with->type == ANYOF);
653099ff
GS
754 if (!(and_with->flags & ANYOF_CLASS)
755 && !(cl->flags & ANYOF_CLASS)
756 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
757 && !(and_with->flags & ANYOF_FOLD)
758 && !(cl->flags & ANYOF_FOLD)) {
759 int i;
760
761 if (and_with->flags & ANYOF_INVERT)
762 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
763 cl->bitmap[i] &= ~and_with->bitmap[i];
764 else
765 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
766 cl->bitmap[i] &= and_with->bitmap[i];
767 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
768 if (!(and_with->flags & ANYOF_EOS))
769 cl->flags &= ~ANYOF_EOS;
1aa99e6b 770
14ebb1a2
JH
771 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
772 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
773 cl->flags &= ~ANYOF_UNICODE_ALL;
774 cl->flags |= ANYOF_UNICODE;
775 ARG_SET(cl, ARG(and_with));
776 }
14ebb1a2
JH
777 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
778 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 779 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
780 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
781 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 782 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
783}
784
785/* 'OR' a given class with another one. Can create false positives */
786/* We assume that cl is not inverted */
787STATIC void
097eb12c 788S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 789{
7918f24d
NC
790 PERL_ARGS_ASSERT_CL_OR;
791
653099ff
GS
792 if (or_with->flags & ANYOF_INVERT) {
793 /* We do not use
794 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
795 * <= (B1 | !B2) | (CL1 | !CL2)
796 * which is wasteful if CL2 is small, but we ignore CL2:
797 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
798 * XXXX Can we handle case-fold? Unclear:
799 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
800 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
801 */
802 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
803 && !(or_with->flags & ANYOF_FOLD)
804 && !(cl->flags & ANYOF_FOLD) ) {
805 int i;
806
807 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
808 cl->bitmap[i] |= ~or_with->bitmap[i];
809 } /* XXXX: logic is complicated otherwise */
810 else {
830247a4 811 cl_anything(pRExC_state, cl);
653099ff
GS
812 }
813 } else {
814 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
815 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 816 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
817 || (cl->flags & ANYOF_FOLD)) ) {
818 int i;
819
820 /* OR char bitmap and class bitmap separately */
821 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
822 cl->bitmap[i] |= or_with->bitmap[i];
823 if (or_with->flags & ANYOF_CLASS) {
824 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
825 cl->classflags[i] |= or_with->classflags[i];
826 cl->flags |= ANYOF_CLASS;
827 }
828 }
829 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 830 cl_anything(pRExC_state, cl);
653099ff
GS
831 }
832 }
833 if (or_with->flags & ANYOF_EOS)
834 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
835
836 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
837 ARG(cl) != ARG(or_with)) {
838 cl->flags |= ANYOF_UNICODE_ALL;
839 cl->flags &= ~ANYOF_UNICODE;
840 }
841 if (or_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags |= ANYOF_UNICODE_ALL;
843 cl->flags &= ~ANYOF_UNICODE;
844 }
653099ff
GS
845}
846
a3621e74
YO
847#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
848#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
849#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
850#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
851
3dab1dad
YO
852
853#ifdef DEBUGGING
07be1b83 854/*
2b8b4781
NC
855 dump_trie(trie,widecharmap,revcharmap)
856 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
857 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
858
859 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
860 The _interim_ variants are used for debugging the interim
861 tables that are used to generate the final compressed
862 representation which is what dump_trie expects.
863
3dab1dad
YO
864 Part of the reason for their existance is to provide a form
865 of documentation as to how the different representations function.
07be1b83
YO
866
867*/
3dab1dad
YO
868
869/*
3dab1dad
YO
870 Dumps the final compressed table form of the trie to Perl_debug_log.
871 Used for debugging make_trie().
872*/
873
874STATIC void
2b8b4781
NC
875S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
876 AV *revcharmap, U32 depth)
3dab1dad
YO
877{
878 U32 state;
ab3bbdeb 879 SV *sv=sv_newmortal();
55eed653 880 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
881 GET_RE_DEBUG_FLAGS_DECL;
882
7918f24d 883 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 884
3dab1dad
YO
885 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
886 (int)depth * 2 + 2,"",
887 "Match","Base","Ofs" );
888
889 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 890 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 891 if ( tmp ) {
ab3bbdeb
YO
892 PerlIO_printf( Perl_debug_log, "%*s",
893 colwidth,
ddc5bc0f 894 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
895 PL_colors[0], PL_colors[1],
896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
897 PERL_PV_ESCAPE_FIRSTCHAR
898 )
899 );
3dab1dad
YO
900 }
901 }
902 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
903 (int)depth * 2 + 2,"");
904
905 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 906 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
907 PerlIO_printf( Perl_debug_log, "\n");
908
1e2e3d02 909 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 910 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
911
912 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
913
914 if ( trie->states[ state ].wordnum ) {
915 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
916 } else {
917 PerlIO_printf( Perl_debug_log, "%6s", "" );
918 }
919
920 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
921
922 if ( base ) {
923 U32 ofs = 0;
924
925 while( ( base + ofs < trie->uniquecharcount ) ||
926 ( base + ofs - trie->uniquecharcount < trie->lasttrans
927 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
928 ofs++;
929
930 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
931
932 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
933 if ( ( base + ofs >= trie->uniquecharcount ) &&
934 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
935 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
936 {
ab3bbdeb
YO
937 PerlIO_printf( Perl_debug_log, "%*"UVXf,
938 colwidth,
3dab1dad
YO
939 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
940 } else {
ab3bbdeb 941 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
942 }
943 }
944
945 PerlIO_printf( Perl_debug_log, "]");
946
947 }
948 PerlIO_printf( Perl_debug_log, "\n" );
949 }
950}
951/*
3dab1dad
YO
952 Dumps a fully constructed but uncompressed trie in list form.
953 List tries normally only are used for construction when the number of
954 possible chars (trie->uniquecharcount) is very high.
955 Used for debugging make_trie().
956*/
957STATIC void
55eed653 958S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
959 HV *widecharmap, AV *revcharmap, U32 next_alloc,
960 U32 depth)
3dab1dad
YO
961{
962 U32 state;
ab3bbdeb 963 SV *sv=sv_newmortal();
55eed653 964 int colwidth= widecharmap ? 6 : 4;
3dab1dad 965 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
966
967 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
968
3dab1dad 969 /* print out the table precompression. */
ab3bbdeb
YO
970 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
971 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
972 "------:-----+-----------------\n" );
3dab1dad
YO
973
974 for( state=1 ; state < next_alloc ; state ++ ) {
975 U16 charid;
976
ab3bbdeb 977 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
978 (int)depth * 2 + 2,"", (UV)state );
979 if ( ! trie->states[ state ].wordnum ) {
980 PerlIO_printf( Perl_debug_log, "%5s| ","");
981 } else {
982 PerlIO_printf( Perl_debug_log, "W%4x| ",
983 trie->states[ state ].wordnum
984 );
985 }
986 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 987 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
988 if ( tmp ) {
989 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
990 colwidth,
ddc5bc0f 991 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
992 PL_colors[0], PL_colors[1],
993 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
994 PERL_PV_ESCAPE_FIRSTCHAR
995 ) ,
1e2e3d02
YO
996 TRIE_LIST_ITEM(state,charid).forid,
997 (UV)TRIE_LIST_ITEM(state,charid).newstate
998 );
999 if (!(charid % 10))
664e119d
RGS
1000 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1001 (int)((depth * 2) + 14), "");
1e2e3d02 1002 }
ab3bbdeb
YO
1003 }
1004 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1005 }
1006}
1007
1008/*
3dab1dad
YO
1009 Dumps a fully constructed but uncompressed trie in table form.
1010 This is the normal DFA style state transition table, with a few
1011 twists to facilitate compression later.
1012 Used for debugging make_trie().
1013*/
1014STATIC void
55eed653 1015S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1016 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1017 U32 depth)
3dab1dad
YO
1018{
1019 U32 state;
1020 U16 charid;
ab3bbdeb 1021 SV *sv=sv_newmortal();
55eed653 1022 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1023 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1024
1025 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1026
1027 /*
1028 print out the table precompression so that we can do a visual check
1029 that they are identical.
1030 */
1031
1032 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1033
1034 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1035 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1036 if ( tmp ) {
ab3bbdeb
YO
1037 PerlIO_printf( Perl_debug_log, "%*s",
1038 colwidth,
ddc5bc0f 1039 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1040 PL_colors[0], PL_colors[1],
1041 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1042 PERL_PV_ESCAPE_FIRSTCHAR
1043 )
1044 );
3dab1dad
YO
1045 }
1046 }
1047
1048 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1049
1050 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1051 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1052 }
1053
1054 PerlIO_printf( Perl_debug_log, "\n" );
1055
1056 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1057
1058 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1059 (int)depth * 2 + 2,"",
1060 (UV)TRIE_NODENUM( state ) );
1061
1062 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1063 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1064 if (v)
1065 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1066 else
1067 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1068 }
1069 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1070 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1071 } else {
1072 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1073 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1074 }
1075 }
07be1b83 1076}
3dab1dad
YO
1077
1078#endif
1079
786e8c11
YO
1080/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1081 startbranch: the first branch in the whole branch sequence
1082 first : start branch of sequence of branch-exact nodes.
1083 May be the same as startbranch
1084 last : Thing following the last branch.
1085 May be the same as tail.
1086 tail : item following the branch sequence
1087 count : words in the sequence
1088 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1089 depth : indent depth
3dab1dad 1090
786e8c11 1091Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1092
786e8c11
YO
1093A trie is an N'ary tree where the branches are determined by digital
1094decomposition of the key. IE, at the root node you look up the 1st character and
1095follow that branch repeat until you find the end of the branches. Nodes can be
1096marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1097
786e8c11 1098 /he|she|his|hers/
72f13be8 1099
786e8c11
YO
1100would convert into the following structure. Numbers represent states, letters
1101following numbers represent valid transitions on the letter from that state, if
1102the number is in square brackets it represents an accepting state, otherwise it
1103will be in parenthesis.
07be1b83 1104
786e8c11
YO
1105 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1106 | |
1107 | (2)
1108 | |
1109 (1) +-i->(6)-+-s->[7]
1110 |
1111 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1112
786e8c11
YO
1113 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1114
1115This shows that when matching against the string 'hers' we will begin at state 1
1116read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1117then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1118is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1119single traverse. We store a mapping from accepting to state to which word was
1120matched, and then when we have multiple possibilities we try to complete the
1121rest of the regex in the order in which they occured in the alternation.
1122
1123The only prior NFA like behaviour that would be changed by the TRIE support is
1124the silent ignoring of duplicate alternations which are of the form:
1125
1126 / (DUPE|DUPE) X? (?{ ... }) Y /x
1127
1128Thus EVAL blocks follwing a trie may be called a different number of times with
1129and without the optimisation. With the optimisations dupes will be silently
1130ignored. This inconsistant behaviour of EVAL type nodes is well established as
1131the following demonstrates:
1132
1133 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1134
1135which prints out 'word' three times, but
1136
1137 'words'=~/(word|word|word)(?{ print $1 })S/
1138
1139which doesnt print it out at all. This is due to other optimisations kicking in.
1140
1141Example of what happens on a structural level:
1142
1143The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1144
1145 1: CURLYM[1] {1,32767}(18)
1146 5: BRANCH(8)
1147 6: EXACT <ac>(16)
1148 8: BRANCH(11)
1149 9: EXACT <ad>(16)
1150 11: BRANCH(14)
1151 12: EXACT <ab>(16)
1152 16: SUCCEED(0)
1153 17: NOTHING(18)
1154 18: END(0)
1155
1156This would be optimizable with startbranch=5, first=5, last=16, tail=16
1157and should turn into:
1158
1159 1: CURLYM[1] {1,32767}(18)
1160 5: TRIE(16)
1161 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1162 <ac>
1163 <ad>
1164 <ab>
1165 16: SUCCEED(0)
1166 17: NOTHING(18)
1167 18: END(0)
1168
1169Cases where tail != last would be like /(?foo|bar)baz/:
1170
1171 1: BRANCH(4)
1172 2: EXACT <foo>(8)
1173 4: BRANCH(7)
1174 5: EXACT <bar>(8)
1175 7: TAIL(8)
1176 8: EXACT <baz>(10)
1177 10: END(0)
1178
1179which would be optimizable with startbranch=1, first=1, last=7, tail=8
1180and would end up looking like:
1181
1182 1: TRIE(8)
1183 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1184 <foo>
1185 <bar>
1186 7: TAIL(8)
1187 8: EXACT <baz>(10)
1188 10: END(0)
1189
1190 d = uvuni_to_utf8_flags(d, uv, 0);
1191
1192is the recommended Unicode-aware way of saying
1193
1194 *(d++) = uv;
1195*/
1196
1e2e3d02 1197#define TRIE_STORE_REVCHAR \
786e8c11 1198 STMT_START { \
73031816
NC
1199 if (UTF) { \
1200 SV *zlopp = newSV(2); \
88c9ea1e
CB
1201 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1202 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1203 SvCUR_set(zlopp, kapow - flrbbbbb); \
1204 SvPOK_on(zlopp); \
1205 SvUTF8_on(zlopp); \
1206 av_push(revcharmap, zlopp); \
1207 } else { \
6bdeddd2 1208 char ooooff = (char)uvc; \
73031816
NC
1209 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1210 } \
1211 } STMT_END
786e8c11
YO
1212
1213#define TRIE_READ_CHAR STMT_START { \
1214 wordlen++; \
1215 if ( UTF ) { \
1216 if ( folder ) { \
1217 if ( foldlen > 0 ) { \
1218 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1219 foldlen -= len; \
1220 scan += len; \
1221 len = 0; \
1222 } else { \
1223 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1224 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1225 foldlen -= UNISKIP( uvc ); \
1226 scan = foldbuf + UNISKIP( uvc ); \
1227 } \
1228 } else { \
1229 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1230 } \
1231 } else { \
1232 uvc = (U32)*uc; \
1233 len = 1; \
1234 } \
1235} STMT_END
1236
1237
1238
1239#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1240 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1241 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1242 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1243 } \
1244 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1245 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1246 TRIE_LIST_CUR( state )++; \
1247} STMT_END
07be1b83 1248
786e8c11
YO
1249#define TRIE_LIST_NEW(state) STMT_START { \
1250 Newxz( trie->states[ state ].trans.list, \
1251 4, reg_trie_trans_le ); \
1252 TRIE_LIST_CUR( state ) = 1; \
1253 TRIE_LIST_LEN( state ) = 4; \
1254} STMT_END
07be1b83 1255
786e8c11
YO
1256#define TRIE_HANDLE_WORD(state) STMT_START { \
1257 U16 dupe= trie->states[ state ].wordnum; \
1258 regnode * const noper_next = regnext( noper ); \
1259 \
1260 if (trie->wordlen) \
1261 trie->wordlen[ curword ] = wordlen; \
1262 DEBUG_r({ \
1263 /* store the word for dumping */ \
1264 SV* tmp; \
1265 if (OP(noper) != NOTHING) \
740cce10 1266 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1267 else \
740cce10 1268 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1269 av_push( trie_words, tmp ); \
786e8c11
YO
1270 }); \
1271 \
1272 curword++; \
1273 \
1274 if ( noper_next < tail ) { \
1275 if (!trie->jump) \
c944940b 1276 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1277 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1278 if (!jumper) \
1279 jumper = noper_next; \
1280 if (!nextbranch) \
1281 nextbranch= regnext(cur); \
1282 } \
1283 \
1284 if ( dupe ) { \
1285 /* So it's a dupe. This means we need to maintain a */\
1286 /* linked-list from the first to the next. */\
1287 /* we only allocate the nextword buffer when there */\
1288 /* a dupe, so first time we have to do the allocation */\
1289 if (!trie->nextword) \
c944940b 1290 trie->nextword = (U16 *) \
446bd890 1291 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1292 while ( trie->nextword[dupe] ) \
1293 dupe= trie->nextword[dupe]; \
1294 trie->nextword[dupe]= curword; \
1295 } else { \
1296 /* we haven't inserted this word yet. */ \
1297 trie->states[ state ].wordnum = curword; \
1298 } \
1299} STMT_END
07be1b83 1300
3dab1dad 1301
786e8c11
YO
1302#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1303 ( ( base + charid >= ucharcount \
1304 && base + charid < ubound \
1305 && state == trie->trans[ base - ucharcount + charid ].check \
1306 && trie->trans[ base - ucharcount + charid ].next ) \
1307 ? trie->trans[ base - ucharcount + charid ].next \
1308 : ( state==1 ? special : 0 ) \
1309 )
3dab1dad 1310
786e8c11
YO
1311#define MADE_TRIE 1
1312#define MADE_JUMP_TRIE 2
1313#define MADE_EXACT_TRIE 4
3dab1dad 1314
a3621e74 1315STATIC I32
786e8c11 1316S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1317{
27da23d5 1318 dVAR;
a3621e74
YO
1319 /* first pass, loop through and scan words */
1320 reg_trie_data *trie;
55eed653 1321 HV *widecharmap = NULL;
2b8b4781 1322 AV *revcharmap = newAV();
a3621e74 1323 regnode *cur;
9f7f3913 1324 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1325 STRLEN len = 0;
1326 UV uvc = 0;
1327 U16 curword = 0;
1328 U32 next_alloc = 0;
786e8c11
YO
1329 regnode *jumper = NULL;
1330 regnode *nextbranch = NULL;
7f69552c 1331 regnode *convert = NULL;
a3621e74 1332 /* we just use folder as a flag in utf8 */
e1ec3a88 1333 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1334 ? PL_fold
1335 : ( flags == EXACTFL
1336 ? PL_fold_locale
1337 : NULL
1338 )
1339 );
1340
2b8b4781
NC
1341#ifdef DEBUGGING
1342 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1343 AV *trie_words = NULL;
1344 /* along with revcharmap, this only used during construction but both are
1345 * useful during debugging so we store them in the struct when debugging.
8e11feef 1346 */
2b8b4781
NC
1347#else
1348 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1349 STRLEN trie_charcount=0;
3dab1dad 1350#endif
2b8b4781 1351 SV *re_trie_maxbuff;
a3621e74 1352 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1353
1354 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1355#ifndef DEBUGGING
1356 PERL_UNUSED_ARG(depth);
1357#endif
a3621e74 1358
c944940b 1359 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1360 trie->refcount = 1;
3dab1dad 1361 trie->startstate = 1;
786e8c11 1362 trie->wordcount = word_count;
f8fc2ecf 1363 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1364 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1365 if (!(UTF && folder))
c944940b 1366 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1367 DEBUG_r({
2b8b4781 1368 trie_words = newAV();
a3621e74 1369 });
a3621e74 1370
0111c4fd 1371 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1372 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1373 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1374 }
3dab1dad
YO
1375 DEBUG_OPTIMISE_r({
1376 PerlIO_printf( Perl_debug_log,
786e8c11 1377 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1378 (int)depth * 2 + 2, "",
1379 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1380 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1381 (int)depth);
3dab1dad 1382 });
7f69552c
YO
1383
1384 /* Find the node we are going to overwrite */
1385 if ( first == startbranch && OP( last ) != BRANCH ) {
1386 /* whole branch chain */
1387 convert = first;
1388 } else {
1389 /* branch sub-chain */
1390 convert = NEXTOPER( first );
1391 }
1392
a3621e74
YO
1393 /* -- First loop and Setup --
1394
1395 We first traverse the branches and scan each word to determine if it
1396 contains widechars, and how many unique chars there are, this is
1397 important as we have to build a table with at least as many columns as we
1398 have unique chars.
1399
1400 We use an array of integers to represent the character codes 0..255
38a44b82 1401 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1402 native representation of the character value as the key and IV's for the
1403 coded index.
1404
1405 *TODO* If we keep track of how many times each character is used we can
1406 remap the columns so that the table compression later on is more
1407 efficient in terms of memory by ensuring most common value is in the
1408 middle and the least common are on the outside. IMO this would be better
1409 than a most to least common mapping as theres a decent chance the most
1410 common letter will share a node with the least common, meaning the node
1411 will not be compressable. With a middle is most common approach the worst
1412 case is when we have the least common nodes twice.
1413
1414 */
1415
a3621e74 1416 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1417 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1418 const U8 *uc = (U8*)STRING( noper );
a28509cc 1419 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1420 STRLEN foldlen = 0;
1421 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1422 const U8 *scan = (U8*)NULL;
07be1b83 1423 U32 wordlen = 0; /* required init */
02daf0ab
YO
1424 STRLEN chars = 0;
1425 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1426
3dab1dad
YO
1427 if (OP(noper) == NOTHING) {
1428 trie->minlen= 0;
1429 continue;
1430 }
02daf0ab
YO
1431 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1432 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1433 regardless of encoding */
1434
a3621e74 1435 for ( ; uc < e ; uc += len ) {
3dab1dad 1436 TRIE_CHARCOUNT(trie)++;
a3621e74 1437 TRIE_READ_CHAR;
3dab1dad 1438 chars++;
a3621e74
YO
1439 if ( uvc < 256 ) {
1440 if ( !trie->charmap[ uvc ] ) {
1441 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1442 if ( folder )
1443 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1444 TRIE_STORE_REVCHAR;
a3621e74 1445 }
02daf0ab
YO
1446 if ( set_bit ) {
1447 /* store the codepoint in the bitmap, and if its ascii
1448 also store its folded equivelent. */
1449 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1450
1451 /* store the folded codepoint */
1452 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1453
1454 if ( !UTF ) {
1455 /* store first byte of utf8 representation of
1456 codepoints in the 127 < uvc < 256 range */
1457 if (127 < uvc && uvc < 192) {
1458 TRIE_BITMAP_SET(trie,194);
1459 } else if (191 < uvc ) {
1460 TRIE_BITMAP_SET(trie,195);
1461 /* && uvc < 256 -- we know uvc is < 256 already */
1462 }
1463 }
02daf0ab
YO
1464 set_bit = 0; /* We've done our bit :-) */
1465 }
a3621e74
YO
1466 } else {
1467 SV** svpp;
55eed653
NC
1468 if ( !widecharmap )
1469 widecharmap = newHV();
a3621e74 1470
55eed653 1471 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1472
1473 if ( !svpp )
e4584336 1474 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1475
1476 if ( !SvTRUE( *svpp ) ) {
1477 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1478 TRIE_STORE_REVCHAR;
a3621e74
YO
1479 }
1480 }
1481 }
3dab1dad
YO
1482 if( cur == first ) {
1483 trie->minlen=chars;
1484 trie->maxlen=chars;
1485 } else if (chars < trie->minlen) {
1486 trie->minlen=chars;
1487 } else if (chars > trie->maxlen) {
1488 trie->maxlen=chars;
1489 }
1490
a3621e74
YO
1491 } /* end first pass */
1492 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1493 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1494 (int)depth * 2 + 2,"",
55eed653 1495 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1496 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1497 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1498 );
c944940b 1499 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1500
1501 /*
1502 We now know what we are dealing with in terms of unique chars and
1503 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1504 representation using a flat table will take. If it's over a reasonable
1505 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1506 conservative but potentially much slower representation using an array
1507 of lists.
1508
1509 At the end we convert both representations into the same compressed
1510 form that will be used in regexec.c for matching with. The latter
1511 is a form that cannot be used to construct with but has memory
1512 properties similar to the list form and access properties similar
1513 to the table form making it both suitable for fast searches and
1514 small enough that its feasable to store for the duration of a program.
1515
1516 See the comment in the code where the compressed table is produced
1517 inplace from the flat tabe representation for an explanation of how
1518 the compression works.
1519
1520 */
1521
1522
3dab1dad 1523 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1524 /*
1525 Second Pass -- Array Of Lists Representation
1526
1527 Each state will be represented by a list of charid:state records
1528 (reg_trie_trans_le) the first such element holds the CUR and LEN
1529 points of the allocated array. (See defines above).
1530
1531 We build the initial structure using the lists, and then convert
1532 it into the compressed table form which allows faster lookups
1533 (but cant be modified once converted).
a3621e74
YO
1534 */
1535
a3621e74
YO
1536 STRLEN transcount = 1;
1537
1e2e3d02
YO
1538 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1539 "%*sCompiling trie using list compiler\n",
1540 (int)depth * 2 + 2, ""));
446bd890 1541
c944940b
JH
1542 trie->states = (reg_trie_state *)
1543 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1544 sizeof(reg_trie_state) );
a3621e74
YO
1545 TRIE_LIST_NEW(1);
1546 next_alloc = 2;
1547
1548 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1549
c445ea15
AL
1550 regnode * const noper = NEXTOPER( cur );
1551 U8 *uc = (U8*)STRING( noper );
1552 const U8 * const e = uc + STR_LEN( noper );
1553 U32 state = 1; /* required init */
1554 U16 charid = 0; /* sanity init */
1555 U8 *scan = (U8*)NULL; /* sanity init */
1556 STRLEN foldlen = 0; /* required init */
07be1b83 1557 U32 wordlen = 0; /* required init */
c445ea15
AL
1558 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1559
3dab1dad 1560 if (OP(noper) != NOTHING) {
786e8c11 1561 for ( ; uc < e ; uc += len ) {
c445ea15 1562
786e8c11 1563 TRIE_READ_CHAR;
c445ea15 1564
786e8c11
YO
1565 if ( uvc < 256 ) {
1566 charid = trie->charmap[ uvc ];
c445ea15 1567 } else {
55eed653 1568 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1569 if ( !svpp ) {
1570 charid = 0;
1571 } else {
1572 charid=(U16)SvIV( *svpp );
1573 }
c445ea15 1574 }
786e8c11
YO
1575 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1576 if ( charid ) {
a3621e74 1577
786e8c11
YO
1578 U16 check;
1579 U32 newstate = 0;
a3621e74 1580
786e8c11
YO
1581 charid--;
1582 if ( !trie->states[ state ].trans.list ) {
1583 TRIE_LIST_NEW( state );
c445ea15 1584 }
786e8c11
YO
1585 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1586 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1587 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1588 break;
1589 }
1590 }
1591 if ( ! newstate ) {
1592 newstate = next_alloc++;
1593 TRIE_LIST_PUSH( state, charid, newstate );
1594 transcount++;
1595 }
1596 state = newstate;
1597 } else {
1598 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1599 }
a28509cc 1600 }
c445ea15 1601 }
3dab1dad 1602 TRIE_HANDLE_WORD(state);
a3621e74
YO
1603
1604 } /* end second pass */
1605
1e2e3d02
YO
1606 /* next alloc is the NEXT state to be allocated */
1607 trie->statecount = next_alloc;
c944940b
JH
1608 trie->states = (reg_trie_state *)
1609 PerlMemShared_realloc( trie->states,
1610 next_alloc
1611 * sizeof(reg_trie_state) );
a3621e74 1612
3dab1dad 1613 /* and now dump it out before we compress it */
2b8b4781
NC
1614 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1615 revcharmap, next_alloc,
1616 depth+1)
1e2e3d02 1617 );
a3621e74 1618
c944940b
JH
1619 trie->trans = (reg_trie_trans *)
1620 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1621 {
1622 U32 state;
a3621e74
YO
1623 U32 tp = 0;
1624 U32 zp = 0;
1625
1626
1627 for( state=1 ; state < next_alloc ; state ++ ) {
1628 U32 base=0;
1629
1630 /*
1631 DEBUG_TRIE_COMPILE_MORE_r(
1632 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1633 );
1634 */
1635
1636 if (trie->states[state].trans.list) {
1637 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1638 U16 maxid=minid;
a28509cc 1639 U16 idx;
a3621e74
YO
1640
1641 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1642 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1643 if ( forid < minid ) {
1644 minid=forid;
1645 } else if ( forid > maxid ) {
1646 maxid=forid;
1647 }
a3621e74
YO
1648 }
1649 if ( transcount < tp + maxid - minid + 1) {
1650 transcount *= 2;
c944940b
JH
1651 trie->trans = (reg_trie_trans *)
1652 PerlMemShared_realloc( trie->trans,
446bd890
NC
1653 transcount
1654 * sizeof(reg_trie_trans) );
a3621e74
YO
1655 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1656 }
1657 base = trie->uniquecharcount + tp - minid;
1658 if ( maxid == minid ) {
1659 U32 set = 0;
1660 for ( ; zp < tp ; zp++ ) {
1661 if ( ! trie->trans[ zp ].next ) {
1662 base = trie->uniquecharcount + zp - minid;
1663 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1664 trie->trans[ zp ].check = state;
1665 set = 1;
1666 break;
1667 }
1668 }
1669 if ( !set ) {
1670 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1671 trie->trans[ tp ].check = state;
1672 tp++;
1673 zp = tp;
1674 }
1675 } else {
1676 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1677 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1678 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1679 trie->trans[ tid ].check = state;
1680 }
1681 tp += ( maxid - minid + 1 );
1682 }
1683 Safefree(trie->states[ state ].trans.list);
1684 }
1685 /*
1686 DEBUG_TRIE_COMPILE_MORE_r(
1687 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1688 );
1689 */
1690 trie->states[ state ].trans.base=base;
1691 }
cc601c31 1692 trie->lasttrans = tp + 1;
a3621e74
YO
1693 }
1694 } else {
1695 /*
1696 Second Pass -- Flat Table Representation.
1697
1698 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1699 We know that we will need Charcount+1 trans at most to store the data
1700 (one row per char at worst case) So we preallocate both structures
1701 assuming worst case.
1702
1703 We then construct the trie using only the .next slots of the entry
1704 structs.
1705
1706 We use the .check field of the first entry of the node temporarily to
1707 make compression both faster and easier by keeping track of how many non
1708 zero fields are in the node.
1709
1710 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1711 transition.
1712
1713 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1714 number representing the first entry of the node, and state as a
1715 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1716 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1717 are 2 entrys per node. eg:
1718
1719 A B A B
1720 1. 2 4 1. 3 7
1721 2. 0 3 3. 0 5
1722 3. 0 0 5. 0 0
1723 4. 0 0 7. 0 0
1724
1725 The table is internally in the right hand, idx form. However as we also
1726 have to deal with the states array which is indexed by nodenum we have to
1727 use TRIE_NODENUM() to convert.
1728
1729 */
1e2e3d02
YO
1730 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1731 "%*sCompiling trie using table compiler\n",
1732 (int)depth * 2 + 2, ""));
3dab1dad 1733
c944940b
JH
1734 trie->trans = (reg_trie_trans *)
1735 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1736 * trie->uniquecharcount + 1,
1737 sizeof(reg_trie_trans) );
1738 trie->states = (reg_trie_state *)
1739 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1740 sizeof(reg_trie_state) );
a3621e74
YO
1741 next_alloc = trie->uniquecharcount + 1;
1742
3dab1dad 1743
a3621e74
YO
1744 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1745
c445ea15 1746 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1747 const U8 *uc = (U8*)STRING( noper );
1748 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1749
1750 U32 state = 1; /* required init */
1751
1752 U16 charid = 0; /* sanity init */
1753 U32 accept_state = 0; /* sanity init */
1754 U8 *scan = (U8*)NULL; /* sanity init */
1755
1756 STRLEN foldlen = 0; /* required init */
07be1b83 1757 U32 wordlen = 0; /* required init */
a3621e74
YO
1758 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1759
3dab1dad 1760 if ( OP(noper) != NOTHING ) {
786e8c11 1761 for ( ; uc < e ; uc += len ) {
a3621e74 1762
786e8c11 1763 TRIE_READ_CHAR;
a3621e74 1764
786e8c11
YO
1765 if ( uvc < 256 ) {
1766 charid = trie->charmap[ uvc ];
1767 } else {
55eed653 1768 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1769 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1770 }
786e8c11
YO
1771 if ( charid ) {
1772 charid--;
1773 if ( !trie->trans[ state + charid ].next ) {
1774 trie->trans[ state + charid ].next = next_alloc;
1775 trie->trans[ state ].check++;
1776 next_alloc += trie->uniquecharcount;
1777 }
1778 state = trie->trans[ state + charid ].next;
1779 } else {
1780 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1781 }
1782 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1783 }
a3621e74 1784 }
3dab1dad
YO
1785 accept_state = TRIE_NODENUM( state );
1786 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1787
1788 } /* end second pass */
1789
3dab1dad 1790 /* and now dump it out before we compress it */
2b8b4781
NC
1791 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1792 revcharmap,
1793 next_alloc, depth+1));
a3621e74 1794
a3621e74
YO
1795 {
1796 /*
1797 * Inplace compress the table.*
1798
1799 For sparse data sets the table constructed by the trie algorithm will
1800 be mostly 0/FAIL transitions or to put it another way mostly empty.
1801 (Note that leaf nodes will not contain any transitions.)
1802
1803 This algorithm compresses the tables by eliminating most such
1804 transitions, at the cost of a modest bit of extra work during lookup:
1805
1806 - Each states[] entry contains a .base field which indicates the
1807 index in the state[] array wheres its transition data is stored.
1808
1809 - If .base is 0 there are no valid transitions from that node.
1810
1811 - If .base is nonzero then charid is added to it to find an entry in
1812 the trans array.
1813
1814 -If trans[states[state].base+charid].check!=state then the
1815 transition is taken to be a 0/Fail transition. Thus if there are fail
1816 transitions at the front of the node then the .base offset will point
1817 somewhere inside the previous nodes data (or maybe even into a node
1818 even earlier), but the .check field determines if the transition is
1819 valid.
1820
786e8c11 1821 XXX - wrong maybe?
a3621e74
YO
1822 The following process inplace converts the table to the compressed
1823 table: We first do not compress the root node 1,and mark its all its
1824 .check pointers as 1 and set its .base pointer as 1 as well. This
1825 allows to do a DFA construction from the compressed table later, and
1826 ensures that any .base pointers we calculate later are greater than
1827 0.
1828
1829 - We set 'pos' to indicate the first entry of the second node.
1830
1831 - We then iterate over the columns of the node, finding the first and
1832 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1833 and set the .check pointers accordingly, and advance pos
1834 appropriately and repreat for the next node. Note that when we copy
1835 the next pointers we have to convert them from the original
1836 NODEIDX form to NODENUM form as the former is not valid post
1837 compression.
1838
1839 - If a node has no transitions used we mark its base as 0 and do not
1840 advance the pos pointer.
1841
1842 - If a node only has one transition we use a second pointer into the
1843 structure to fill in allocated fail transitions from other states.
1844 This pointer is independent of the main pointer and scans forward
1845 looking for null transitions that are allocated to a state. When it
1846 finds one it writes the single transition into the "hole". If the
786e8c11 1847 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1848
1849 - Once compressed we can Renew/realloc the structures to release the
1850 excess space.
1851
1852 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1853 specifically Fig 3.47 and the associated pseudocode.
1854
1855 demq
1856 */
a3b680e6 1857 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1858 U32 state, charid;
a3621e74 1859 U32 pos = 0, zp=0;
1e2e3d02 1860 trie->statecount = laststate;
a3621e74
YO
1861
1862 for ( state = 1 ; state < laststate ; state++ ) {
1863 U8 flag = 0;
a28509cc
AL
1864 const U32 stateidx = TRIE_NODEIDX( state );
1865 const U32 o_used = trie->trans[ stateidx ].check;
1866 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1867 trie->trans[ stateidx ].check = 0;
1868
1869 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1870 if ( flag || trie->trans[ stateidx + charid ].next ) {
1871 if ( trie->trans[ stateidx + charid ].next ) {
1872 if (o_used == 1) {
1873 for ( ; zp < pos ; zp++ ) {
1874 if ( ! trie->trans[ zp ].next ) {
1875 break;
1876 }
1877 }
1878 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1879 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1880 trie->trans[ zp ].check = state;
1881 if ( ++zp > pos ) pos = zp;
1882 break;
1883 }
1884 used--;
1885 }
1886 if ( !flag ) {
1887 flag = 1;
1888 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1889 }
1890 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1891 trie->trans[ pos ].check = state;
1892 pos++;
1893 }
1894 }
1895 }
cc601c31 1896 trie->lasttrans = pos + 1;
c944940b
JH
1897 trie->states = (reg_trie_state *)
1898 PerlMemShared_realloc( trie->states, laststate
1899 * sizeof(reg_trie_state) );
a3621e74 1900 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1901 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1902 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1903 (int)depth * 2 + 2,"",
1904 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1905 (IV)next_alloc,
1906 (IV)pos,
a3621e74
YO
1907 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1908 );
1909
1910 } /* end table compress */
1911 }
1e2e3d02
YO
1912 DEBUG_TRIE_COMPILE_MORE_r(
1913 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1914 (int)depth * 2 + 2, "",
1915 (UV)trie->statecount,
1916 (UV)trie->lasttrans)
1917 );
cc601c31 1918 /* resize the trans array to remove unused space */
c944940b
JH
1919 trie->trans = (reg_trie_trans *)
1920 PerlMemShared_realloc( trie->trans, trie->lasttrans
1921 * sizeof(reg_trie_trans) );
a3621e74 1922
3dab1dad 1923 /* and now dump out the compressed format */
2b8b4781 1924 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1925
3dab1dad 1926 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1927 U8 nodetype =(U8)(flags & 0xFF);
1928 char *str=NULL;
786e8c11 1929
07be1b83 1930#ifdef DEBUGGING
e62cc96a 1931 regnode *optimize = NULL;
7122b237
YO
1932#ifdef RE_TRACK_PATTERN_OFFSETS
1933
b57a0404
JH
1934 U32 mjd_offset = 0;
1935 U32 mjd_nodelen = 0;
7122b237
YO
1936#endif /* RE_TRACK_PATTERN_OFFSETS */
1937#endif /* DEBUGGING */
a3621e74 1938 /*
3dab1dad
YO
1939 This means we convert either the first branch or the first Exact,
1940 depending on whether the thing following (in 'last') is a branch
1941 or not and whther first is the startbranch (ie is it a sub part of
1942 the alternation or is it the whole thing.)
1943 Assuming its a sub part we conver the EXACT otherwise we convert
1944 the whole branch sequence, including the first.
a3621e74 1945 */
3dab1dad 1946 /* Find the node we are going to overwrite */
7f69552c 1947 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1948 /* branch sub-chain */
3dab1dad 1949 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1950#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1951 DEBUG_r({
1952 mjd_offset= Node_Offset((convert));
1953 mjd_nodelen= Node_Length((convert));
1954 });
7122b237 1955#endif
7f69552c 1956 /* whole branch chain */
7122b237
YO
1957 }
1958#ifdef RE_TRACK_PATTERN_OFFSETS
1959 else {
7f69552c
YO
1960 DEBUG_r({
1961 const regnode *nop = NEXTOPER( convert );
1962 mjd_offset= Node_Offset((nop));
1963 mjd_nodelen= Node_Length((nop));
1964 });
07be1b83
YO
1965 }
1966 DEBUG_OPTIMISE_r(
1967 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1968 (int)depth * 2 + 2, "",
786e8c11 1969 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1970 );
7122b237 1971#endif
3dab1dad
YO
1972 /* But first we check to see if there is a common prefix we can
1973 split out as an EXACT and put in front of the TRIE node. */
1974 trie->startstate= 1;
55eed653 1975 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1976 U32 state;
1e2e3d02 1977 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1978 U32 ofs = 0;
8e11feef
RGS
1979 I32 idx = -1;
1980 U32 count = 0;
1981 const U32 base = trie->states[ state ].trans.base;
a3621e74 1982
3dab1dad 1983 if ( trie->states[state].wordnum )
8e11feef 1984 count = 1;
a3621e74 1985
8e11feef 1986 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1987 if ( ( base + ofs >= trie->uniquecharcount ) &&
1988 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1989 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1990 {
3dab1dad 1991 if ( ++count > 1 ) {
2b8b4781 1992 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1993 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1994 if ( state == 1 ) break;
3dab1dad
YO
1995 if ( count == 2 ) {
1996 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1997 DEBUG_OPTIMISE_r(
8e11feef
RGS
1998 PerlIO_printf(Perl_debug_log,
1999 "%*sNew Start State=%"UVuf" Class: [",
2000 (int)depth * 2 + 2, "",
786e8c11 2001 (UV)state));
be8e71aa 2002 if (idx >= 0) {
2b8b4781 2003 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2004 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2005
3dab1dad 2006 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2007 if ( folder )
2008 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2009 DEBUG_OPTIMISE_r(
f1f66076 2010 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2011 );
8e11feef
RGS
2012 }
2013 }
2014 TRIE_BITMAP_SET(trie,*ch);
2015 if ( folder )
2016 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2017 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2018 }
2019 idx = ofs;
2020 }
3dab1dad
YO
2021 }
2022 if ( count == 1 ) {
2b8b4781 2023 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2024 STRLEN len;
2025 char *ch = SvPV( *tmp, len );
de734bd5
A
2026 DEBUG_OPTIMISE_r({
2027 SV *sv=sv_newmortal();
8e11feef
RGS
2028 PerlIO_printf( Perl_debug_log,
2029 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2030 (int)depth * 2 + 2, "",
de734bd5
A
2031 (UV)state, (UV)idx,
2032 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2033 PL_colors[0], PL_colors[1],
2034 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2035 PERL_PV_ESCAPE_FIRSTCHAR
2036 )
2037 );
2038 });
3dab1dad
YO
2039 if ( state==1 ) {
2040 OP( convert ) = nodetype;
2041 str=STRING(convert);
2042 STR_LEN(convert)=0;
2043 }
c490c714
YO
2044 STR_LEN(convert) += len;
2045 while (len--)
de734bd5 2046 *str++ = *ch++;
8e11feef 2047 } else {
f9049ba1 2048#ifdef DEBUGGING
8e11feef
RGS
2049 if (state>1)
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2051#endif
8e11feef
RGS
2052 break;
2053 }
2054 }
3dab1dad 2055 if (str) {
8e11feef 2056 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2057 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2058 trie->startstate = state;
07be1b83
YO
2059 trie->minlen -= (state - 1);
2060 trie->maxlen -= (state - 1);
33809eae
JH
2061#ifdef DEBUGGING
2062 /* At least the UNICOS C compiler choked on this
2063 * being argument to DEBUG_r(), so let's just have
2064 * it right here. */
2065 if (
2066#ifdef PERL_EXT_RE_BUILD
2067 1
2068#else
2069 DEBUG_r_TEST
2070#endif
2071 ) {
2072 regnode *fix = convert;
2073 U32 word = trie->wordcount;
2074 mjd_nodelen++;
2075 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2076 while( ++fix < n ) {
2077 Set_Node_Offset_Length(fix, 0, 0);
2078 }
2079 while (word--) {
2080 SV ** const tmp = av_fetch( trie_words, word, 0 );
2081 if (tmp) {
2082 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2083 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2084 else
2085 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2086 }
2087 }
2088 }
2089#endif
8e11feef
RGS
2090 if (trie->maxlen) {
2091 convert = n;
2092 } else {
3dab1dad 2093 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2094 DEBUG_r(optimize= n);
3dab1dad
YO
2095 }
2096 }
2097 }
a5ca303d
YO
2098 if (!jumper)
2099 jumper = last;
3dab1dad 2100 if ( trie->maxlen ) {
8e11feef
RGS
2101 NEXT_OFF( convert ) = (U16)(tail - convert);
2102 ARG_SET( convert, data_slot );
786e8c11
YO
2103 /* Store the offset to the first unabsorbed branch in
2104 jump[0], which is otherwise unused by the jump logic.
2105 We use this when dumping a trie and during optimisation. */
2106 if (trie->jump)
7f69552c 2107 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2108
786e8c11
YO
2109 /* XXXX */
2110 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2111 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2112 {
2113 OP( convert ) = TRIEC;
2114 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2115 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2116 trie->bitmap= NULL;
2117 } else
2118 OP( convert ) = TRIE;
a3621e74 2119
3dab1dad
YO
2120 /* store the type in the flags */
2121 convert->flags = nodetype;
a5ca303d
YO
2122 DEBUG_r({
2123 optimize = convert
2124 + NODE_STEP_REGNODE
2125 + regarglen[ OP( convert ) ];
2126 });
2127 /* XXX We really should free up the resource in trie now,
2128 as we won't use them - (which resources?) dmq */
3dab1dad 2129 }
a3621e74 2130 /* needed for dumping*/
e62cc96a 2131 DEBUG_r(if (optimize) {
07be1b83 2132 regnode *opt = convert;
bcdf7404 2133
e62cc96a 2134 while ( ++opt < optimize) {
07be1b83
YO
2135 Set_Node_Offset_Length(opt,0,0);
2136 }
786e8c11
YO
2137 /*
2138 Try to clean up some of the debris left after the
2139 optimisation.
a3621e74 2140 */
786e8c11 2141 while( optimize < jumper ) {
07be1b83 2142 mjd_nodelen += Node_Length((optimize));
a3621e74 2143 OP( optimize ) = OPTIMIZED;
07be1b83 2144 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2145 optimize++;
2146 }
07be1b83 2147 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2148 });
2149 } /* end node insert */
55eed653 2150 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2151#ifdef DEBUGGING
2152 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2153 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2154#else
2155 SvREFCNT_dec(revcharmap);
07be1b83 2156#endif
786e8c11
YO
2157 return trie->jump
2158 ? MADE_JUMP_TRIE
2159 : trie->startstate>1
2160 ? MADE_EXACT_TRIE
2161 : MADE_TRIE;
2162}
2163
2164STATIC void
2165S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2166{
2167/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2168
2169 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2170 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2171 ISBN 0-201-10088-6
2172
2173 We find the fail state for each state in the trie, this state is the longest proper
2174 suffix of the current states 'word' that is also a proper prefix of another word in our
2175 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2176 the DFA not to have to restart after its tried and failed a word at a given point, it
2177 simply continues as though it had been matching the other word in the first place.
2178 Consider
2179 'abcdgu'=~/abcdefg|cdgu/
2180 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2181 fail, which would bring use to the state representing 'd' in the second word where we would
2182 try 'g' and succeed, prodceding to match 'cdgu'.
2183 */
2184 /* add a fail transition */
3251b653
NC
2185 const U32 trie_offset = ARG(source);
2186 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2187 U32 *q;
2188 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2189 const U32 numstates = trie->statecount;
786e8c11
YO
2190 const U32 ubound = trie->lasttrans + ucharcount;
2191 U32 q_read = 0;
2192 U32 q_write = 0;
2193 U32 charid;
2194 U32 base = trie->states[ 1 ].trans.base;
2195 U32 *fail;
2196 reg_ac_data *aho;
2197 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2198 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2199
2200 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2201#ifndef DEBUGGING
2202 PERL_UNUSED_ARG(depth);
2203#endif
2204
2205
2206 ARG_SET( stclass, data_slot );
c944940b 2207 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2208 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2209 aho->trie=trie_offset;
446bd890
NC
2210 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2211 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2212 Newxz( q, numstates, U32);
c944940b 2213 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2214 aho->refcount = 1;
2215 fail = aho->fail;
2216 /* initialize fail[0..1] to be 1 so that we always have
2217 a valid final fail state */
2218 fail[ 0 ] = fail[ 1 ] = 1;
2219
2220 for ( charid = 0; charid < ucharcount ; charid++ ) {
2221 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2222 if ( newstate ) {
2223 q[ q_write ] = newstate;
2224 /* set to point at the root */
2225 fail[ q[ q_write++ ] ]=1;
2226 }
2227 }
2228 while ( q_read < q_write) {
2229 const U32 cur = q[ q_read++ % numstates ];
2230 base = trie->states[ cur ].trans.base;
2231
2232 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2233 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2234 if (ch_state) {
2235 U32 fail_state = cur;
2236 U32 fail_base;
2237 do {
2238 fail_state = fail[ fail_state ];
2239 fail_base = aho->states[ fail_state ].trans.base;
2240 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2241
2242 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2243 fail[ ch_state ] = fail_state;
2244 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2245 {
2246 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2247 }
2248 q[ q_write++ % numstates] = ch_state;
2249 }
2250 }
2251 }
2252 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2253 when we fail in state 1, this allows us to use the
2254 charclass scan to find a valid start char. This is based on the principle
2255 that theres a good chance the string being searched contains lots of stuff
2256 that cant be a start char.
2257 */
2258 fail[ 0 ] = fail[ 1 ] = 0;
2259 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2260 PerlIO_printf(Perl_debug_log,
2261 "%*sStclass Failtable (%"UVuf" states): 0",
2262 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2263 );
786e8c11
YO
2264 for( q_read=1; q_read<numstates; q_read++ ) {
2265 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2266 }
2267 PerlIO_printf(Perl_debug_log, "\n");
2268 });
2269 Safefree(q);
2270 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2271}
2272
786e8c11 2273
a3621e74 2274/*
5d1c421c
JH
2275 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2276 * These need to be revisited when a newer toolchain becomes available.
2277 */
2278#if defined(__sparc64__) && defined(__GNUC__)
2279# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2280# undef SPARC64_GCC_WORKAROUND
2281# define SPARC64_GCC_WORKAROUND 1
2282# endif
2283#endif
2284
07be1b83 2285#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2286 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2287 SV * const mysv=sv_newmortal(); \
2288 regnode *Next = regnext(scan); \
2289 regprop(RExC_rx, mysv, scan); \
7f69552c 2290 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2291 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2292 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2293 }});
07be1b83 2294
1de06328
YO
2295
2296
2297
2298
07be1b83
YO
2299#define JOIN_EXACT(scan,min,flags) \
2300 if (PL_regkind[OP(scan)] == EXACT) \
2301 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2302
be8e71aa 2303STATIC U32
07be1b83
YO
2304S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2305 /* Merge several consecutive EXACTish nodes into one. */
2306 regnode *n = regnext(scan);
2307 U32 stringok = 1;
2308 regnode *next = scan + NODE_SZ_STR(scan);
2309 U32 merged = 0;
2310 U32 stopnow = 0;
2311#ifdef DEBUGGING
2312 regnode *stop = scan;
72f13be8 2313 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2314#else
d47053eb
RGS
2315 PERL_UNUSED_ARG(depth);
2316#endif
7918f24d
NC
2317
2318 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2319#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2320 PERL_UNUSED_ARG(flags);
2321 PERL_UNUSED_ARG(val);
07be1b83 2322#endif
07be1b83
YO
2323 DEBUG_PEEP("join",scan,depth);
2324
2325 /* Skip NOTHING, merge EXACT*. */
2326 while (n &&
2327 ( PL_regkind[OP(n)] == NOTHING ||
2328 (stringok && (OP(n) == OP(scan))))
2329 && NEXT_OFF(n)
2330 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2331
2332 if (OP(n) == TAIL || n > next)
2333 stringok = 0;
2334 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2335 DEBUG_PEEP("skip:",n,depth);
2336 NEXT_OFF(scan) += NEXT_OFF(n);
2337 next = n + NODE_STEP_REGNODE;
2338#ifdef DEBUGGING
2339 if (stringok)
2340 stop = n;
2341#endif
2342 n = regnext(n);
2343 }
2344 else if (stringok) {
786e8c11 2345 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2346 regnode * const nnext = regnext(n);
2347
2348 DEBUG_PEEP("merg",n,depth);
2349
2350 merged++;
2351 if (oldl + STR_LEN(n) > U8_MAX)
2352 break;
2353 NEXT_OFF(scan) += NEXT_OFF(n);
2354 STR_LEN(scan) += STR_LEN(n);
2355 next = n + NODE_SZ_STR(n);
2356 /* Now we can overwrite *n : */
2357 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2358#ifdef DEBUGGING
2359 stop = next - 1;
2360#endif
2361 n = nnext;
2362 if (stopnow) break;
2363 }
2364
d47053eb
RGS
2365#ifdef EXPERIMENTAL_INPLACESCAN
2366 if (flags && !NEXT_OFF(n)) {
2367 DEBUG_PEEP("atch", val, depth);
2368 if (reg_off_by_arg[OP(n)]) {
2369 ARG_SET(n, val - n);
2370 }
2371 else {
2372 NEXT_OFF(n) = val - n;
2373 }
2374 stopnow = 1;
2375 }
07be1b83
YO
2376#endif
2377 }
2378
2379 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2380 /*
2381 Two problematic code points in Unicode casefolding of EXACT nodes:
2382
2383 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2384 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2385
2386 which casefold to
2387
2388 Unicode UTF-8
2389
2390 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2391 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2392
2393 This means that in case-insensitive matching (or "loose matching",
2394 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2395 length of the above casefolded versions) can match a target string
2396 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2397 This would rather mess up the minimum length computation.
2398
2399 What we'll do is to look for the tail four bytes, and then peek
2400 at the preceding two bytes to see whether we need to decrease
2401 the minimum length by four (six minus two).
2402
2403 Thanks to the design of UTF-8, there cannot be false matches:
2404 A sequence of valid UTF-8 bytes cannot be a subsequence of
2405 another valid sequence of UTF-8 bytes.
2406
2407 */
2408 char * const s0 = STRING(scan), *s, *t;
2409 char * const s1 = s0 + STR_LEN(scan) - 1;
2410 char * const s2 = s1 - 4;
e294cc5d
JH
2411#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2412 const char t0[] = "\xaf\x49\xaf\x42";
2413#else
07be1b83 2414 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2415#endif
07be1b83
YO
2416 const char * const t1 = t0 + 3;
2417
2418 for (s = s0 + 2;
2419 s < s2 && (t = ninstr(s, s1, t0, t1));
2420 s = t + 4) {
e294cc5d
JH
2421#ifdef EBCDIC
2422 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2423 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2424#else
07be1b83
YO
2425 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2426 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2427#endif
07be1b83
YO
2428 *min -= 4;
2429 }
2430 }
2431
2432#ifdef DEBUGGING
2433 /* Allow dumping */
2434 n = scan + NODE_SZ_STR(scan);
2435 while (n <= stop) {
2436 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2437 OP(n) = OPTIMIZED;
2438 NEXT_OFF(n) = 0;
2439 }
2440 n++;
2441 }
2442#endif
2443 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2444 return stopnow;
2445}
2446
653099ff
GS
2447/* REx optimizer. Converts nodes into quickier variants "in place".
2448 Finds fixed substrings. */
2449
a0288114 2450/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2451 to the position after last scanned or to NULL. */
2452
40d049e4
YO
2453#define INIT_AND_WITHP \
2454 assert(!and_withp); \
2455 Newx(and_withp,1,struct regnode_charclass_class); \
2456 SAVEFREEPV(and_withp)
07be1b83 2457
b515a41d
YO
2458/* this is a chain of data about sub patterns we are processing that
2459 need to be handled seperately/specially in study_chunk. Its so
2460 we can simulate recursion without losing state. */
2461struct scan_frame;
2462typedef struct scan_frame {
2463 regnode *last; /* last node to process in this frame */
2464 regnode *next; /* next node to process when last is reached */
2465 struct scan_frame *prev; /*previous frame*/
2466 I32 stop; /* what stopparen do we use */
2467} scan_frame;
2468
304ee84b
YO
2469
2470#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2471
e1d1eefb
YO
2472#define CASE_SYNST_FNC(nAmE) \
2473case nAmE: \
2474 if (flags & SCF_DO_STCLASS_AND) { \
2475 for (value = 0; value < 256; value++) \
2476 if (!is_ ## nAmE ## _cp(value)) \
2477 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2478 } \
2479 else { \
2480 for (value = 0; value < 256; value++) \
2481 if (is_ ## nAmE ## _cp(value)) \
2482 ANYOF_BITMAP_SET(data->start_class, value); \
2483 } \
2484 break; \
2485case N ## nAmE: \
2486 if (flags & SCF_DO_STCLASS_AND) { \
2487 for (value = 0; value < 256; value++) \
2488 if (is_ ## nAmE ## _cp(value)) \
2489 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2490 } \
2491 else { \
2492 for (value = 0; value < 256; value++) \
2493 if (!is_ ## nAmE ## _cp(value)) \
2494 ANYOF_BITMAP_SET(data->start_class, value); \
2495 } \
2496 break
2497
2498
2499
76e3520e 2500STATIC I32
40d049e4 2501S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2502 I32 *minlenp, I32 *deltap,
40d049e4
YO
2503 regnode *last,
2504 scan_data_t *data,
2505 I32 stopparen,
2506 U8* recursed,
2507 struct regnode_charclass_class *and_withp,
2508 U32 flags, U32 depth)
c277df42
IZ
2509 /* scanp: Start here (read-write). */
2510 /* deltap: Write maxlen-minlen here. */
2511 /* last: Stop before this one. */
40d049e4
YO
2512 /* data: string data about the pattern */
2513 /* stopparen: treat close N as END */
2514 /* recursed: which subroutines have we recursed into */
2515 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2516{
97aff369 2517 dVAR;
c277df42
IZ
2518 I32 min = 0, pars = 0, code;
2519 regnode *scan = *scanp, *next;
2520 I32 delta = 0;
2521 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2522 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2523 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2524 scan_data_t data_fake;
a3621e74 2525 SV *re_trie_maxbuff = NULL;
786e8c11 2526 regnode *first_non_open = scan;
e2e6a0f1 2527 I32 stopmin = I32_MAX;
8aa23a47 2528 scan_frame *frame = NULL;
a3621e74 2529 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2530
7918f24d
NC
2531 PERL_ARGS_ASSERT_STUDY_CHUNK;
2532
13a24bad 2533#ifdef DEBUGGING
40d049e4 2534 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2535#endif
40d049e4 2536
786e8c11 2537 if ( depth == 0 ) {
40d049e4 2538 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2539 first_non_open=regnext(first_non_open);
2540 }
2541
b81d288d 2542
8aa23a47
YO
2543 fake_study_recurse:
2544 while ( scan && OP(scan) != END && scan < last ){
2545 /* Peephole optimizer: */
304ee84b 2546 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2547 DEBUG_PEEP("Peep",scan,depth);
2548 JOIN_EXACT(scan,&min,0);
2549
2550 /* Follow the next-chain of the current node and optimize
2551 away all the NOTHINGs from it. */
2552 if (OP(scan) != CURLYX) {
2553 const int max = (reg_off_by_arg[OP(scan)]
2554 ? I32_MAX
2555 /* I32 may be smaller than U16 on CRAYs! */
2556 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2557 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2558 int noff;
2559 regnode *n = scan;
2560
2561 /* Skip NOTHING and LONGJMP. */
2562 while ((n = regnext(n))
2563 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2564 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2565 && off + noff < max)
2566 off += noff;
2567 if (reg_off_by_arg[OP(scan)])
2568 ARG(scan) = off;
2569 else
2570 NEXT_OFF(scan) = off;
2571 }
a3621e74 2572
c277df42 2573
8aa23a47
YO
2574
2575 /* The principal pseudo-switch. Cannot be a switch, since we
2576 look into several different things. */
2577 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2578 || OP(scan) == IFTHEN) {
2579 next = regnext(scan);
2580 code = OP(scan);
2581 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2582
2583 if (OP(next) == code || code == IFTHEN) {
2584 /* NOTE - There is similar code to this block below for handling
2585 TRIE nodes on a re-study. If you change stuff here check there
2586 too. */
2587 I32 max1 = 0, min1 = I32_MAX, num = 0;
2588 struct regnode_charclass_class accum;
2589 regnode * const startbranch=scan;
2590
2591 if (flags & SCF_DO_SUBSTR)
304ee84b 2592 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2593 if (flags & SCF_DO_STCLASS)
2594 cl_init_zero(pRExC_state, &accum);
2595
2596 while (OP(scan) == code) {
2597 I32 deltanext, minnext, f = 0, fake;
2598 struct regnode_charclass_class this_class;
2599
2600 num++;
2601 data_fake.flags = 0;
2602 if (data) {
2603 data_fake.whilem_c = data->whilem_c;
2604 data_fake.last_closep = data->last_closep;
2605 }
2606 else
2607 data_fake.last_closep = &fake;
58e23c8d
YO
2608
2609 data_fake.pos_delta = delta;
8aa23a47
YO
2610 next = regnext(scan);
2611 scan = NEXTOPER(scan);
2612 if (code != BRANCH)
c277df42 2613 scan = NEXTOPER(scan);
8aa23a47
YO
2614 if (flags & SCF_DO_STCLASS) {
2615 cl_init(pRExC_state, &this_class);
2616 data_fake.start_class = &this_class;
2617 f = SCF_DO_STCLASS_AND;
58e23c8d 2618 }
8aa23a47
YO
2619 if (flags & SCF_WHILEM_VISITED_POS)
2620 f |= SCF_WHILEM_VISITED_POS;
2621
2622 /* we suppose the run is continuous, last=next...*/
2623 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2624 next, &data_fake,
2625 stopparen, recursed, NULL, f,depth+1);
2626 if (min1 > minnext)
2627 min1 = minnext;
2628 if (max1 < minnext + deltanext)
2629 max1 = minnext + deltanext;
2630 if (deltanext == I32_MAX)
2631 is_inf = is_inf_internal = 1;
2632 scan = next;
2633 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2634 pars++;
2635 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2636 if ( stopmin > minnext)
2637 stopmin = min + min1;
2638 flags &= ~SCF_DO_SUBSTR;
2639 if (data)
2640 data->flags |= SCF_SEEN_ACCEPT;
2641 }
2642 if (data) {
2643 if (data_fake.flags & SF_HAS_EVAL)
2644 data->flags |= SF_HAS_EVAL;
2645 data->whilem_c = data_fake.whilem_c;
3dab1dad 2646 }
8aa23a47
YO
2647 if (flags & SCF_DO_STCLASS)
2648 cl_or(pRExC_state, &accum, &this_class);
2649 }
2650 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2651 min1 = 0;
2652 if (flags & SCF_DO_SUBSTR) {
2653 data->pos_min += min1;
2654 data->pos_delta += max1 - min1;
2655 if (max1 != min1 || is_inf)
2656 data->longest = &(data->longest_float);
2657 }
2658 min += min1;
2659 delta += max1 - min1;
2660 if (flags & SCF_DO_STCLASS_OR) {
2661 cl_or(pRExC_state, data->start_class, &accum);
2662 if (min1) {
2663 cl_and(data->start_class, and_withp);
2664 flags &= ~SCF_DO_STCLASS;
653099ff 2665 }
8aa23a47
YO
2666 }
2667 else if (flags & SCF_DO_STCLASS_AND) {
2668 if (min1) {
2669 cl_and(data->start_class, &accum);
2670 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2671 }
8aa23a47
YO
2672 else {
2673 /* Switch to OR mode: cache the old value of
2674 * data->start_class */
2675 INIT_AND_WITHP;
2676 StructCopy(data->start_class, and_withp,
2677 struct regnode_charclass_class);
2678 flags &= ~SCF_DO_STCLASS_AND;
2679 StructCopy(&accum, data->start_class,
2680 struct regnode_charclass_class);
2681 flags |= SCF_DO_STCLASS_OR;
2682 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2683 }
8aa23a47 2684 }
a3621e74 2685
8aa23a47
YO
2686 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2687 /* demq.
a3621e74 2688
8aa23a47
YO
2689 Assuming this was/is a branch we are dealing with: 'scan' now
2690 points at the item that follows the branch sequence, whatever
2691 it is. We now start at the beginning of the sequence and look
2692 for subsequences of
a3621e74 2693
8aa23a47
YO
2694 BRANCH->EXACT=>x1
2695 BRANCH->EXACT=>x2
2696 tail
a3621e74 2697
8aa23a47 2698 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2699
8aa23a47
YO
2700 If we can find such a subseqence we need to turn the first
2701 element into a trie and then add the subsequent branch exact
2702 strings to the trie.
a3621e74 2703
8aa23a47 2704 We have two cases
a3621e74 2705
8aa23a47 2706 1. patterns where the whole set of branch can be converted.
a3621e74 2707
8aa23a47 2708 2. patterns where only a subset can be converted.
a3621e74 2709
8aa23a47
YO
2710 In case 1 we can replace the whole set with a single regop
2711 for the trie. In case 2 we need to keep the start and end
2712 branchs so
a3621e74 2713
8aa23a47
YO
2714 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2715 becomes BRANCH TRIE; BRANCH X;
786e8c11 2716
8aa23a47
YO
2717 There is an additional case, that being where there is a
2718 common prefix, which gets split out into an EXACT like node
2719 preceding the TRIE node.
a3621e74 2720
8aa23a47
YO
2721 If x(1..n)==tail then we can do a simple trie, if not we make
2722 a "jump" trie, such that when we match the appropriate word
2723 we "jump" to the appopriate tail node. Essentailly we turn
2724 a nested if into a case structure of sorts.
b515a41d 2725
8aa23a47
YO
2726 */
2727
2728 int made=0;
2729 if (!re_trie_maxbuff) {
2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2731 if (!SvIOK(re_trie_maxbuff))
2732 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2733 }
2734 if ( SvIV(re_trie_maxbuff)>=0 ) {
2735 regnode *cur;
2736 regnode *first = (regnode *)NULL;
2737 regnode *last = (regnode *)NULL;
2738 regnode *tail = scan;
2739 U8 optype = 0;
2740 U32 count=0;
a3621e74
YO
2741
2742#ifdef DEBUGGING
8aa23a47 2743 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2744#endif
8aa23a47
YO
2745 /* var tail is used because there may be a TAIL
2746 regop in the way. Ie, the exacts will point to the
2747 thing following the TAIL, but the last branch will
2748 point at the TAIL. So we advance tail. If we
2749 have nested (?:) we may have to move through several
2750 tails.
2751 */
2752
2753 while ( OP( tail ) == TAIL ) {
2754 /* this is the TAIL generated by (?:) */
2755 tail = regnext( tail );
2756 }
a3621e74 2757
8aa23a47
YO
2758
2759 DEBUG_OPTIMISE_r({
2760 regprop(RExC_rx, mysv, tail );
2761 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2762 (int)depth * 2 + 2, "",
2763 "Looking for TRIE'able sequences. Tail node is: ",
2764 SvPV_nolen_const( mysv )
2765 );
2766 });
2767
2768 /*
2769
2770 step through the branches, cur represents each
2771 branch, noper is the first thing to be matched
2772 as part of that branch and noper_next is the
2773 regnext() of that node. if noper is an EXACT
2774 and noper_next is the same as scan (our current
2775 position in the regex) then the EXACT branch is
2776 a possible optimization target. Once we have
2777 two or more consequetive such branches we can
2778 create a trie of the EXACT's contents and stich
2779 it in place. If the sequence represents all of
2780 the branches we eliminate the whole thing and
2781 replace it with a single TRIE. If it is a
2782 subsequence then we need to stitch it in. This
2783 means the first branch has to remain, and needs
2784 to be repointed at the item on the branch chain
2785 following the last branch optimized. This could
2786 be either a BRANCH, in which case the
2787 subsequence is internal, or it could be the
2788 item following the branch sequence in which
2789 case the subsequence is at the end.
2790
2791 */
2792
2793 /* dont use tail as the end marker for this traverse */
2794 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2795 regnode * const noper = NEXTOPER( cur );
b515a41d 2796#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2797 regnode * const noper_next = regnext( noper );
b515a41d
YO
2798#endif
2799
8aa23a47
YO
2800 DEBUG_OPTIMISE_r({
2801 regprop(RExC_rx, mysv, cur);
2802 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2803 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2804
2805 regprop(RExC_rx, mysv, noper);
2806 PerlIO_printf( Perl_debug_log, " -> %s",
2807 SvPV_nolen_const(mysv));
2808
2809 if ( noper_next ) {
2810 regprop(RExC_rx, mysv, noper_next );
2811 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2812 SvPV_nolen_const(mysv));
2813 }
2814 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2815 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2816 });
2817 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2818 : PL_regkind[ OP( noper ) ] == EXACT )
2819 || OP(noper) == NOTHING )
786e8c11 2820#ifdef NOJUMPTRIE
8aa23a47 2821 && noper_next == tail
786e8c11 2822#endif
8aa23a47
YO
2823 && count < U16_MAX)
2824 {
2825 count++;
2826 if ( !first || optype == NOTHING ) {
2827 if (!first) first = cur;
2828 optype = OP( noper );
2829 } else {
2830 last = cur;
2831 }
2832 } else {
a0a388a1 2833/*
0abd0d78
YO
2834 Currently we do not believe that the trie logic can
2835 handle case insensitive matching properly when the
2836 pattern is not unicode (thus forcing unicode semantics).
2837
2838 If/when this is fixed the following define can be swapped
2839 in below to fully enable trie logic.
2840
a0a388a1 2841#define TRIE_TYPE_IS_SAFE 1
0abd0d78
YO
2842
2843*/
2844#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2845
a0a388a1 2846 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
2847 make_trie( pRExC_state,
2848 startbranch, first, cur, tail, count,
2849 optype, depth+1 );
2850 }
2851 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2852#ifdef NOJUMPTRIE
8aa23a47 2853 && noper_next == tail
786e8c11 2854#endif
8aa23a47
YO
2855 ){
2856 count = 1;
2857 first = cur;
2858 optype = OP( noper );
2859 } else {
2860 count = 0;
2861 first = NULL;
2862 optype = 0;
2863 }
2864 last = NULL;
2865 }
2866 }
2867 DEBUG_OPTIMISE_r({
2868 regprop(RExC_rx, mysv, cur);
2869 PerlIO_printf( Perl_debug_log,
2870 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2871 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2872
2873 });
a0a388a1
YO
2874
2875 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 2876 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2877#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2878 if ( ((made == MADE_EXACT_TRIE &&
2879 startbranch == first)
2880 || ( first_non_open == first )) &&
2881 depth==0 ) {
2882 flags |= SCF_TRIE_RESTUDY;
2883 if ( startbranch == first
2884 && scan == tail )
2885 {
2886 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2887 }
2888 }
3dab1dad 2889#endif
8aa23a47
YO
2890 }
2891 }
2892
2893 } /* do trie */
2894
653099ff 2895 }
8aa23a47
YO
2896 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2897 scan = NEXTOPER(NEXTOPER(scan));
2898 } else /* single branch is optimized. */
2899 scan = NEXTOPER(scan);
2900 continue;
2901 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2902 scan_frame *newframe = NULL;
2903 I32 paren;
2904 regnode *start;
2905 regnode *end;
2906
2907 if (OP(scan) != SUSPEND) {
2908 /* set the pointer */
2909 if (OP(scan) == GOSUB) {
2910 paren = ARG(scan);
2911 RExC_recurse[ARG2L(scan)] = scan;
2912 start = RExC_open_parens[paren-1];
2913 end = RExC_close_parens[paren-1];
2914 } else {
2915 paren = 0;
f8fc2ecf 2916 start = RExC_rxi->program + 1;
8aa23a47
YO
2917 end = RExC_opend;
2918 }
2919 if (!recursed) {
2920 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2921 SAVEFREEPV(recursed);
2922 }
2923 if (!PAREN_TEST(recursed,paren+1)) {
2924 PAREN_SET(recursed,paren+1);
2925 Newx(newframe,1,scan_frame);
2926 } else {
2927 if (flags & SCF_DO_SUBSTR) {
304ee84b 2928 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2929 data->longest = &(data->longest_float);
2930 }
2931 is_inf = is_inf_internal = 1;
2932 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2933 cl_anything(pRExC_state, data->start_class);
2934 flags &= ~SCF_DO_STCLASS;
2935 }
2936 } else {
2937 Newx(newframe,1,scan_frame);
2938 paren = stopparen;
2939 start = scan+2;
2940 end = regnext(scan);
2941 }
2942 if (newframe) {
2943 assert(start);
2944 assert(end);
2945 SAVEFREEPV(newframe);
2946 newframe->next = regnext(scan);
2947 newframe->last = last;
2948 newframe->stop = stopparen;
2949 newframe->prev = frame;
2950
2951 frame = newframe;
2952 scan = start;
2953 stopparen = paren;
2954 last = end;
2955
2956 continue;
2957 }
2958 }
2959 else if (OP(scan) == EXACT) {
2960 I32 l = STR_LEN(scan);
2961 UV uc;
2962 if (UTF) {
2963 const U8 * const s = (U8*)STRING(scan);
2964 l = utf8_length(s, s + l);
2965 uc = utf8_to_uvchr(s, NULL);
2966 } else {
2967 uc = *((U8*)STRING(scan));
2968 }
2969 min += l;
2970 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2971 /* The code below prefers earlier match for fixed
2972 offset, later match for variable offset. */
2973 if (data->last_end == -1) { /* Update the start info. */
2974 data->last_start_min = data->pos_min;
2975 data->last_start_max = is_inf
2976 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2977 }
8aa23a47
YO
2978 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2979 if (UTF)
2980 SvUTF8_on(data->last_found);
2981 {
2982 SV * const sv = data->last_found;
2983 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2984 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2985 if (mg && mg->mg_len >= 0)
2986 mg->mg_len += utf8_length((U8*)STRING(scan),
2987 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2988 }
8aa23a47
YO
2989 data->last_end = data->pos_min + l;
2990 data->pos_min += l; /* As in the first entry. */
2991 data->flags &= ~SF_BEFORE_EOL;
2992 }
2993 if (flags & SCF_DO_STCLASS_AND) {
2994 /* Check whether it is compatible with what we know already! */
2995 int compat = 1;
2996
2997 if (uc >= 0x100 ||
2998 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2999 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3000 && (!(data->start_class->flags & ANYOF_FOLD)
3001 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3002 )
3003 compat = 0;
3004 ANYOF_CLASS_ZERO(data->start_class);
3005 ANYOF_BITMAP_ZERO(data->start_class);
3006 if (compat)
3007 ANYOF_BITMAP_SET(data->start_class, uc);
3008 data->start_class->flags &= ~ANYOF_EOS;
3009 if (uc < 0x100)
3010 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3011 }
3012 else if (flags & SCF_DO_STCLASS_OR) {
3013 /* false positive possible if the class is case-folded */
3014 if (uc < 0x100)
3015 ANYOF_BITMAP_SET(data->start_class, uc);
3016 else
3017 data->start_class->flags |= ANYOF_UNICODE_ALL;
3018 data->start_class->flags &= ~ANYOF_EOS;
3019 cl_and(data->start_class, and_withp);
3020 }
3021 flags &= ~SCF_DO_STCLASS;
3022 }
3023 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3024 I32 l = STR_LEN(scan);
3025 UV uc = *((U8*)STRING(scan));
3026
3027 /* Search for fixed substrings supports EXACT only. */
3028 if (flags & SCF_DO_SUBSTR) {
3029 assert(data);
304ee84b 3030 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3031 }
3032 if (UTF) {
3033 const U8 * const s = (U8 *)STRING(scan);
3034 l = utf8_length(s, s + l);
3035 uc = utf8_to_uvchr(s, NULL);
3036 }
3037 min += l;
3038 if (flags & SCF_DO_SUBSTR)
3039 data->pos_min += l;
3040 if (flags & SCF_DO_STCLASS_AND) {
3041 /* Check whether it is compatible with what we know already! */
3042 int compat = 1;
3043
3044 if (uc >= 0x100 ||
3045 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3046 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3047 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
3048 compat = 0;
3049 ANYOF_CLASS_ZERO(data->start_class);
3050 ANYOF_BITMAP_ZERO(data->start_class);
3051 if (compat) {
3052 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3053 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
3054 data->start_class->flags |= ANYOF_FOLD;
3055 if (OP(scan) == EXACTFL)
3056 data->start_class->flags |= ANYOF_LOCALE;
653099ff 3057 }
8aa23a47
YO
3058 }
3059 else if (flags & SCF_DO_STCLASS_OR) {
3060 if (data->start_class->flags & ANYOF_FOLD) {
3061 /* false positive possible if the class is case-folded.
3062 Assume that the locale settings are the same... */
1aa99e6b
IH
3063 if (uc < 0x100)
3064 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
3065 data->start_class->flags &= ~ANYOF_EOS;
3066 }
8aa23a47 3067 cl_and(data->start_class, and_withp);
653099ff 3068 }
8aa23a47
YO
3069 flags &= ~SCF_DO_STCLASS;
3070 }
3071 else if (strchr((const char*)PL_varies,OP(scan))) {
3072 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3073 I32 f = flags, pos_before = 0;
3074 regnode * const oscan = scan;
3075 struct regnode_charclass_class this_class;
3076 struct regnode_charclass_class *oclass = NULL;
3077 I32 next_is_eval = 0;
3078
3079 switch (PL_regkind[OP(scan)]) {
3080 case WHILEM: /* End of (?:...)* . */
3081 scan = NEXTOPER(scan);
3082 goto finish;
3083 case PLUS:
3084 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3085 next = NEXTOPER(scan);
3086 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3087 mincount = 1;
3088 maxcount = REG_INFTY;
3089 next = regnext(scan);
3090 scan = NEXTOPER(scan);
3091 goto do_curly;
3092 }
3093 }
3094 if (flags & SCF_DO_SUBSTR)
3095 data->pos_min++;
3096 min++;
3097 /* Fall through. */
3098 case STAR:
3099 if (flags & SCF_DO_STCLASS) {
3100 mincount = 0;
3101 maxcount = REG_INFTY;
3102 next = regnext(scan);
3103 scan = NEXTOPER(scan);
3104 goto do_curly;
3105 }
3106 is_inf = is_inf_internal = 1;
3107 scan = regnext(scan);
c277df42 3108 if (flags & SCF_DO_SUBSTR) {
304ee84b 3109 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3110 data->longest = &(data->longest_float);
c277df42 3111 }
8aa23a47
YO
3112 goto optimize_curly_tail;
3113 case CURLY:
3114 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3115 && (scan->flags == stopparen))
3116 {
3117 mincount = 1;
3118 maxcount = 1;
3119 } else {
3120 mincount = ARG1(scan);
3121 maxcount = ARG2(scan);
653099ff 3122 }
8aa23a47
YO
3123 next = regnext(scan);
3124 if (OP(scan) == CURLYX) {
3125 I32 lp = (data ? *(data->last_closep) : 0);
3126 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3127 }
8aa23a47
YO
3128 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3129 next_is_eval = (OP(scan) == EVAL);
3130 do_curly:
3131 if (flags & SCF_DO_SUBSTR) {
304ee84b 3132 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3133 pos_before = data->pos_min;
b45f050a 3134 }
8aa23a47
YO
3135 if (data) {
3136 fl = data->flags;
3137 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3138 if (is_inf)
3139 data->flags |= SF_IS_INF;
3140 }
3141 if (flags & SCF_DO_STCLASS) {
3142 cl_init(pRExC_state, &this_class);
3143 oclass = data->start_class;
3144 data->start_class = &this_class;
3145 f |= SCF_DO_STCLASS_AND;
3146 f &= ~SCF_DO_STCLASS_OR;
3147 }
3148 /* These are the cases when once a subexpression
3149 fails at a particular position, it cannot succeed
3150 even after backtracking at the enclosing scope.
3151
3152 XXXX what if minimal match and we are at the
3153 initial run of {n,m}? */
3154 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3155 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3156
8aa23a47
YO
3157 /* This will finish on WHILEM, setting scan, or on NULL: */
3158 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3159 last, data, stopparen, recursed, NULL,
3160 (mincount == 0
3161 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3162
8aa23a47
YO
3163 if (flags & SCF_DO_STCLASS)
3164 data->start_class = oclass;
3165 if (mincount == 0 || minnext == 0) {
3166 if (flags & SCF_DO_STCLASS_OR) {
3167 cl_or(pRExC_state, data->start_class, &this_class);
3168 }
3169 else if (flags & SCF_DO_STCLASS_AND) {
3170 /* Switch to OR mode: cache the old value of
3171 * data->start_class */
3172 INIT_AND_WITHP;
3173 StructCopy(data->start_class, and_withp,
3174 struct regnode_charclass_class);
3175 flags &= ~SCF_DO_STCLASS_AND;
3176 StructCopy(&this_class, data->start_class,
3177 struct regnode_charclass_class);
3178 flags |= SCF_DO_STCLASS_OR;
3179 data->start_class->flags |= ANYOF_EOS;
3180 }
3181 } else { /* Non-zero len */
3182 if (flags & SCF_DO_STCLASS_OR) {
3183 cl_or(pRExC_state, data->start_class, &this_class);
3184 cl_and(data->start_class, and_withp);
3185 }
3186 else if (flags & SCF_DO_STCLASS_AND)
3187 cl_and(data->start_class, &this_class);
3188 flags &= ~SCF_DO_STCLASS;
3189 }
3190 if (!scan) /* It was not CURLYX, but CURLY. */
3191 scan = next;
3192 if ( /* ? quantifier ok, except for (?{ ... }) */
3193 (next_is_eval || !(mincount == 0 && maxcount == 1))
3194 && (minnext == 0) && (deltanext == 0)
3195 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3196 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3197 {
668c081a
NC
3198 ckWARNreg(RExC_parse,
3199 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3200 }
3201
3202 min += minnext * mincount;
3203 is_inf_internal |= ((maxcount == REG_INFTY
3204 && (minnext + deltanext) > 0)
3205 || deltanext == I32_MAX);
3206 is_inf |= is_inf_internal;
3207 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3208
3209 /* Try powerful optimization CURLYX => CURLYN. */
3210 if ( OP(oscan) == CURLYX && data
3211 && data->flags & SF_IN_PAR
3212 && !(data->flags & SF_HAS_EVAL)
3213 && !deltanext && minnext == 1 ) {
3214 /* Try to optimize to CURLYN. */
3215 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3216 regnode * const nxt1 = nxt;
497b47a8 3217#ifdef DEBUGGING
8aa23a47 3218 regnode *nxt2;
497b47a8 3219#endif
c277df42 3220
8aa23a47
YO
3221 /* Skip open. */
3222 nxt = regnext(nxt);
3223 if (!strchr((const char*)PL_simple,OP(nxt))
3224 && !(PL_regkind[OP(nxt)] == EXACT
3225 && STR_LEN(nxt) == 1))
3226 goto nogo;
497b47a8 3227#ifdef DEBUGGING
8aa23a47 3228 nxt2 = nxt;
497b47a8 3229#endif
8aa23a47
YO
3230 nxt = regnext(nxt);
3231 if (OP(nxt) != CLOSE)
3232 goto nogo;
3233 if (RExC_open_parens) {
3234 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3235 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3236 }
3237 /* Now we know that nxt2 is the only contents: */
3238 oscan->flags = (U8)ARG(nxt);
3239 OP(oscan) = CURLYN;
3240 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3241
c277df42 3242#ifdef DEBUGGING
8aa23a47
YO
3243 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3244 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3245 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3246 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3247 OP(nxt + 1) = OPTIMIZED; /* was count. */
3248 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3249#endif
8aa23a47
YO
3250 }
3251 nogo:
3252
3253 /* Try optimization CURLYX => CURLYM. */
3254 if ( OP(oscan) == CURLYX && data
3255 && !(data->flags & SF_HAS_PAR)
3256 && !(data->flags & SF_HAS_EVAL)
3257 && !deltanext /* atom is fixed width */
3258 && minnext != 0 /* CURLYM can't handle zero width */
3259 ) {
3260 /* XXXX How to optimize if data == 0? */
3261 /* Optimize to a simpler form. */
3262 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3263 regnode *nxt2;
3264
3265 OP(oscan) = CURLYM;
3266 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3267 && (OP(nxt2) != WHILEM))
3268 nxt = nxt2;
3269 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3270 /* Need to optimize away parenths. */
3271 if (data->flags & SF_IN_PAR) {
3272 /* Set the parenth number. */
3273 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3274
3275 if (OP(nxt) != CLOSE)
3276 FAIL("Panic opt close");
3277 oscan->flags = (U8)ARG(nxt);
3278 if (RExC_open_parens) {
3279 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3280 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3281 }
8aa23a47
YO
3282 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3283 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3284
c277df42 3285#ifdef DEBUGGING
8aa23a47
YO
3286 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3287 OP(nxt + 1) = OPTIMIZED; /* was count. */
3288 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3289 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3290#endif
c277df42 3291#if 0
8aa23a47
YO
3292 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3293 regnode *nnxt = regnext(nxt1);
3294
3295 if (nnxt == nxt) {
3296 if (reg_off_by_arg[OP(nxt1)])
3297 ARG_SET(nxt1, nxt2 - nxt1);
3298 else if (nxt2 - nxt1 < U16_MAX)
3299 NEXT_OFF(nxt1) = nxt2 - nxt1;
3300 else
3301 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3302 }
8aa23a47 3303 nxt1 = nnxt;
c277df42 3304 }
5d1c421c 3305#endif
8aa23a47
YO
3306 /* Optimize again: */
3307 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3308 NULL, stopparen, recursed, NULL, 0,depth+1);
3309 }
3310 else
3311 oscan->flags = 0;
3312 }
3313 else if ((OP(oscan) == CURLYX)
3314 && (flags & SCF_WHILEM_VISITED_POS)
3315 /* See the comment on a similar expression above.
3316 However, this time it not a subexpression
3317 we care about, but the expression itself. */
3318 && (maxcount == REG_INFTY)
3319 && data && ++data->whilem_c < 16) {
3320 /* This stays as CURLYX, we can put the count/of pair. */
3321 /* Find WHILEM (as in regexec.c) */
3322 regnode *nxt = oscan + NEXT_OFF(oscan);
3323
3324 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3325 nxt += ARG(nxt);
3326 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3327 | (RExC_whilem_seen << 4)); /* On WHILEM */
3328 }
3329 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3330 pars++;
3331 if (flags & SCF_DO_SUBSTR) {
3332 SV *last_str = NULL;
3333 int counted = mincount != 0;
a0ed51b3 3334
8aa23a47
YO
3335 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3336#if defined(SPARC64_GCC_WORKAROUND)
3337 I32 b = 0;
3338 STRLEN l = 0;
3339 const char *s = NULL;
3340 I32 old = 0;
b515a41d 3341
8aa23a47
YO
3342 if (pos_before >= data->last_start_min)
3343 b = pos_before;
3344 else
3345 b = data->last_start_min;
b515a41d 3346
8aa23a47
YO
3347 l = 0;
3348 s = SvPV_const(data->last_found, l);
3349 old = b - data->last_start_min;
3350
3351#else
3352 I32 b = pos_before >= data->last_start_min
3353 ? pos_before : data->last_start_min;
3354 STRLEN l;
3355 const char * const s = SvPV_const(data->last_found, l);
3356 I32 old = b - data->last_start_min;
3357#endif
3358
3359 if (UTF)
3360 old = utf8_hop((U8*)s, old) - (U8*)s;
3361
3362 l -= old;
3363 /* Get the added string: */
740cce10 3364 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3365 if (deltanext == 0 && pos_before == b) {
3366 /* What was added is a constant string */
3367 if (mincount > 1) {
3368 SvGROW(last_str, (mincount * l) + 1);
3369 repeatcpy(SvPVX(last_str) + l,
3370 SvPVX_const(last_str), l, mincount - 1);
3371 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3372 /* Add additional parts. */
3373 SvCUR_set(data->last_found,
3374 SvCUR(data->last_found) - l);
3375 sv_catsv(data->last_found, last_str);
3376 {
3377 SV * sv = data->last_found;
3378 MAGIC *mg =
3379 SvUTF8(sv) && SvMAGICAL(sv) ?
3380 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3381 if (mg && mg->mg_len >= 0)
bd94e887 3382 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3383 }
8aa23a47 3384 data->last_end += l * (mincount - 1);
b515a41d 3385 }
8aa23a47
YO
3386 } else {
3387 /* start offset must point into the last copy */
3388 data->last_start_min += minnext * (mincount - 1);
3389 data->last_start_max += is_inf ? I32_MAX
3390 : (maxcount - 1) * (minnext + data->pos_delta);
3391 }
c277df42 3392 }
8aa23a47
YO
3393 /* It is counted once already... */
3394 data->pos_min += minnext * (mincount - counted);
3395 data->pos_delta += - counted * deltanext +
3396 (minnext + deltanext) * maxcount - minnext * mincount;
3397 if (mincount != maxcount) {
3398 /* Cannot extend fixed substrings found inside
3399 the group. */
304ee84b 3400 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3401 if (mincount && last_str) {
3402 SV * const sv = data->last_found;
3403 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3404 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3405
3406 if (mg)
3407 mg->mg_len = -1;
3408 sv_setsv(sv, last_str);
3409 data->last_end = data->pos_min;
3410 data->last_start_min =
3411 data->pos_min - CHR_SVLEN(last_str);
3412 data->last_start_max = is_inf
3413 ? I32_MAX
3414 : data->pos_min + data->pos_delta
3415 - CHR_SVLEN(last_str);
3416 }
3417 data->longest = &(data->longest_float);
3418 }
3419 SvREFCNT_dec(last_str);
c277df42 3420 }
8aa23a47
YO
3421 if (data && (fl & SF_HAS_EVAL))
3422 data->flags |= SF_HAS_EVAL;
3423 optimize_curly_tail:
3424 if (OP(oscan) != CURLYX) {
3425 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3426 && NEXT_OFF(next))
3427 NEXT_OFF(oscan) += NEXT_OFF(next);
3428 }
3429 continue;
3430 default: /* REF and CLUMP only? */
3431 if (flags & SCF_DO_SUBSTR) {
304ee84b 3432 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3433 data->longest = &(data->longest_float);
3434 }
3435 is_inf = is_inf_internal = 1;
3436 if (flags & SCF_DO_STCLASS_OR)
3437 cl_anything(pRExC_state, data->start_class);
3438 flags &= ~SCF_DO_STCLASS;
3439 break;
c277df42 3440 }
8aa23a47 3441 }
e1d1eefb
YO
3442 else if (OP(scan) == LNBREAK) {
3443 if (flags & SCF_DO_STCLASS) {
3444 int value = 0;
3445 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3446 if (flags & SCF_DO_STCLASS_AND) {
3447 for (value = 0; value < 256; value++)
e64b1bd1 3448 if (!is_VERTWS_cp(value))
e1d1eefb
YO
3449 ANYOF_BITMAP_CLEAR(data->start_class, value);
3450 }
3451 else {
3452 for (value = 0; value < 256; value++)
e64b1bd1 3453 if (is_VERTWS_cp(value))
e1d1eefb
YO
3454 ANYOF_BITMAP_SET(data->start_class, value);
3455 }
3456 if (flags & SCF_DO_STCLASS_OR)
3457 cl_and(data->start_class, and_withp);
3458 flags &= ~SCF_DO_STCLASS;
3459 }
3460 min += 1;
f9a79580 3461 delta += 1;
e1d1eefb
YO
3462 if (flags & SCF_DO_SUBSTR) {
3463 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3464 data->pos_min += 1;
f9a79580 3465 data->pos_delta += 1;
e1d1eefb
YO
3466 data->longest = &(data->longest_float);
3467 }
3468
3469 }
f9a79580
RGS
3470 else if (OP(scan) == FOLDCHAR) {
3471 int d = ARG(scan)==0xDF ? 1 : 2;
3472 flags &= ~SCF_DO_STCLASS;
3473 min += 1;
3474 delta += d;
3475 if (flags & SCF_DO_SUBSTR) {
3476 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3477 data->pos_min += 1;
3478 data->pos_delta += d;
3479 data->longest = &(data->longest_float);
3480 }
3481 }
8aa23a47
YO
3482 else if (strchr((const char*)PL_simple,OP(scan))) {
3483 int value = 0;
653099ff 3484
8aa23a47 3485 if (flags & SCF_DO_SUBSTR) {
304ee84b 3486 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3487 data->pos_min++;
3488 }
3489 min++;
3490 if (flags & SCF_DO_STCLASS) {
3491 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3492
8aa23a47
YO
3493 /* Some of the logic below assumes that switching
3494 locale on will only add false positives. */
3495 switch (PL_regkind[OP(scan)]) {
3496 case SANY:
3497 default:
3498 do_default:
3499 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3500 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3501 cl_anything(pRExC_state, data->start_class);
3502 break;
3503 case REG_ANY:
3504 if (OP(scan) == SANY)
3505 goto do_default;
3506 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3507 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3508 || (data->start_class->flags & ANYOF_CLASS));
3509 cl_anything(pRExC_state, data->start_class);
653099ff 3510 }
8aa23a47
YO
3511 if (flags & SCF_DO_STCLASS_AND || !value)
3512 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3513 break;
3514 case ANYOF:
3515 if (flags & SCF_DO_STCLASS_AND)
3516 cl_and(data->start_class,
3517 (struct regnode_charclass_class*)scan);
653099ff 3518 else
8aa23a47
YO
3519 cl_or(pRExC_state, data->start_class,
3520 (struct regnode_charclass_class*)scan);
3521 break;
3522 case ALNUM:
3523 if (flags & SCF_DO_STCLASS_AND) {
3524 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3525 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3526 for (value = 0; value < 256; value++)
3527 if (!isALNUM(value))
3528 ANYOF_BITMAP_CLEAR(data->start_class, value);
3529 }
653099ff 3530 }
8aa23a47
YO
3531 else {
3532 if (data->start_class->flags & ANYOF_LOCALE)
3533 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3534 else {
3535 for (value = 0; value < 256; value++)
3536 if (isALNUM(value))
3537 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3538 }
8aa23a47
YO
3539 }
3540 break;
3541 case ALNUML:
3542 if (flags & SCF_DO_STCLASS_AND) {
3543 if (data->start_class->flags & ANYOF_LOCALE)
3544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3545 }
3546 else {
3547 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3548 data->start_class->flags |= ANYOF_LOCALE;
3549 }
3550 break;
3551 case NALNUM:
3552 if (flags & SCF_DO_STCLASS_AND) {
3553 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3554 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3555 for (value = 0; value < 256; value++)
3556 if (isALNUM(value))
3557 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3558 }
3559 }
8aa23a47
YO
3560 else {
3561 if (data->start_class->flags & ANYOF_LOCALE)
3562 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3563 else {
3564 for (value = 0; value < 256; value++)
3565 if (!isALNUM(value))
3566 ANYOF_BITMAP_SET(data->start_class, value);
3567 }
653099ff 3568 }
8aa23a47
YO
3569 break;
3570 case NALNUML:
3571 if (flags & SCF_DO_STCLASS_AND) {
3572 if (data->start_class->flags & ANYOF_LOCALE)
3573 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3574 }
8aa23a47
YO
3575 else {
3576 data->start_class->flags |= ANYOF_LOCALE;
3577 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3578 }
3579 break;
3580 case SPACE:
3581 if (flags & SCF_DO_STCLASS_AND) {
3582 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3583 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3584 for (value = 0; value < 256; value++)
3585 if (!isSPACE(value))
3586 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3587 }
3588 }
8aa23a47
YO
3589 else {
3590 if (data->start_class->flags & ANYOF_LOCALE)
3591 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3592 else {
3593 for (value = 0; value < 256; value++)
3594 if (isSPACE(value))
3595 ANYOF_BITMAP_SET(data->start_class, value);
3596 }
653099ff 3597 }
8aa23a47
YO
3598 break;
3599 case SPACEL:
3600 if (flags & SCF_DO_STCLASS_AND) {
3601 if (data->start_class->flags & ANYOF_LOCALE)
3602 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3603 }
3604 else {
3605 data->start_class->flags |= ANYOF_LOCALE;
3606 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3607 }
3608 break;
3609 case NSPACE:
3610 if (flags & SCF_DO_STCLASS_AND) {
3611 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3612 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3613 for (value = 0; value < 256; value++)
3614 if (isSPACE(value))
3615 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3616 }
8aa23a47
YO
3617 }
3618 else {
3619 if (data->start_class->flags & ANYOF_LOCALE)
3620 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3621 else {
3622 for (value = 0; value < 256; value++)
3623 if (!isSPACE(value))
3624 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3625 }
3626 }
8aa23a47
YO
3627 break;
3628 case NSPACEL:
3629 if (flags & SCF_DO_STCLASS_AND) {
3630 if (data->start_class->flags & ANYOF_LOCALE) {
3631 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3632 for (value = 0; value < 256; value++)
3633 if (!isSPACE(value))
3634 ANYOF_BITMAP_CLEAR(data->start_class, value);
3635 }
653099ff 3636 }
8aa23a47
YO
3637 else {
3638 data->start_class->flags |= ANYOF_LOCALE;
3639 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3640 }
3641 break;
3642 case DIGIT:
3643 if (flags & SCF_DO_STCLASS_AND) {
3644 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3645 for (value = 0; value < 256; value++)
3646 if (!isDIGIT(value))
3647 ANYOF_BITMAP_CLEAR(data->start_class, value);
3648 }
3649 else {
3650 if (data->start_class->flags & ANYOF_LOCALE)
3651 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3652 else {
3653 for (value = 0; value < 256; value++)
3654 if (isDIGIT(value))
3655 ANYOF_BITMAP_SET(data->start_class, value);
3656 }
3657 }
3658 break;
3659 case NDIGIT:
3660 if (flags & SCF_DO_STCLASS_AND) {
3661 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3662 for (value = 0; value < 256; value++)
3663 if (isDIGIT(value))
3664 ANYOF_BITMAP_CLEAR(data->start_class, value);
3665 }
3666 else {
3667 if (data->start_class->flags & ANYOF_LOCALE)
3668 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3669 else {
3670 for (value = 0; value < 256; value++)
3671 if (!isDIGIT(value))
3672 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3673 }
3674 }
8aa23a47 3675 break;
e1d1eefb
YO
3676 CASE_SYNST_FNC(VERTWS);
3677 CASE_SYNST_FNC(HORIZWS);
3678
8aa23a47
YO
3679 }
3680 if (flags & SCF_DO_STCLASS_OR)
3681 cl_and(data->start_class, and_withp);
3682 flags &= ~SCF_DO_STCLASS;
3683 }
3684 }
3685 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3686 data->flags |= (OP(scan) == MEOL
3687 ? SF_BEFORE_MEOL
3688 : SF_BEFORE_SEOL);
3689 }
3690 else if ( PL_regkind[OP(scan)] == BRANCHJ
3691 /* Lookbehind, or need to calculate parens/evals/stclass: */
3692 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3693 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3694 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3695 || OP(scan) == UNLESSM )
3696 {
3697 /* Negative Lookahead/lookbehind
3698 In this case we can't do fixed string optimisation.
3699 */
1de06328 3700
8aa23a47
YO
3701 I32 deltanext, minnext, fake = 0;
3702 regnode *nscan;
3703 struct regnode_charclass_class intrnl;
3704 int f = 0;
1de06328 3705
8aa23a47
YO
3706 data_fake.flags = 0;
3707 if (data) {
3708 data_fake.whilem_c = data->whilem_c;
3709 data_fake.last_closep = data->last_closep;
c277df42 3710 }
8aa23a47
YO
3711 else
3712 data_fake.last_closep = &fake;
58e23c8d 3713 data_fake.pos_delta = delta;
8aa23a47
YO
3714 if ( flags & SCF_DO_STCLASS && !scan->flags
3715 && OP(scan) == IFMATCH ) { /* Lookahead */
3716 cl_init(pRExC_state, &intrnl);
3717 data_fake.start_class = &intrnl;
3718 f |= SCF_DO_STCLASS_AND;
3719 }
3720 if (flags & SCF_WHILEM_VISITED_POS)
3721 f |= SCF_WHILEM_VISITED_POS;
3722 next = regnext(scan);
3723 nscan = NEXTOPER(NEXTOPER(scan));
3724 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3725 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3726 if (scan->flags) {
3727 if (deltanext) {
58e23c8d 3728 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3729 }
3730 else if (minnext > (I32)U8_MAX) {
58e23c8d 3731 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3732 }
3733 scan->flags = (U8)minnext;
3734 }
3735 if (data) {
3736 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3737 pars++;
3738 if (data_fake.flags & SF_HAS_EVAL)
3739 data->flags |= SF_HAS_EVAL;
3740 data->whilem_c = data_fake.whilem_c;
3741 }
3742 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
3743 if (flags & SCF_DO_STCLASS_OR) {
3744 /* OR before, AND after: ideally we would recurse with
3745 * data_fake to get the AND applied by study of the
3746 * remainder of the pattern, and then derecurse;
3747 * *** HACK *** for now just treat as "no information".
3748 * See [perl #56690].
3749 */
3750 cl_init(pRExC_state, data->start_class);
3751 } else {
3752 /* AND before and after: combine and continue */
3753 const int was = (data->start_class->flags & ANYOF_EOS);
3754
3755 cl_and(data->start_class, &intrnl);
3756 if (was)
3757 data->start_class->flags |= ANYOF_EOS;
3758 }
8aa23a47 3759 }
cb434fcc 3760 }
8aa23a47
YO
3761#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3762 else {
3763 /* Positive Lookahead/lookbehind
3764 In this case we can do fixed string optimisation,
3765 but we must be careful about it. Note in the case of
3766 lookbehind the positions will be offset by the minimum
3767 length of the pattern, something we won't know about
3768 until after the recurse.
3769 */
3770 I32 deltanext, fake = 0;
3771 regnode *nscan;
3772 struct regnode_charclass_class intrnl;
3773 int f = 0;
3774 /* We use SAVEFREEPV so that when the full compile
3775 is finished perl will clean up the allocated
3776 minlens when its all done. This was we don't
3777 have to worry about freeing them when we know
3778 they wont be used, which would be a pain.
3779 */
3780 I32 *minnextp;
3781 Newx( minnextp, 1, I32 );
3782 SAVEFREEPV(minnextp);
3783
3784 if (data) {
3785 StructCopy(data, &data_fake, scan_data_t);
3786 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3787 f |= SCF_DO_SUBSTR;
3788 if (scan->flags)
304ee84b 3789 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3790 data_fake.last_found=newSVsv(data->last_found);
3791 }
3792 }
3793 else
3794 data_fake.last_closep = &fake;
3795 data_fake.flags = 0;
58e23c8d 3796 data_fake.pos_delta = delta;
8aa23a47
YO
3797 if (is_inf)
3798 data_fake.flags |= SF_IS_INF;
3799 if ( flags & SCF_DO_STCLASS && !scan->flags
3800 && OP(scan) == IFMATCH ) { /* Lookahead */
3801 cl_init(pRExC_state, &intrnl);
3802 data_fake.start_class = &intrnl;
3803 f |= SCF_DO_STCLASS_AND;
3804 }
3805 if (flags & SCF_WHILEM_VISITED_POS)
3806 f |= SCF_WHILEM_VISITED_POS;
3807 next = regnext(scan);
3808 nscan = NEXTOPER(NEXTOPER(scan));
3809
3810 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3811 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3812 if (scan->flags) {
3813 if (deltanext) {
58e23c8d 3814 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3815 }
3816 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3817 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3818 }
3819 scan->flags = (U8)*minnextp;
3820 }
3821
3822 *minnextp += min;
3823
3824 if (f & SCF_DO_STCLASS_AND) {
3825 const int was = (data->start_class->flags & ANYOF_EOS);
3826
3827 cl_and(data->start_class, &intrnl);
3828 if (was)
3829 data->start_class->flags |= ANYOF_EOS;
3830 }
3831 if (data) {
3832 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3833 pars++;
3834 if (data_fake.flags & SF_HAS_EVAL)
3835 data->flags |= SF_HAS_EVAL;
3836 data->whilem_c = data_fake.whilem_c;
3837 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3838 if (RExC_rx->minlen<*minnextp)
3839 RExC_rx->minlen=*minnextp;
304ee84b 3840 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3841 SvREFCNT_dec(data_fake.last_found);
3842
3843 if ( data_fake.minlen_fixed != minlenp )
3844 {
3845 data->offset_fixed= data_fake.offset_fixed;
3846 data->minlen_fixed= data_fake.minlen_fixed;
3847 data->lookbehind_fixed+= scan->flags;
3848 }
3849 if ( data_fake.minlen_float != minlenp )
3850 {
3851 data->minlen_float= data_fake.minlen_float;
3852 data->offset_float_min=data_fake.offset_float_min;
3853 data->offset_float_max=data_fake.offset_float_max;
3854 data->lookbehind_float+= scan->flags;
3855 }
3856 }
3857 }
3858
3859
40d049e4 3860 }
8aa23a47
YO
3861#endif
3862 }
3863 else if (OP(scan) == OPEN) {
3864 if (stopparen != (I32)ARG(scan))
3865 pars++;
3866 }
3867 else if (OP(scan) == CLOSE) {
3868 if (stopparen == (I32)ARG(scan)) {
3869 break;
3870 }
3871 if ((I32)ARG(scan) == is_par) {
3872 next = regnext(scan);
b515a41d 3873
8aa23a47
YO
3874 if ( next && (OP(next) != WHILEM) && next < last)
3875 is_par = 0; /* Disable optimization */
40d049e4 3876 }
8aa23a47
YO
3877 if (data)
3878 *(data->last_closep) = ARG(scan);
3879 }
3880 else if (OP(scan) == EVAL) {
c277df42
IZ
3881 if (data)
3882 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3883 }
3884 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3885 if (flags & SCF_DO_SUBSTR) {
304ee84b 3886 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3887 flags &= ~SCF_DO_SUBSTR;
40d049e4 3888 }
8aa23a47
YO
3889 if (data && OP(scan)==ACCEPT) {
3890 data->flags |= SCF_SEEN_ACCEPT;
3891 if (stopmin > min)
3892 stopmin = min;
e2e6a0f1 3893 }
8aa23a47
YO
3894 }
3895 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3896 {
0f5d15d6 3897 if (flags & SCF_DO_SUBSTR) {
304ee84b 3898 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3899 data->longest = &(data->longest_float);
3900 }
3901 is_inf = is_inf_internal = 1;
653099ff 3902 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3903 cl_anything(pRExC_state, data->start_class);
96776eda 3904 flags &= ~SCF_DO_STCLASS;
8aa23a47 3905 }
58e23c8d 3906 else if (OP(scan) == GPOS) {
bbe252da 3907 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3908 !(delta || is_inf || (data && data->pos_delta)))
3909 {
bbe252da
YO
3910 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3911 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3912 if (RExC_rx->gofs < (U32)min)
3913 RExC_rx->gofs = min;
3914 } else {
bbe252da 3915 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3916 RExC_rx->gofs = 0;
3917 }
3918 }
786e8c11 3919#ifdef TRIE_STUDY_OPT
40d049e4 3920#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3921 else if (PL_regkind[OP(scan)] == TRIE) {
3922 /* NOTE - There is similar code to this block above for handling
3923 BRANCH nodes on the initial study. If you change stuff here
3924 check there too. */
3925 regnode *trie_node= scan;
3926 regnode *tail= regnext(scan);
f8fc2ecf 3927 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3928 I32 max1 = 0, min1 = I32_MAX;
3929 struct regnode_charclass_class accum;
3930
3931 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3932 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3933 if (flags & SCF_DO_STCLASS)
3934 cl_init_zero(pRExC_state, &accum);
3935
3936 if (!trie->jump) {
3937 min1= trie->minlen;
3938 max1= trie->maxlen;
3939 } else {
3940 const regnode *nextbranch= NULL;
3941 U32 word;
3942
3943 for ( word=1 ; word <= trie->wordcount ; word++)
3944 {
3945 I32 deltanext=0, minnext=0, f = 0, fake;
3946 struct regnode_charclass_class this_class;
3947
3948 data_fake.flags = 0;
3949 if (data) {
3950 data_fake.whilem_c = data->whilem_c;
3951 data_fake.last_closep = data->last_closep;
3952 }
3953 else
3954 data_fake.last_closep = &fake;
58e23c8d 3955 data_fake.pos_delta = delta;
8aa23a47
YO
3956 if (flags & SCF_DO_STCLASS) {
3957 cl_init(pRExC_state, &this_class);
3958 data_fake.start_class = &this_class;
3959 f = SCF_DO_STCLASS_AND;
3960 }
3961 if (flags & SCF_WHILEM_VISITED_POS)
3962 f |= SCF_WHILEM_VISITED_POS;
3963
3964 if (trie->jump[word]) {
3965 if (!nextbranch)
3966 nextbranch = trie_node + trie->jump[0];
3967 scan= trie_node + trie->jump[word];
3968 /* We go from the jump point to the branch that follows
3969 it. Note this means we need the vestigal unused branches
3970 even though they arent otherwise used.
3971 */
3972 minnext = study_chunk(pRExC_state, &scan, minlenp,
3973 &deltanext, (regnode *)nextbranch, &data_fake,
3974 stopparen, recursed, NULL, f,depth+1);
3975 }
3976 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3977 nextbranch= regnext((regnode*)nextbranch);
3978
3979 if (min1 > (I32)(minnext + trie->minlen))
3980 min1 = minnext + trie->minlen;
3981 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3982 max1 = minnext + deltanext + trie->maxlen;
3983 if (deltanext == I32_MAX)
3984 is_inf = is_inf_internal = 1;
3985
3986 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3987 pars++;
3988 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3989 if ( stopmin > min + min1)
3990 stopmin = min + min1;
3991 flags &= ~SCF_DO_SUBSTR;
3992 if (data)
3993 data->flags |= SCF_SEEN_ACCEPT;
3994 }
3995 if (data) {
3996 if (data_fake.flags & SF_HAS_EVAL)
3997 data->flags |= SF_HAS_EVAL;
3998 data->whilem_c = data_fake.whilem_c;
3999 }
4000 if (flags & SCF_DO_STCLASS)
4001 cl_or(pRExC_state, &accum, &this_class);
4002 }
4003 }
4004 if (flags & SCF_DO_SUBSTR) {
4005 data->pos_min += min1;
4006 data->pos_delta += max1 - min1;
4007 if (max1 != min1 || is_inf)
4008 data->longest = &(data->longest_float);
4009 }
4010 min += min1;
4011 delta += max1 - min1;
4012 if (flags & SCF_DO_STCLASS_OR) {
4013 cl_or(pRExC_state, data->start_class, &accum);
4014 if (min1) {
4015 cl_and(data->start_class, and_withp);
4016 flags &= ~SCF_DO_STCLASS;
4017 }
4018 }
4019 else if (flags & SCF_DO_STCLASS_AND) {
4020 if (min1) {
4021 cl_and(data->start_class, &accum);
4022 flags &= ~SCF_DO_STCLASS;
4023 }
4024 else {
4025 /* Switch to OR mode: cache the old value of
4026 * data->start_class */
4027 INIT_AND_WITHP;
4028 StructCopy(data->start_class, and_withp,
4029 struct regnode_charclass_class);
4030 flags &= ~SCF_DO_STCLASS_AND;
4031 StructCopy(&accum, data->start_class,
4032 struct regnode_charclass_class);
4033 flags |= SCF_DO_STCLASS_OR;
4034 data->start_class->flags |= ANYOF_EOS;
4035 }
4036 }
4037 scan= tail;
4038 continue;
4039 }
786e8c11 4040#else
8aa23a47 4041 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4042 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4043 U8*bang=NULL;
4044
4045 min += trie->minlen;
4046 delta += (trie->maxlen - trie->minlen);
4047 flags &= ~SCF_DO_STCLASS; /* xxx */
4048 if (flags & SCF_DO_SUBSTR) {
304ee84b 4049 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4050 data->pos_min += trie->minlen;
4051 data->pos_delta += (trie->maxlen - trie->minlen);
4052 if (trie->maxlen != trie->minlen)
4053 data->longest = &(data->longest_float);
4054 }
4055 if (trie->jump) /* no more substrings -- for now /grr*/
4056 flags &= ~SCF_DO_SUBSTR;
b515a41d 4057 }
8aa23a47
YO
4058#endif /* old or new */
4059#endif /* TRIE_STUDY_OPT */
e1d1eefb 4060
8aa23a47
YO
4061 /* Else: zero-length, ignore. */
4062 scan = regnext(scan);
4063 }
4064 if (frame) {
4065 last = frame->last;
4066 scan = frame->next;
4067 stopparen = frame->stop;
4068 frame = frame->prev;
4069 goto fake_study_recurse;
c277df42
IZ
4070 }
4071
4072 finish:
8aa23a47 4073 assert(!frame);
304ee84b 4074 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4075
c277df42 4076 *scanp = scan;
aca2d497 4077 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4078 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4079 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4080 if (is_par > (I32)U8_MAX)
c277df42
IZ
4081 is_par = 0;
4082 if (is_par && pars==1 && data) {
4083 data->flags |= SF_IN_PAR;
4084 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4085 }
4086 else if (pars && data) {
c277df42
IZ
4087 data->flags |= SF_HAS_PAR;
4088 data->flags &= ~SF_IN_PAR;
4089 }
653099ff 4090 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4091 cl_and(data->start_class, and_withp);
786e8c11
YO
4092 if (flags & SCF_TRIE_RESTUDY)
4093 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4094
304ee84b 4095 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4096
e2e6a0f1 4097 return min < stopmin ? min : stopmin;
c277df42
IZ
4098}
4099
2eccd3b2
NC
4100STATIC U32
4101S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4102{
4a4e7719
NC
4103 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4104
7918f24d
NC
4105 PERL_ARGS_ASSERT_ADD_DATA;
4106
4a4e7719
NC
4107 Renewc(RExC_rxi->data,
4108 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4109 char, struct reg_data);
4110 if(count)
f8fc2ecf 4111 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4112 else
f8fc2ecf 4113 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4114 RExC_rxi->data->count = count + n;
4115 Copy(s, RExC_rxi->data->what + count, n, U8);
4116 return count;
c277df42
IZ
4117}
4118
f8149455 4119/*XXX: todo make this not included in a non debugging perl */
76234dfb 4120#ifndef PERL_IN_XSUB_RE
d88dccdf 4121void
864dbfa3 4122Perl_reginitcolors(pTHX)
d88dccdf 4123{
97aff369 4124 dVAR;
1df70142 4125 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4126 if (s) {
1df70142
AL
4127 char *t = savepv(s);
4128 int i = 0;
4129 PL_colors[0] = t;
d88dccdf 4130 while (++i < 6) {
1df70142
AL
4131 t = strchr(t, '\t');
4132 if (t) {
4133 *t = '\0';
4134 PL_colors[i] = ++t;
d88dccdf
IZ
4135 }
4136 else
1df70142 4137 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4138 }
4139 } else {
1df70142 4140 int i = 0;
b81d288d 4141 while (i < 6)
06b5626a 4142 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4143 }
4144 PL_colorset = 1;
4145}
76234dfb 4146#endif
8615cb43 4147
07be1b83 4148
786e8c11
YO
4149#ifdef TRIE_STUDY_OPT
4150#define CHECK_RESTUDY_GOTO \
4151 if ( \
4152 (data.flags & SCF_TRIE_RESTUDY) \
4153 && ! restudied++ \
4154 ) goto reStudy
4155#else
4156#define CHECK_RESTUDY_GOTO
4157#endif
f9f4320a 4158
a687059c 4159/*
e50aee73 4160 - pregcomp - compile a regular expression into internal code
a687059c
LW
4161 *
4162 * We can't allocate space until we know how big the compiled form will be,
4163 * but we can't compile it (and thus know how big it is) until we've got a
4164 * place to put the code. So we cheat: we compile it twice, once with code
4165 * generation turned off and size counting turned on, and once "for real".
4166 * This also means that we don't allocate space until we are sure that the
4167 * thing really will compile successfully, and we never have to move the
4168 * code and thus invalidate pointers into it. (Note that it has to be in
4169 * one piece because free() must be able to free it all.) [NB: not true in perl]
4170 *
4171 * Beware that the optimization-preparation code in here knows about some
4172 * of the structure of the compiled regexp. [I'll say.]
4173 */
b9b4dddf
YO
4174
4175
4176
f9f4320a 4177#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4178#define RE_ENGINE_PTR &PL_core_reg_engine
4179#else
f9f4320a
YO
4180extern const struct regexp_engine my_reg_engine;
4181#define RE_ENGINE_PTR &my_reg_engine
4182#endif
6d5c990f
RGS
4183
4184#ifndef PERL_IN_XSUB_RE
3ab4a224 4185REGEXP *
1593ad57 4186Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4187{
97aff369 4188 dVAR;
6d5c990f 4189 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4190
4191 PERL_ARGS_ASSERT_PREGCOMP;
4192
f9f4320a
YO
4193 /* Dispatch a request to compile a regexp to correct
4194 regexp engine. */
f9f4320a
YO
4195 if (table) {
4196 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4197 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4198 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4199 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4200 DEBUG_COMPILE_r({
8d8756e7 4201 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4202 SvIV(*ptr));
4203 });
3ab4a224 4204 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4205 }
b9b4dddf 4206 }
3ab4a224 4207 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4208}
6d5c990f 4209#endif
2a5d9b1d 4210
3ab4a224 4211REGEXP *
1593ad57 4212Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
2a5d9b1d
RGS
4213{
4214 dVAR;
288b8c02
NC
4215 REGEXP *rx;
4216 struct regexp *r;
f8fc2ecf 4217 register regexp_internal *ri;
3ab4a224 4218 STRLEN plen;
1593ad57 4219 char *exp = SvPV(pattern, plen);
3ab4a224 4220 char* xend = exp + plen;
c277df42 4221 regnode *scan;
a0d0e21e 4222 I32 flags;
a0d0e21e
LW
4223 I32 minlen = 0;
4224 I32 sawplus = 0;
4225 I32 sawopen = 0;
2c2d71f5 4226 scan_data_t data;
830247a4 4227 RExC_state_t RExC_state;
be8e71aa 4228 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4229#ifdef TRIE_STUDY_OPT
4230 int restudied= 0;
4231 RExC_state_t copyRExC_state;
4232#endif
2a5d9b1d 4233 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4234
4235 PERL_ARGS_ASSERT_RE_COMPILE;
4236
6d5c990f 4237 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4238
b9ad30b4 4239 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
a0ed51b3 4240
a3621e74 4241 DEBUG_COMPILE_r({
ab3bbdeb
YO
4242 SV *dsv= sv_newmortal();
4243 RE_PV_QUOTED_DECL(s, RExC_utf8,
3ab4a224 4244 dsv, exp, plen, 60);
ab3bbdeb
YO
4245 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4246 PL_colors[4],PL_colors[5],s);
a5961de5 4247 });
02daf0ab
YO
4248
4249redo_first_pass:
4250 RExC_precomp = exp;
c737faaf 4251 RExC_flags = pm_flags;
830247a4 4252 RExC_sawback = 0;
bbce6d69 4253
830247a4
IZ
4254 RExC_seen = 0;
4255 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4256 RExC_seen_evals = 0;
4257 RExC_extralen = 0;
c277df42 4258
bbce6d69 4259 /* First pass: determine size, legality. */
830247a4 4260 RExC_parse = exp;
fac92740 4261 RExC_start = exp;
830247a4
IZ
4262 RExC_end = xend;
4263 RExC_naughty = 0;
4264 RExC_npar = 1;
e2e6a0f1 4265 RExC_nestroot = 0;
830247a4
IZ
4266 RExC_size = 0L;
4267 RExC_emit = &PL_regdummy;
4268 RExC_whilem_seen = 0;
40d049e4
YO
4269 RExC_open_parens = NULL;
4270 RExC_close_parens = NULL;
4271 RExC_opend = NULL;
81714fb9 4272 RExC_paren_names = NULL;
1f1031fe
YO
4273#ifdef DEBUGGING
4274 RExC_paren_name_list = NULL;
4275#endif
40d049e4
YO
4276 RExC_recurse = NULL;
4277 RExC_recurse_count = 0;
81714fb9 4278
85ddcde9
JH
4279#if 0 /* REGC() is (currently) a NOP at the first pass.
4280 * Clever compilers notice this and complain. --jhi */
830247a4 4281 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4282#endif
3dab1dad
YO
4283 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4284 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4285 RExC_precomp = NULL;
a0d0e21e
LW
4286 return(NULL);
4287 }
02daf0ab 4288 if (RExC_utf8 && !RExC_orig_utf8) {
38a44b82 4289 /* It's possible to write a regexp in ascii that represents Unicode
02daf0ab
YO
4290 codepoints outside of the byte range, such as via \x{100}. If we
4291 detect such a sequence we have to convert the entire pattern to utf8
4292 and then recompile, as our sizing calculation will have been based
4293 on 1 byte == 1 character, but we will need to use utf8 to encode
4294 at least some part of the pattern, and therefore must convert the whole
4295 thing.
4296 XXX: somehow figure out how to make this less expensive...
4297 -- dmq */
3ab4a224 4298 STRLEN len = plen;
02daf0ab
YO
4299 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4300 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4301 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
4302 xend = exp + len;
4303 RExC_orig_utf8 = RExC_utf8;
4304 SAVEFREEPV(exp);
4305 goto redo_first_pass;
4306 }
07be1b83 4307 DEBUG_PARSE_r({
81714fb9
YO
4308 PerlIO_printf(Perl_debug_log,
4309 "Required size %"IVdf" nodes\n"
4310 "Starting second pass (creation)\n",
4311 (IV)RExC_size);
07be1b83
YO
4312 RExC_lastnum=0;
4313 RExC_lastparse=NULL;
4314 });
c277df42
IZ
4315 /* Small enough for pointer-storage convention?
4316 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4317 if (RExC_size >= 0x10000L && RExC_extralen)
4318 RExC_size += RExC_extralen;
c277df42 4319 else
830247a4
IZ
4320 RExC_extralen = 0;
4321 if (RExC_whilem_seen > 15)
4322 RExC_whilem_seen = 15;
a0d0e21e 4323
f9f4320a
YO
4324 /* Allocate space and zero-initialize. Note, the two step process
4325 of zeroing when in debug mode, thus anything assigned has to
4326 happen after that */
d2f13c59 4327 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 4328 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
4329 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4330 char, regexp_internal);
4331 if ( r == NULL || ri == NULL )
b45f050a 4332 FAIL("Regexp out of space");
0f79a09d
GS
4333#ifdef DEBUGGING
4334 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4335 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4336#else
f8fc2ecf
YO
4337 /* bulk initialize base fields with 0. */
4338 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4339#endif
58e23c8d
YO
4340
4341 /* non-zero initialization begins here */
f8fc2ecf 4342 RXi_SET( r, ri );
f9f4320a 4343 r->engine= RE_ENGINE_PTR;
c737faaf 4344 r->extflags = pm_flags;
bcdf7404 4345 {
f7819f85 4346 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bcdf7404
YO
4347 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4348 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
4349 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4350 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
4351 const char *fptr = STD_PAT_MODS; /*"msix"*/
4352 char *p;
9d17798d 4353 const STRLEN wraplen = plen + has_minus + has_p + has_runon
bcdf7404
YO
4354 + (sizeof(STD_PAT_MODS) - 1)
4355 + (sizeof("(?:)") - 1);
4356
ad64d0ec 4357 p = sv_grow(MUTABLE_SV(rx), wraplen + 1);
9d17798d 4358 SvCUR_set(rx, wraplen);
f7c278bf 4359 SvPOK_on(rx);
8f6ae13c 4360 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 4361 *p++='('; *p++='?';
f7819f85
A
4362 if (has_p)
4363 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404
YO
4364 {
4365 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4366 char *colon = r + 1;
4367 char ch;
4368
4369 while((ch = *fptr++)) {
4370 if(reganch & 1)
4371 *p++ = ch;
4372 else
4373 *r-- = ch;
4374 reganch >>= 1;
4375 }
4376 if(has_minus) {
4377 *r = '-';
4378 p = colon;
4379 }
4380 }
4381
28d8d7f4 4382 *p++ = ':';
bb661a58 4383 Copy(RExC_precomp, p, plen, char);
efd26800
NC
4384 assert ((RX_WRAPPED(rx) - p) < 16);
4385 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 4386 p += plen;
bcdf7404 4387 if (has_runon)
28d8d7f4
YO
4388 *p++ = '\n';
4389 *p++ = ')';
4390 *p = 0;
bcdf7404
YO
4391 }
4392
bbe252da 4393 r->intflags = 0;
830247a4 4394 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4395
6bda09f9 4396 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4397 Newxz(RExC_open_parens, RExC_npar,regnode *);
4398 SAVEFREEPV(RExC_open_parens);
4399 Newxz(RExC_close_parens,RExC_npar,regnode *);
4400 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4401 }
4402
4403 /* Useful during FAIL. */
7122b237
YO
4404#ifdef RE_TRACK_PATTERN_OFFSETS
4405 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4406 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4407 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4408 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4409 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4410#endif
4411 SetProgLen(ri,RExC_size);
288b8c02 4412 RExC_rx_sv = rx;
830247a4 4413 RExC_rx = r;
f8fc2ecf 4414 RExC_rxi = ri;
bbce6d69 4415
4416 /* Second pass: emit code. */
c737faaf 4417 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
4418 RExC_parse = exp;
4419 RExC_end = xend;
4420 RExC_naughty = 0;
4421 RExC_npar = 1;
f8fc2ecf
YO
4422 RExC_emit_start = ri->program;
4423 RExC_emit = ri->program;
3b57cd43
YO
4424 RExC_emit_bound = ri->program + RExC_size + 1;
4425
2cd61cdb 4426 /* Store the count of eval-groups for security checks: */
f8149455 4427 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4428 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 4429 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 4430 ReREFCNT_dec(rx);
a0d0e21e 4431 return(NULL);
80757612 4432 }
07be1b83
YO
4433 /* XXXX To minimize changes to RE engine we always allocate
4434 3-units-long substrs field. */
4435 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4436 if (RExC_recurse_count) {
4437 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4438 SAVEFREEPV(RExC_recurse);
4439 }
a0d0e21e 4440
07be1b83 4441reStudy:
1de06328 4442 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4443 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4444
07be1b83 4445#ifdef TRIE_STUDY_OPT
0934c9d9
SH
4446 if (!restudied) {
4447 StructCopy(&zero_scan_data, &data, scan_data_t);
4448 copyRExC_state = RExC_state;
4449 } else {
5d458dd8 4450 U32 seen=RExC_seen;
07be1b83 4451 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4452
4453 RExC_state = copyRExC_state;
4454 if (seen & REG_TOP_LEVEL_BRANCHES)
4455 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4456 else
4457 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4458 if (data.last_found) {
07be1b83 4459 SvREFCNT_dec(data.longest_fixed);
07be1b83 4460 SvREFCNT_dec(data.longest_float);
07be1b83 4461 SvREFCNT_dec(data.last_found);
1de06328 4462 }
40d049e4 4463 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4464 }
40d049e4
YO
4465#else
4466 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4467#endif
fc8cd66c 4468
a0d0e21e 4469 /* Dig out information for optimizations. */
f7819f85 4470 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
4471 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4472
a0ed51b3 4473 if (UTF)
8f6ae13c 4474 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 4475 ri->regstclass = NULL;
830247a4 4476 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4477 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4478 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4479
1de06328
YO
4480 /* testing for BRANCH here tells us whether there is "must appear"
4481 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4482 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4483 I32 fake;
c5254dd6 4484 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4485 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4486 int stclass_flag;
07be1b83 4487 I32 last_close = 0; /* pointed to by data */
5339e136
YO
4488 regnode *first= scan;
4489 regnode *first_next= regnext(first);
4490
639081d6
YO
4491 /*
4492 * Skip introductions and multiplicators >= 1
4493 * so that we can extract the 'meat' of the pattern that must
4494 * match in the large if() sequence following.
4495 * NOTE that EXACT is NOT covered here, as it is normally
4496 * picked up by the optimiser separately.
4497 *
4498 * This is unfortunate as the optimiser isnt handling lookahead
4499 * properly currently.
4500 *
4501 */
a0d0e21e 4502 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4503 /* An OR of *one* alternative - should not happen now. */
5339e136 4504 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83
YO
4505 /* for now we can't handle lookbehind IFMATCH*/
4506 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4507 (OP(first) == PLUS) ||
4508 (OP(first) == MINMOD) ||
653099ff 4509 /* An {n,m} with n>0 */
5339e136
YO
4510 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4511 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 4512 {
639081d6
YO
4513 /*
4514 * the only op that could be a regnode is PLUS, all the rest
4515 * will be regnode_1 or regnode_2.
4516 *
4517 */
a0d0e21e
LW
4518 if (OP(first) == PLUS)
4519 sawplus = 1;
4520 else
3dab1dad 4521 first += regarglen[OP(first)];
639081d6
YO
4522
4523 first = NEXTOPER(first);
5339e136 4524 first_next= regnext(first);
a687059c
LW
4525 }
4526
a0d0e21e
LW
4527 /* Starting-point info. */
4528 again:
786e8c11 4529 DEBUG_PEEP("first:",first,0);
07be1b83 4530 /* Ignore EXACT as we deal with it later. */
3dab1dad 4531 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4532 if (OP(first) == EXACT)
6f207bd3 4533 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4534 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4535 ri->regstclass = first;
b3c9acc1 4536 }
07be1b83 4537#ifdef TRIE_STCLASS
786e8c11 4538 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4539 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4540 {
786e8c11 4541 regnode *trie_op;
07be1b83 4542 /* this can happen only on restudy */
786e8c11 4543 if ( OP(first) == TRIE ) {
c944940b 4544 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4545 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4546 StructCopy(first,trieop,struct regnode_1);
4547 trie_op=(regnode *)trieop;
4548 } else {
c944940b 4549 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4550 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4551 StructCopy(first,trieop,struct regnode_charclass);
4552 trie_op=(regnode *)trieop;
4553 }
1de06328 4554 OP(trie_op)+=2;
786e8c11 4555 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4556 ri->regstclass = trie_op;
07be1b83
YO
4557 }
4558#endif
bfed75c6 4559 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4560 ri->regstclass = first;
3dab1dad
YO
4561 else if (PL_regkind[OP(first)] == BOUND ||
4562 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4563 ri->regstclass = first;
3dab1dad 4564 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4565 r->extflags |= (OP(first) == MBOL
4566 ? RXf_ANCH_MBOL
cad2e5aa 4567 : (OP(first) == SBOL
bbe252da
YO
4568 ? RXf_ANCH_SBOL
4569 : RXf_ANCH_BOL));
a0d0e21e 4570 first = NEXTOPER(first);
774d564b 4571 goto again;
4572 }
4573 else if (OP(first) == GPOS) {
bbe252da 4574 r->extflags |= RXf_ANCH_GPOS;
774d564b 4575 first = NEXTOPER(first);
4576 goto again;
a0d0e21e 4577 }
cf2a2b69
YO
4578 else if ((!sawopen || !RExC_sawback) &&
4579 (OP(first) == STAR &&
3dab1dad 4580 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4581 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4582 {
4583 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4584 const int type =
4585 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4586 ? RXf_ANCH_MBOL
4587 : RXf_ANCH_SBOL;
4588 r->extflags |= type;
4589 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4590 first = NEXTOPER(first);
774d564b 4591 goto again;
a0d0e21e 4592 }
b81d288d 4593 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4594 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4595 /* x+ must match at the 1st pos of run of x's */
bbe252da 4596 r->intflags |= PREGf_SKIP;
a0d0e21e 4597
c277df42 4598 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4599#ifdef TRIE_STUDY_OPT
81714fb9 4600 DEBUG_PARSE_r(
be8e71aa
YO
4601 if (!restudied)
4602 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4603 (IV)(first - scan + 1))
4604 );
4605#else
81714fb9 4606 DEBUG_PARSE_r(
be8e71aa
YO
4607 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4608 (IV)(first - scan + 1))
4609 );
4610#endif
4611
4612
a0d0e21e
LW
4613 /*
4614 * If there's something expensive in the r.e., find the
4615 * longest literal string that must appear and make it the
4616 * regmust. Resolve ties in favor of later strings, since
4617 * the regstart check works with the beginning of the r.e.
4618 * and avoiding duplication strengthens checking. Not a
4619 * strong reason, but sufficient in the absence of others.
4620 * [Now we resolve ties in favor of the earlier string if
c277df42 4621 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4622 * earlier string may buy us something the later one won't.]
4623 */
de8c5301 4624
396482e1
GA
4625 data.longest_fixed = newSVpvs("");
4626 data.longest_float = newSVpvs("");
4627 data.last_found = newSVpvs("");
c277df42
IZ
4628 data.longest = &(data.longest_fixed);
4629 first = scan;
f8fc2ecf 4630 if (!ri->regstclass) {
830247a4 4631 cl_init(pRExC_state, &ch_class);
653099ff
GS
4632 data.start_class = &ch_class;
4633 stclass_flag = SCF_DO_STCLASS_AND;
4634 } else /* XXXX Check for BOUND? */
4635 stclass_flag = 0;
cb434fcc 4636 data.last_closep = &last_close;
de8c5301 4637
1de06328 4638 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4639 &data, -1, NULL, NULL,
4640 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4641
07be1b83 4642
786e8c11
YO
4643 CHECK_RESTUDY_GOTO;
4644
4645
830247a4 4646 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4647 && data.last_start_min == 0 && data.last_end > 0
830247a4 4648 && !RExC_seen_zerolen
2bf803e2 4649 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4650 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4651 r->extflags |= RXf_CHECK_ALL;
304ee84b 4652 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4653 SvREFCNT_dec(data.last_found);
4654
1de06328
YO
4655 /* Note that code very similar to this but for anchored string
4656 follows immediately below, changes may need to be made to both.
4657 Be careful.
4658 */
a0ed51b3 4659 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4660 if (longest_float_length
c277df42
IZ
4661 || (data.flags & SF_FL_BEFORE_EOL
4662 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4663 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4664 {
1182767e 4665 I32 t,ml;
cf93c79d 4666
1de06328 4667 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4668 && data.offset_fixed == data.offset_float_min
4669 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4670 goto remove_float; /* As in (a)+. */
4671
1de06328
YO
4672 /* copy the information about the longest float from the reg_scan_data
4673 over to the program. */
33b8afdf
JH
4674 if (SvUTF8(data.longest_float)) {
4675 r->float_utf8 = data.longest_float;
c445ea15 4676 r->float_substr = NULL;
33b8afdf
JH
4677 } else {
4678 r->float_substr = data.longest_float;
c445ea15 4679 r->float_utf8 = NULL;
33b8afdf 4680 }
1de06328
YO
4681 /* float_end_shift is how many chars that must be matched that
4682 follow this item. We calculate it ahead of time as once the
4683 lookbehind offset is added in we lose the ability to correctly
4684 calculate it.*/
4685 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4686 : (I32)longest_float_length;
1de06328
YO
4687 r->float_end_shift = ml - data.offset_float_min
4688 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4689 + data.lookbehind_float;
4690 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4691 r->float_max_offset = data.offset_float_max;
1182767e 4692 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4693 r->float_max_offset -= data.lookbehind_float;
4694
cf93c79d
IZ
4695 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4696 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4697 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4698 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4699 }
4700 else {
aca2d497 4701 remove_float:
c445ea15 4702 r->float_substr = r->float_utf8 = NULL;
c277df42 4703 SvREFCNT_dec(data.longest_float);
c5254dd6 4704 longest_float_length = 0;
a0d0e21e 4705 }
c277df42 4706
1de06328
YO
4707 /* Note that code very similar to this but for floating string
4708 is immediately above, changes may need to be made to both.
4709 Be careful.
4710 */
a0ed51b3 4711 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4712 if (longest_fixed_length
c277df42
IZ
4713 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4714 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4715 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4716 {
1182767e 4717 I32 t,ml;
cf93c79d 4718
1de06328
YO
4719 /* copy the information about the longest fixed
4720 from the reg_scan_data over to the program. */
33b8afdf
JH
4721 if (SvUTF8(data.longest_fixed)) {
4722 r->anchored_utf8 = data.longest_fixed;
c445ea15 4723 r->anchored_substr = NULL;
33b8afdf
JH
4724 } else {
4725 r->anchored_substr = data.longest_fixed;
c445ea15 4726 r->anchored_utf8 = NULL;
33b8afdf 4727 }
1de06328
YO
4728 /* fixed_end_shift is how many chars that must be matched that
4729 follow this item. We calculate it ahead of time as once the
4730 lookbehind offset is added in we lose the ability to correctly
4731 calculate it.*/
4732 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4733 : (I32)longest_fixed_length;
1de06328
YO
4734 r->anchored_end_shift = ml - data.offset_fixed
4735 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4736 + data.lookbehind_fixed;
4737 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4738
cf93c79d
IZ
4739 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4740 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4741 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4742 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4743 }
4744 else {
c445ea15 4745 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4746 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4747 longest_fixed_length = 0;
a0d0e21e 4748 }
f8fc2ecf
YO
4749 if (ri->regstclass
4750 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4751 ri->regstclass = NULL;
33b8afdf
JH
4752 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4753 && stclass_flag
653099ff 4754 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4755 && !cl_is_anything(data.start_class))
4756 {
2eccd3b2 4757 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4758
f8fc2ecf 4759 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4760 struct regnode_charclass_class);
4761 StructCopy(data.start_class,
f8fc2ecf 4762 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4763 struct regnode_charclass_class);
f8fc2ecf 4764 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4765 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4766 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4767 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4768 PerlIO_printf(Perl_debug_log,
a0288114 4769 "synthetic stclass \"%s\".\n",
3f7c398e 4770 SvPVX_const(sv));});
653099ff 4771 }
c277df42
IZ
4772
4773 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4774 if (longest_fixed_length > longest_float_length) {
1de06328 4775 r->check_end_shift = r->anchored_end_shift;
c277df42 4776 r->check_substr = r->anchored_substr;
33b8afdf 4777 r->check_utf8 = r->anchored_utf8;
c277df42 4778 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4779 if (r->extflags & RXf_ANCH_SINGLE)
4780 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4781 }
4782 else {
1de06328 4783 r->check_end_shift = r->float_end_shift;
c277df42 4784 r->check_substr = r->float_substr;
33b8afdf 4785 r->check_utf8 = r->float_utf8;
1de06328
YO
4786 r->check_offset_min = r->float_min_offset;
4787 r->check_offset_max = r->float_max_offset;
a0d0e21e 4788 }
30382c73
IZ
4789 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4790 This should be changed ASAP! */
bbe252da
YO
4791 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4792 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4793 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4794 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4795 }
1de06328
YO
4796 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4797 if ( (STRLEN)minlen < longest_float_length )
4798 minlen= longest_float_length;
4799 if ( (STRLEN)minlen < longest_fixed_length )
4800 minlen= longest_fixed_length;
4801 */
a0ed51b3
LW
4802 }
4803 else {
c277df42
IZ
4804 /* Several toplevels. Best we can is to set minlen. */
4805 I32 fake;
653099ff 4806 struct regnode_charclass_class ch_class;
cb434fcc 4807 I32 last_close = 0;
c277df42 4808
5d458dd8 4809 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4810
f8fc2ecf 4811 scan = ri->program + 1;
830247a4 4812 cl_init(pRExC_state, &ch_class);
653099ff 4813 data.start_class = &ch_class;
cb434fcc 4814 data.last_closep = &last_close;
07be1b83 4815
de8c5301 4816
1de06328 4817 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4818 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4819
786e8c11 4820 CHECK_RESTUDY_GOTO;
07be1b83 4821
33b8afdf 4822 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4823 = r->float_substr = r->float_utf8 = NULL;
653099ff 4824 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4825 && !cl_is_anything(data.start_class))
4826 {
2eccd3b2 4827 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4828
f8fc2ecf 4829 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4830 struct regnode_charclass_class);
4831 StructCopy(data.start_class,
f8fc2ecf 4832 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4833 struct regnode_charclass_class);
f8fc2ecf 4834 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4835 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4836 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4837 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4838 PerlIO_printf(Perl_debug_log,
a0288114 4839 "synthetic stclass \"%s\".\n",
3f7c398e 4840 SvPVX_const(sv));});
653099ff 4841 }
a0d0e21e
LW
4842 }
4843
1de06328
YO
4844 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4845 the "real" pattern. */
cf9788e3
RGS
4846 DEBUG_OPTIMISE_r({
4847 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4848 (IV)minlen, (IV)r->minlen);
cf9788e3 4849 });
de8c5301 4850 r->minlenret = minlen;
1de06328
YO
4851 if (r->minlen < minlen)
4852 r->minlen = minlen;
4853
b81d288d 4854 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4855 r->extflags |= RXf_GPOS_SEEN;
830247a4 4856 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4857 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4858 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4859 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4860 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4861 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4862 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4863 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4864 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4865 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 4866 if (RExC_paren_names)
85fbaab2 4867 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 4868 else
5daac39c 4869 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 4870
7bd1e614 4871#ifdef STUPID_PATTERN_CHECKS
5509d87a 4872 if (RX_PRELEN(rx) == 0)
640f820d 4873 r->extflags |= RXf_NULL;
5509d87a 4874 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
4875 /* XXX: this should happen BEFORE we compile */
4876 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 4877 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 4878 r->extflags |= RXf_WHITE;
5509d87a 4879 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 4880 r->extflags |= RXf_START_ONLY;
f1b875a0 4881#else
5509d87a 4882 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
4883 /* XXX: this should happen BEFORE we compile */
4884 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
4885 else {
4886 regnode *first = ri->program + 1;
39aa8307
JH
4887 U8 fop = OP(first);
4888 U8 nop = OP(NEXTOPER(first));
7bd1e614 4889
640f820d
AB
4890 if (PL_regkind[fop] == NOTHING && nop == END)
4891 r->extflags |= RXf_NULL;
4892 else if (PL_regkind[fop] == BOL && nop == END)
7bd1e614
YO
4893 r->extflags |= RXf_START_ONLY;
4894 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
4895 r->extflags |= RXf_WHITE;
4896 }
f1b875a0 4897#endif
1f1031fe
YO
4898#ifdef DEBUGGING
4899 if (RExC_paren_names) {
4900 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4901 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4902 } else
1f1031fe 4903#endif
cde0cee5 4904 ri->name_list_idx = 0;
1f1031fe 4905
40d049e4
YO
4906 if (RExC_recurse_count) {
4907 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4908 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4909 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4910 }
4911 }
f0ab9afb 4912 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
4913 /* assume we don't need to swap parens around before we match */
4914
be8e71aa
YO
4915 DEBUG_DUMP_r({
4916 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4917 regdump(r);
4918 });
7122b237
YO
4919#ifdef RE_TRACK_PATTERN_OFFSETS
4920 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4921 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4922 U32 i;
4923 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4924 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4925 for (i = 1; i <= len; i++) {
7122b237 4926 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4927 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4928 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4929 }
4930 PerlIO_printf(Perl_debug_log, "\n");
4931 });
7122b237 4932#endif
288b8c02 4933 return rx;
a687059c
LW
4934}
4935
f9f4320a 4936#undef RE_ENGINE_PTR
3dab1dad 4937
93b32b6d 4938
81714fb9 4939SV*
192b9cd1
AB
4940Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
4941 const U32 flags)
4942{
7918f24d
NC
4943 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
4944
192b9cd1
AB
4945 PERL_UNUSED_ARG(value);
4946
f1b875a0 4947 if (flags & RXapif_FETCH) {
192b9cd1 4948 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 4949 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
f1f66076 4950 Perl_croak(aTHX_ "%s", PL_no_modify);
192b9cd1 4951 return NULL;
f1b875a0 4952 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
4953 return reg_named_buff_exists(rx, key, flags)
4954 ? &PL_sv_yes
4955 : &PL_sv_no;
f1b875a0 4956 } else if (flags & RXapif_REGNAMES) {
192b9cd1 4957 return reg_named_buff_all(rx, flags);
f1b875a0 4958 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
4959 return reg_named_buff_scalar(rx, flags);
4960 } else {
4961 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
4962 return NULL;
4963 }
4964}
4965
4966SV*
4967Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
4968 const U32 flags)
4969{
7918f24d 4970 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
4971 PERL_UNUSED_ARG(lastkey);
4972
f1b875a0 4973 if (flags & RXapif_FIRSTKEY)
192b9cd1 4974 return reg_named_buff_firstkey(rx, flags);
f1b875a0 4975 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
4976 return reg_named_buff_nextkey(rx, flags);
4977 else {
4978 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
4979 return NULL;
4980 }
4981}
4982
4983SV*
288b8c02
NC
4984Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
4985 const U32 flags)
81714fb9 4986{
44a2ac75
YO
4987 AV *retarray = NULL;
4988 SV *ret;
288b8c02 4989 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
4990
4991 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
4992
f1b875a0 4993 if (flags & RXapif_ALL)
44a2ac75 4994 retarray=newAV();
93b32b6d 4995
5daac39c
NC
4996 if (rx && RXp_PAREN_NAMES(rx)) {
4997 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
4998 if (he_str) {
4999 IV i;
5000 SV* sv_dat=HeVAL(he_str);
5001 I32 *nums=(I32*)SvPVX(sv_dat);
5002 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5003 if ((I32)(rx->nparens) >= nums[i]
5004 && rx->offs[nums[i]].start != -1
5005 && rx->offs[nums[i]].end != -1)
93b32b6d 5006 {
49d7dfbc 5007 ret = newSVpvs("");
288b8c02 5008 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5009 if (!retarray)
5010 return ret;
5011 } else {
5012 ret = newSVsv(&PL_sv_undef);
5013 }
ec83ea38 5014 if (retarray)
93b32b6d 5015 av_push(retarray, ret);
81714fb9 5016 }
93b32b6d 5017 if (retarray)
ad64d0ec 5018 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5019 }
5020 }
5021 return NULL;
5022}
5023
5024bool
288b8c02 5025Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5026 const U32 flags)
5027{
288b8c02 5028 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5029
5030 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5031
5daac39c 5032 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5033 if (flags & RXapif_ALL) {
5daac39c 5034 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5035 } else {
288b8c02 5036 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5037 if (sv) {
5038 SvREFCNT_dec(sv);
192b9cd1
AB
5039 return TRUE;
5040 } else {
5041 return FALSE;
5042 }
5043 }
5044 } else {
5045 return FALSE;
5046 }
5047}
5048
5049SV*
288b8c02 5050Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5051{
288b8c02 5052 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5053
5054 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5055
5daac39c
NC
5056 if ( rx && RXp_PAREN_NAMES(rx) ) {
5057 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5058
288b8c02 5059 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5060 } else {
5061 return FALSE;
5062 }
192b9cd1
AB
5063}
5064
5065SV*
288b8c02 5066Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5067{
288b8c02 5068 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5069 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5070
5071 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5072
5daac39c
NC
5073 if (rx && RXp_PAREN_NAMES(rx)) {
5074 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5075 HE *temphe;
5076 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5077 IV i;
5078 IV parno = 0;
5079 SV* sv_dat = HeVAL(temphe);
5080 I32 *nums = (I32*)SvPVX(sv_dat);
5081 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5082 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5083 rx->offs[nums[i]].start != -1 &&
5084 rx->offs[nums[i]].end != -1)
5085 {
5086 parno = nums[i];
5087 break;
5088 }
5089 }
f1b875a0 5090 if (parno || flags & RXapif_ALL) {
a663657d 5091 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5092 }
81714fb9
YO
5093 }
5094 }
44a2ac75
YO
5095 return NULL;
5096}
5097
192b9cd1 5098SV*
288b8c02 5099Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5100{
5101 SV *ret;
5102 AV *av;
5103 I32 length;
288b8c02 5104 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5105
7918f24d
NC
5106 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5107
5daac39c 5108 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5109 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5110 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5111 } else if (flags & RXapif_ONE) {
288b8c02 5112 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5113 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5114 length = av_len(av);
ec83ea38 5115 SvREFCNT_dec(ret);
192b9cd1
AB
5116 return newSViv(length + 1);
5117 } else {
5118 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5119 return NULL;
5120 }
5121 }
5122 return &PL_sv_undef;
5123}
5124
5125SV*
288b8c02 5126Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5127{
288b8c02 5128 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5129 AV *av = newAV();
5130
7918f24d
NC
5131 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5132
5daac39c
NC
5133 if (rx && RXp_PAREN_NAMES(rx)) {
5134 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5135 HE *temphe;
5136 (void)hv_iterinit(hv);
5137 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5138 IV i;
5139 IV parno = 0;
5140 SV* sv_dat = HeVAL(temphe);
5141 I32 *nums = (I32*)SvPVX(sv_dat);
5142 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5143 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5144 rx->offs[nums[i]].start != -1 &&
5145 rx->offs[nums[i]].end != -1)
5146 {
5147 parno = nums[i];
5148 break;
5149 }
5150 }
f1b875a0 5151 if (parno || flags & RXapif_ALL) {
a663657d 5152 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5153 }
5154 }
5155 }
5156
ad64d0ec 5157 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5158}
5159
49d7dfbc 5160void
288b8c02
NC
5161Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5162 SV * const sv)
44a2ac75 5163{
288b8c02 5164 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5165 char *s = NULL;
a9d504c3 5166 I32 i = 0;
44a2ac75 5167 I32 s1, t1;
7918f24d
NC
5168
5169 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5170
cde0cee5
YO
5171 if (!rx->subbeg) {
5172 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5173 return;
cde0cee5
YO
5174 }
5175 else
f1b875a0 5176 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5177 /* $` */
f0ab9afb 5178 i = rx->offs[0].start;
cde0cee5 5179 s = rx->subbeg;
44a2ac75
YO
5180 }
5181 else
f1b875a0 5182 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5183 /* $' */
f0ab9afb
NC
5184 s = rx->subbeg + rx->offs[0].end;
5185 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5186 }
5187 else
5188 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5189 (s1 = rx->offs[paren].start) != -1 &&
5190 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5191 {
5192 /* $& $1 ... */
5193 i = t1 - s1;
5194 s = rx->subbeg + s1;
cde0cee5
YO
5195 } else {
5196 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5197 return;
cde0cee5
YO
5198 }
5199 assert(rx->sublen >= (s - rx->subbeg) + i );
5200 if (i >= 0) {
5201 const int oldtainted = PL_tainted;
5202 TAINT_NOT;
5203 sv_setpvn(sv, s, i);
5204 PL_tainted = oldtainted;
5205 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5206 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5207 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5208 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5209 {
5210 SvUTF8_on(sv);
5211 }
5212 else
5213 SvUTF8_off(sv);
5214 if (PL_tainting) {
07bc277f 5215 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5216 if (SvTYPE(sv) >= SVt_PVMG) {
5217 MAGIC* const mg = SvMAGIC(sv);
5218 MAGIC* mgt;
5219 PL_tainted = 1;
5220 SvMAGIC_set(sv, mg->mg_moremagic);
5221 SvTAINT(sv);
5222 if ((mgt = SvMAGIC(sv))) {
5223 mg->mg_moremagic = mgt;
5224 SvMAGIC_set(sv, mg);
44a2ac75 5225 }
cde0cee5
YO
5226 } else {
5227 PL_tainted = 1;
5228 SvTAINT(sv);
5229 }
5230 } else
5231 SvTAINTED_off(sv);
44a2ac75 5232 }
81714fb9 5233 } else {
44a2ac75 5234 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5235 return;
81714fb9
YO
5236 }
5237}
93b32b6d 5238
2fdbfb4d
AB
5239void
5240Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5241 SV const * const value)
5242{
7918f24d
NC
5243 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5244
2fdbfb4d
AB
5245 PERL_UNUSED_ARG(rx);
5246 PERL_UNUSED_ARG(paren);
5247 PERL_UNUSED_ARG(value);
5248
5249 if (!PL_localizing)
f1f66076 5250 Perl_croak(aTHX_ "%s", PL_no_modify);
2fdbfb4d
AB
5251}
5252
5253I32
288b8c02 5254Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5255 const I32 paren)
5256{
288b8c02 5257 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5258 I32 i;
5259 I32 s1, t1;
5260
7918f24d
NC
5261 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5262
2fdbfb4d
AB
5263 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5264 switch (paren) {
192b9cd1 5265 /* $` / ${^PREMATCH} */
f1b875a0 5266 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5267 if (rx->offs[0].start != -1) {
5268 i = rx->offs[0].start;
5269 if (i > 0) {
5270 s1 = 0;
5271 t1 = i;
5272 goto getlen;
5273 }
5274 }
5275 return 0;
192b9cd1 5276 /* $' / ${^POSTMATCH} */
f1b875a0 5277 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
5278 if (rx->offs[0].end != -1) {
5279 i = rx->sublen - rx->offs[0].end;
5280 if (i > 0) {
5281 s1 = rx->offs[0].end;
5282 t1 = rx->sublen;
5283 goto getlen;
5284 }
5285 }
5286 return 0;
192b9cd1
AB
5287 /* $& / ${^MATCH}, $1, $2, ... */
5288 default:
2fdbfb4d
AB
5289 if (paren <= (I32)rx->nparens &&
5290 (s1 = rx->offs[paren].start) != -1 &&
5291 (t1 = rx->offs[paren].end) != -1)
5292 {
5293 i = t1 - s1;
5294 goto getlen;
5295 } else {
5296 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 5297 report_uninit((const SV *)sv);
2fdbfb4d
AB
5298 return 0;
5299 }
5300 }
5301 getlen:
07bc277f 5302 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
5303 const char * const s = rx->subbeg + s1;
5304 const U8 *ep;
5305 STRLEN el;
5306
5307 i = t1 - s1;
5308 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5309 i = el;
5310 }
5311 return i;
5312}
5313
fe578d7f 5314SV*
49d7dfbc 5315Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 5316{
7918f24d 5317 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 5318 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
5319 if (0)
5320 return NULL;
5321 else
5322 return newSVpvs("Regexp");
fe578d7f 5323}
0a4db386 5324
894be9b7 5325/* Scans the name of a named buffer from the pattern.
0a4db386
YO
5326 * If flags is REG_RSN_RETURN_NULL returns null.
5327 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5328 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5329 * to the parsed name as looked up in the RExC_paren_names hash.
5330 * If there is an error throws a vFAIL().. type exception.
894be9b7 5331 */
0a4db386
YO
5332
5333#define REG_RSN_RETURN_NULL 0
5334#define REG_RSN_RETURN_NAME 1
5335#define REG_RSN_RETURN_DATA 2
5336
894be9b7 5337STATIC SV*
7918f24d
NC
5338S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5339{
894be9b7 5340 char *name_start = RExC_parse;
1f1031fe 5341
7918f24d
NC
5342 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5343
1f1031fe
YO
5344 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5345 /* skip IDFIRST by using do...while */
5346 if (UTF)
5347 do {
5348 RExC_parse += UTF8SKIP(RExC_parse);
5349 } while (isALNUM_utf8((U8*)RExC_parse));
5350 else
5351 do {
5352 RExC_parse++;
5353 } while (isALNUM(*RExC_parse));
894be9b7 5354 }
1f1031fe 5355
0a4db386 5356 if ( flags ) {
59cd0e26
NC
5357 SV* sv_name
5358 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5359 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
5360 if ( flags == REG_RSN_RETURN_NAME)
5361 return sv_name;
5362 else if (flags==REG_RSN_RETURN_DATA) {
5363 HE *he_str = NULL;
5364 SV *sv_dat = NULL;
5365 if ( ! sv_name ) /* should not happen*/
5366 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5367 if (RExC_paren_names)
5368 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5369 if ( he_str )
5370 sv_dat = HeVAL(he_str);
5371 if ( ! sv_dat )
5372 vFAIL("Reference to nonexistent named group");
5373 return sv_dat;
5374 }
5375 else {
5376 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5377 }
5378 /* NOT REACHED */
894be9b7 5379 }
0a4db386 5380 return NULL;
894be9b7
YO
5381}
5382
3dab1dad
YO
5383#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5384 int rem=(int)(RExC_end - RExC_parse); \
5385 int cut; \
5386 int num; \
5387 int iscut=0; \
5388 if (rem>10) { \
5389 rem=10; \
5390 iscut=1; \
5391 } \
5392 cut=10-rem; \
5393 if (RExC_lastparse!=RExC_parse) \
5394 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5395 rem, RExC_parse, \
5396 cut + 4, \
5397 iscut ? "..." : "<" \
5398 ); \
5399 else \
5400 PerlIO_printf(Perl_debug_log,"%16s",""); \
5401 \
5402 if (SIZE_ONLY) \
3b57cd43 5403 num = RExC_size + 1; \
3dab1dad
YO
5404 else \
5405 num=REG_NODE_NUM(RExC_emit); \
5406 if (RExC_lastnum!=num) \
0a4db386 5407 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 5408 else \
0a4db386 5409 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
5410 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5411 (int)((depth*2)), "", \
3dab1dad
YO
5412 (funcname) \
5413 ); \
5414 RExC_lastnum=num; \
5415 RExC_lastparse=RExC_parse; \
5416})
5417
07be1b83
YO
5418
5419
3dab1dad
YO
5420#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5421 DEBUG_PARSE_MSG((funcname)); \
5422 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5423})
6bda09f9
YO
5424#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5425 DEBUG_PARSE_MSG((funcname)); \
5426 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5427})
a687059c
LW
5428/*
5429 - reg - regular expression, i.e. main body or parenthesized thing
5430 *
5431 * Caller must absorb opening parenthesis.
5432 *
5433 * Combining parenthesis handling with the base level of regular expression
5434 * is a trifle forced, but the need to tie the tails of the branches to what
5435 * follows makes it hard to avoid.
5436 */
07be1b83
YO
5437#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5438#ifdef DEBUGGING
5439#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5440#else
5441#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5442#endif
3dab1dad 5443
76e3520e 5444STATIC regnode *
3dab1dad 5445S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 5446 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 5447{
27da23d5 5448 dVAR;
c277df42
IZ
5449 register regnode *ret; /* Will be the head of the group. */
5450 register regnode *br;
5451 register regnode *lastbr;
cbbf8932 5452 register regnode *ender = NULL;
a0d0e21e 5453 register I32 parno = 0;
cbbf8932 5454 I32 flags;
f7819f85 5455 U32 oregflags = RExC_flags;
6136c704
AL
5456 bool have_branch = 0;
5457 bool is_open = 0;
594d7033
YO
5458 I32 freeze_paren = 0;
5459 I32 after_freeze = 0;
9d1d55b5
JP
5460
5461 /* for (?g), (?gc), and (?o) warnings; warning
5462 about (?c) will warn about (?g) -- japhy */
5463
6136c704
AL
5464#define WASTED_O 0x01
5465#define WASTED_G 0x02
5466#define WASTED_C 0x04
5467#define WASTED_GC (0x02|0x04)
cbbf8932 5468 I32 wastedflags = 0x00;
9d1d55b5 5469
fac92740 5470 char * parse_start = RExC_parse; /* MJD */
a28509cc 5471 char * const oregcomp_parse = RExC_parse;
a0d0e21e 5472
3dab1dad 5473 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5474
5475 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
5476 DEBUG_PARSE("reg ");
5477
821b33a5 5478 *flagp = 0; /* Tentatively. */
a0d0e21e 5479
9d1d55b5 5480
a0d0e21e
LW
5481 /* Make an OPEN node, if parenthesized. */
5482 if (paren) {
e2e6a0f1
YO
5483 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5484 char *start_verb = RExC_parse;
5485 STRLEN verb_len = 0;
5486 char *start_arg = NULL;
5487 unsigned char op = 0;
5488 int argok = 1;
5489 int internal_argval = 0; /* internal_argval is only useful if !argok */
5490 while ( *RExC_parse && *RExC_parse != ')' ) {
5491 if ( *RExC_parse == ':' ) {
5492 start_arg = RExC_parse + 1;
5493 break;
5494 }
5495 RExC_parse++;
5496 }
5497 ++start_verb;
5498 verb_len = RExC_parse - start_verb;
5499 if ( start_arg ) {
5500 RExC_parse++;
5501 while ( *RExC_parse && *RExC_parse != ')' )
5502 RExC_parse++;
5503 if ( *RExC_parse != ')' )
5504 vFAIL("Unterminated verb pattern argument");
5505 if ( RExC_parse == start_arg )
5506 start_arg = NULL;
5507 } else {
5508 if ( *RExC_parse != ')' )
5509 vFAIL("Unterminated verb pattern");
5510 }
5d458dd8 5511
e2e6a0f1
YO
5512 switch ( *start_verb ) {
5513 case 'A': /* (*ACCEPT) */
568a785a 5514 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
5515 op = ACCEPT;
5516 internal_argval = RExC_nestroot;
5517 }
5518 break;
5519 case 'C': /* (*COMMIT) */
568a785a 5520 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 5521 op = COMMIT;
e2e6a0f1
YO
5522 break;
5523 case 'F': /* (*FAIL) */
568a785a 5524 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
5525 op = OPFAIL;
5526 argok = 0;
5527 }
5528 break;
5d458dd8
YO
5529 case ':': /* (*:NAME) */
5530 case 'M': /* (*MARK:NAME) */
568a785a 5531 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 5532 op = MARKPOINT;
5d458dd8
YO
5533 argok = -1;
5534 }
5535 break;
5536 case 'P': /* (*PRUNE) */
568a785a 5537 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 5538 op = PRUNE;
e2e6a0f1 5539 break;
5d458dd8 5540 case 'S': /* (*SKIP) */
568a785a 5541 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
5542 op = SKIP;
5543 break;
5544 case 'T': /* (*THEN) */
5545 /* [19:06] <TimToady> :: is then */
568a785a 5546 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
5547 op = CUTGROUP;
5548 RExC_seen |= REG_SEEN_CUTGROUP;
5549 }
e2e6a0f1
YO
5550 break;
5551 }
5552 if ( ! op ) {
5553 RExC_parse++;
5554 vFAIL3("Unknown verb pattern '%.*s'",
5555 verb_len, start_verb);
5556 }
5557 if ( argok ) {
5558 if ( start_arg && internal_argval ) {
5559 vFAIL3("Verb pattern '%.*s' may not have an argument",
5560 verb_len, start_verb);
5561 } else if ( argok < 0 && !start_arg ) {
5562 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5563 verb_len, start_verb);
5564 } else {
5565 ret = reganode(pRExC_state, op, internal_argval);
5566 if ( ! internal_argval && ! SIZE_ONLY ) {
5567 if (start_arg) {
5568 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5569 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5570 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5571 ret->flags = 0;
5572 } else {
5573 ret->flags = 1;
5574 }
5575 }
5576 }
5577 if (!internal_argval)
5578 RExC_seen |= REG_SEEN_VERBARG;
5579 } else if ( start_arg ) {
5580 vFAIL3("Verb pattern '%.*s' may not have an argument",
5581 verb_len, start_verb);
5582 } else {
5583 ret = reg_node(pRExC_state, op);
5584 }
5585 nextchar(pRExC_state);
5586 return ret;
5587 } else
fac92740 5588 if (*RExC_parse == '?') { /* (?...) */
6136c704 5589 bool is_logical = 0;
a28509cc 5590 const char * const seqstart = RExC_parse;
ca9dfc88 5591
830247a4
IZ
5592 RExC_parse++;
5593 paren = *RExC_parse++;
c277df42 5594 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5595 switch (paren) {
894be9b7 5596
1f1031fe
YO
5597 case 'P': /* (?P...) variants for those used to PCRE/Python */
5598 paren = *RExC_parse++;
5599 if ( paren == '<') /* (?P<...>) named capture */
5600 goto named_capture;
5601 else if (paren == '>') { /* (?P>name) named recursion */
5602 goto named_recursion;
5603 }
5604 else if (paren == '=') { /* (?P=...) named backref */
5605 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5606 you change this make sure you change that */
5607 char* name_start = RExC_parse;
5608 U32 num = 0;
5609 SV *sv_dat = reg_scan_name(pRExC_state,
5610 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5611 if (RExC_parse == name_start || *RExC_parse != ')')
5612 vFAIL2("Sequence %.3s... not terminated",parse_start);
5613
5614 if (!SIZE_ONLY) {
5615 num = add_data( pRExC_state, 1, "S" );
5616 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5617 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
5618 }
5619 RExC_sawback = 1;
5620 ret = reganode(pRExC_state,
5621 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5622 num);
5623 *flagp |= HASWIDTH;
5624
5625 Set_Node_Offset(ret, parse_start+1);
5626 Set_Node_Cur_Length(ret); /* MJD */
5627
5628 nextchar(pRExC_state);
5629 return ret;
5630 }
57b84237
YO
5631 RExC_parse++;
5632 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5633 /*NOTREACHED*/
5634 case '<': /* (?<...) */
b81d288d 5635 if (*RExC_parse == '!')
c277df42 5636 paren = ',';
0a4db386 5637 else if (*RExC_parse != '=')
1f1031fe 5638 named_capture:
0a4db386 5639 { /* (?<...>) */
81714fb9 5640 char *name_start;
894be9b7 5641 SV *svname;
81714fb9
YO
5642 paren= '>';
5643 case '\'': /* (?'...') */
5644 name_start= RExC_parse;
0a4db386
YO
5645 svname = reg_scan_name(pRExC_state,
5646 SIZE_ONLY ? /* reverse test from the others */
5647 REG_RSN_RETURN_NAME :
5648 REG_RSN_RETURN_NULL);
57b84237
YO
5649 if (RExC_parse == name_start) {
5650 RExC_parse++;
5651 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5652 /*NOTREACHED*/
5653 }
81714fb9
YO
5654 if (*RExC_parse != paren)
5655 vFAIL2("Sequence (?%c... not terminated",
5656 paren=='>' ? '<' : paren);
5657 if (SIZE_ONLY) {
e62cc96a
YO
5658 HE *he_str;
5659 SV *sv_dat = NULL;
894be9b7
YO
5660 if (!svname) /* shouldnt happen */
5661 Perl_croak(aTHX_
5662 "panic: reg_scan_name returned NULL");
81714fb9
YO
5663 if (!RExC_paren_names) {
5664 RExC_paren_names= newHV();
ad64d0ec 5665 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
5666#ifdef DEBUGGING
5667 RExC_paren_name_list= newAV();
ad64d0ec 5668 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 5669#endif
81714fb9
YO
5670 }
5671 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5672 if ( he_str )
81714fb9 5673 sv_dat = HeVAL(he_str);
e62cc96a 5674 if ( ! sv_dat ) {
81714fb9 5675 /* croak baby croak */
e62cc96a
YO
5676 Perl_croak(aTHX_
5677 "panic: paren_name hash element allocation failed");
5678 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5679 /* (?|...) can mean we have dupes so scan to check
5680 its already been stored. Maybe a flag indicating
5681 we are inside such a construct would be useful,
5682 but the arrays are likely to be quite small, so
5683 for now we punt -- dmq */
5684 IV count = SvIV(sv_dat);
5685 I32 *pv = (I32*)SvPVX(sv_dat);
5686 IV i;
5687 for ( i = 0 ; i < count ; i++ ) {
5688 if ( pv[i] == RExC_npar ) {
5689 count = 0;
5690 break;
5691 }
5692 }
5693 if ( count ) {
5694 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5695 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5696 pv[count] = RExC_npar;
3a92e6ae 5697 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 5698 }
81714fb9
YO
5699 } else {
5700 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5701 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5702 SvIOK_on(sv_dat);
3ec35e0f 5703 SvIV_set(sv_dat, 1);
e62cc96a 5704 }
1f1031fe
YO
5705#ifdef DEBUGGING
5706 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5707 SvREFCNT_dec(svname);
5708#endif
e62cc96a 5709
81714fb9
YO
5710 /*sv_dump(sv_dat);*/
5711 }
5712 nextchar(pRExC_state);
5713 paren = 1;
5714 goto capturing_parens;
5715 }
5716 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5717 RExC_parse++;
fac92740 5718 case '=': /* (?=...) */
89c6a13e
AB
5719 RExC_seen_zerolen++;
5720 break;
fac92740 5721 case '!': /* (?!...) */
830247a4 5722 RExC_seen_zerolen++;
e2e6a0f1
YO
5723 if (*RExC_parse == ')') {
5724 ret=reg_node(pRExC_state, OPFAIL);
5725 nextchar(pRExC_state);
5726 return ret;
5727 }
594d7033
YO
5728 break;
5729 case '|': /* (?|...) */
5730 /* branch reset, behave like a (?:...) except that
5731 buffers in alternations share the same numbers */
5732 paren = ':';
5733 after_freeze = freeze_paren = RExC_npar;
5734 break;
fac92740
MJD
5735 case ':': /* (?:...) */
5736 case '>': /* (?>...) */
a0d0e21e 5737 break;
fac92740
MJD
5738 case '$': /* (?$...) */
5739 case '@': /* (?@...) */
8615cb43 5740 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5741 break;
fac92740 5742 case '#': /* (?#...) */
830247a4
IZ
5743 while (*RExC_parse && *RExC_parse != ')')
5744 RExC_parse++;
5745 if (*RExC_parse != ')')
c277df42 5746 FAIL("Sequence (?#... not terminated");
830247a4 5747 nextchar(pRExC_state);
a0d0e21e
LW
5748 *flagp = TRYAGAIN;
5749 return NULL;
894be9b7
YO
5750 case '0' : /* (?0) */
5751 case 'R' : /* (?R) */
5752 if (*RExC_parse != ')')
6bda09f9 5753 FAIL("Sequence (?R) not terminated");
1a147d38 5754 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5755 *flagp |= POSTPONED;
7f69552c
YO
5756 nextchar(pRExC_state);
5757 return ret;
5758 /*notreached*/
894be9b7
YO
5759 { /* named and numeric backreferences */
5760 I32 num;
894be9b7
YO
5761 case '&': /* (?&NAME) */
5762 parse_start = RExC_parse - 1;
1f1031fe 5763 named_recursion:
894be9b7 5764 {
0a4db386
YO
5765 SV *sv_dat = reg_scan_name(pRExC_state,
5766 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5767 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5768 }
5769 goto gen_recurse_regop;
5770 /* NOT REACHED */
542fa716
YO
5771 case '+':
5772 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5773 RExC_parse++;
5774 vFAIL("Illegal pattern");
5775 }
5776 goto parse_recursion;
5777 /* NOT REACHED*/
5778 case '-': /* (?-1) */
5779 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5780 RExC_parse--; /* rewind to let it be handled later */
5781 goto parse_flags;
5782 }
5783 /*FALLTHROUGH */
6bda09f9
YO
5784 case '1': case '2': case '3': case '4': /* (?1) */
5785 case '5': case '6': case '7': case '8': case '9':
5786 RExC_parse--;
542fa716 5787 parse_recursion:
894be9b7
YO
5788 num = atoi(RExC_parse);
5789 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5790 if (*RExC_parse == '-')
5791 RExC_parse++;
6bda09f9
YO
5792 while (isDIGIT(*RExC_parse))
5793 RExC_parse++;
5794 if (*RExC_parse!=')')
5795 vFAIL("Expecting close bracket");
894be9b7
YO
5796
5797 gen_recurse_regop:
542fa716
YO
5798 if ( paren == '-' ) {
5799 /*
5800 Diagram of capture buffer numbering.
5801 Top line is the normal capture buffer numbers
5802 Botton line is the negative indexing as from
5803 the X (the (?-2))
5804
5805 + 1 2 3 4 5 X 6 7
5806 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5807 - 5 4 3 2 1 X x x
5808
5809 */
5810 num = RExC_npar + num;
5811 if (num < 1) {
5812 RExC_parse++;
5813 vFAIL("Reference to nonexistent group");
5814 }
5815 } else if ( paren == '+' ) {
5816 num = RExC_npar + num - 1;
5817 }
5818
1a147d38 5819 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5820 if (!SIZE_ONLY) {
5821 if (num > (I32)RExC_rx->nparens) {
5822 RExC_parse++;
5823 vFAIL("Reference to nonexistent group");
5824 }
40d049e4 5825 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5826 RExC_emit++;
226de585 5827 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5828 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5829 } else {
6bda09f9 5830 RExC_size++;
6bda09f9 5831 }
0a4db386 5832 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5833 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5834 Set_Node_Offset(ret, parse_start); /* MJD */
5835
a3b492c3 5836 *flagp |= POSTPONED;
6bda09f9
YO
5837 nextchar(pRExC_state);
5838 return ret;
894be9b7
YO
5839 } /* named and numeric backreferences */
5840 /* NOT REACHED */
5841
fac92740 5842 case '?': /* (??...) */
6136c704 5843 is_logical = 1;
57b84237
YO
5844 if (*RExC_parse != '{') {
5845 RExC_parse++;
5846 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5847 /*NOTREACHED*/
5848 }
a3b492c3 5849 *flagp |= POSTPONED;
830247a4 5850 paren = *RExC_parse++;
0f5d15d6 5851 /* FALL THROUGH */
fac92740 5852 case '{': /* (?{...}) */
c277df42 5853 {
2eccd3b2
NC
5854 I32 count = 1;
5855 U32 n = 0;
c277df42 5856 char c;
830247a4 5857 char *s = RExC_parse;
c277df42 5858
830247a4
IZ
5859 RExC_seen_zerolen++;
5860 RExC_seen |= REG_SEEN_EVAL;
5861 while (count && (c = *RExC_parse)) {
6136c704
AL
5862 if (c == '\\') {
5863 if (RExC_parse[1])
5864 RExC_parse++;
5865 }
b81d288d 5866 else if (c == '{')
c277df42 5867 count++;
b81d288d 5868 else if (c == '}')
c277df42 5869 count--;
830247a4 5870 RExC_parse++;
c277df42 5871 }
6136c704 5872 if (*RExC_parse != ')') {
b81d288d 5873 RExC_parse = s;
b45f050a
JF
5874 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5875 }
c277df42 5876 if (!SIZE_ONLY) {
f3548bdc 5877 PAD *pad;
6136c704
AL
5878 OP_4tree *sop, *rop;
5879 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5880
569233ed
SB
5881 ENTER;
5882 Perl_save_re_context(aTHX);
f3548bdc 5883 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5884 sop->op_private |= OPpREFCOUNTED;
5885 /* re_dup will OpREFCNT_inc */
5886 OpREFCNT_set(sop, 1);
569233ed 5887 LEAVE;
c277df42 5888
830247a4 5889 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5890 RExC_rxi->data->data[n] = (void*)rop;
5891 RExC_rxi->data->data[n+1] = (void*)sop;
5892 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5893 SvREFCNT_dec(sv);
a0ed51b3 5894 }
e24b16f9 5895 else { /* First pass */
830247a4 5896 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5897 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5898 /* No compiled RE interpolated, has runtime
5899 components ===> unsafe. */
5900 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5901 if (PL_tainting && PL_tainted)
cc6b7395 5902 FAIL("Eval-group in insecure regular expression");
54df2634 5903#if PERL_VERSION > 8
923e4eb5 5904 if (IN_PERL_COMPILETIME)
b5c19bd7 5905 PL_cv_has_eval = 1;
54df2634 5906#endif
c277df42 5907 }
b5c19bd7 5908
830247a4 5909 nextchar(pRExC_state);
6136c704 5910 if (is_logical) {
830247a4 5911 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5912 if (!SIZE_ONLY)
5913 ret->flags = 2;
3dab1dad 5914 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5915 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5916 return ret;
5917 }
ccb2c380
MP
5918 ret = reganode(pRExC_state, EVAL, n);
5919 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5920 Set_Node_Offset(ret, parse_start);
5921 return ret;
c277df42 5922 }
fac92740 5923 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5924 {
0a4db386 5925 int is_define= 0;
fac92740 5926 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5927 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5928 || RExC_parse[1] == '<'
830247a4 5929 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5930 I32 flag;
5931
830247a4 5932 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5933 if (!SIZE_ONLY)
5934 ret->flags = 1;
3dab1dad 5935 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5936 goto insert_if;
b81d288d 5937 }
a0ed51b3 5938 }
0a4db386
YO
5939 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5940 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5941 {
5942 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5943 char *name_start= RExC_parse++;
2eccd3b2 5944 U32 num = 0;
0a4db386
YO
5945 SV *sv_dat=reg_scan_name(pRExC_state,
5946 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5947 if (RExC_parse == name_start || *RExC_parse != ch)
5948 vFAIL2("Sequence (?(%c... not terminated",
5949 (ch == '>' ? '<' : ch));
5950 RExC_parse++;
5951 if (!SIZE_ONLY) {
5952 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5953 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 5954 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
5955 }
5956 ret = reganode(pRExC_state,NGROUPP,num);
5957 goto insert_if_check_paren;
5958 }
5959 else if (RExC_parse[0] == 'D' &&
5960 RExC_parse[1] == 'E' &&
5961 RExC_parse[2] == 'F' &&
5962 RExC_parse[3] == 'I' &&
5963 RExC_parse[4] == 'N' &&
5964 RExC_parse[5] == 'E')
5965 {
5966 ret = reganode(pRExC_state,DEFINEP,0);
5967 RExC_parse +=6 ;
5968 is_define = 1;
5969 goto insert_if_check_paren;
5970 }
5971 else if (RExC_parse[0] == 'R') {
5972 RExC_parse++;
5973 parno = 0;
5974 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5975 parno = atoi(RExC_parse++);
5976 while (isDIGIT(*RExC_parse))
5977 RExC_parse++;
5978 } else if (RExC_parse[0] == '&') {
5979 SV *sv_dat;
5980 RExC_parse++;
5981 sv_dat = reg_scan_name(pRExC_state,
5982 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5983 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5984 }
1a147d38 5985 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5986 goto insert_if_check_paren;
5987 }
830247a4 5988 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5989 /* (?(1)...) */
6136c704 5990 char c;
830247a4 5991 parno = atoi(RExC_parse++);
c277df42 5992
830247a4
IZ
5993 while (isDIGIT(*RExC_parse))
5994 RExC_parse++;
fac92740 5995 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5996
0a4db386 5997 insert_if_check_paren:
830247a4 5998 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5999 vFAIL("Switch condition not recognized");
c277df42 6000 insert_if:
3dab1dad
YO
6001 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6002 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 6003 if (br == NULL)
830247a4 6004 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 6005 else
3dab1dad 6006 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 6007 c = *nextchar(pRExC_state);
d1b80229
IZ
6008 if (flags&HASWIDTH)
6009 *flagp |= HASWIDTH;
c277df42 6010 if (c == '|') {
0a4db386
YO
6011 if (is_define)
6012 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 6013 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
6014 regbranch(pRExC_state, &flags, 1,depth+1);
6015 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
6016 if (flags&HASWIDTH)
6017 *flagp |= HASWIDTH;
830247a4 6018 c = *nextchar(pRExC_state);
a0ed51b3
LW
6019 }
6020 else
c277df42
IZ
6021 lastbr = NULL;
6022 if (c != ')')
8615cb43 6023 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 6024 ender = reg_node(pRExC_state, TAIL);
3dab1dad 6025 REGTAIL(pRExC_state, br, ender);
c277df42 6026 if (lastbr) {
3dab1dad
YO
6027 REGTAIL(pRExC_state, lastbr, ender);
6028 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
6029 }
6030 else
3dab1dad 6031 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
6032 RExC_size++; /* XXX WHY do we need this?!!
6033 For large programs it seems to be required
6034 but I can't figure out why. -- dmq*/
c277df42 6035 return ret;
a0ed51b3
LW
6036 }
6037 else {
830247a4 6038 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
6039 }
6040 }
1b1626e4 6041 case 0:
830247a4 6042 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 6043 vFAIL("Sequence (? incomplete");
1b1626e4 6044 break;
a0d0e21e 6045 default:
cde0cee5
YO
6046 --RExC_parse;
6047 parse_flags: /* (?i) */
6048 {
6049 U32 posflags = 0, negflags = 0;
6050 U32 *flagsp = &posflags;
6051
6052 while (*RExC_parse) {
6053 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
6054 /* (?g), (?gc) and (?o) are useless here
6055 and must be globally applied -- japhy */
cde0cee5
YO
6056 switch (*RExC_parse) {
6057 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
f7819f85
A
6058 case ONCE_PAT_MOD: /* 'o' */
6059 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 6060 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 6061 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
6062 if (! (wastedflags & wflagbit) ) {
6063 wastedflags |= wflagbit;
6064 vWARN5(
6065 RExC_parse + 1,
6066 "Useless (%s%c) - %suse /%c modifier",
6067 flagsp == &negflags ? "?-" : "?",
6068 *RExC_parse,
6069 flagsp == &negflags ? "don't " : "",
6070 *RExC_parse
6071 );
6072 }
6073 }
cde0cee5
YO
6074 break;
6075
f7819f85 6076 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 6077 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
6078 if (! (wastedflags & WASTED_C) ) {
6079 wastedflags |= WASTED_GC;
9d1d55b5
JP
6080 vWARN3(
6081 RExC_parse + 1,
6082 "Useless (%sc) - %suse /gc modifier",
6083 flagsp == &negflags ? "?-" : "?",
6084 flagsp == &negflags ? "don't " : ""
6085 );
6086 }
6087 }
cde0cee5 6088 break;
f7819f85 6089 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 6090 if (flagsp == &negflags) {
668c081a
NC
6091 if (SIZE_ONLY)
6092 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
6093 } else {
6094 *flagsp |= RXf_PMf_KEEPCOPY;
6095 }
6096 break;
6097 case '-':
57b84237
YO
6098 if (flagsp == &negflags) {
6099 RExC_parse++;
6100 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6101 /*NOTREACHED*/
6102 }
cde0cee5
YO
6103 flagsp = &negflags;
6104 wastedflags = 0; /* reset so (?g-c) warns twice */
6105 break;
6106 case ':':
6107 paren = ':';
6108 /*FALLTHROUGH*/
6109 case ')':
6110 RExC_flags |= posflags;
6111 RExC_flags &= ~negflags;
f7819f85
A
6112 if (paren != ':') {
6113 oregflags |= posflags;
6114 oregflags &= ~negflags;
6115 }
cde0cee5
YO
6116 nextchar(pRExC_state);
6117 if (paren != ':') {
6118 *flagp = TRYAGAIN;
6119 return NULL;
6120 } else {
6121 ret = NULL;
6122 goto parse_rest;
6123 }
6124 /*NOTREACHED*/
6125 default:
cde0cee5
YO
6126 RExC_parse++;
6127 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6128 /*NOTREACHED*/
6129 }
830247a4 6130 ++RExC_parse;
48c036b1 6131 }
cde0cee5 6132 }} /* one for the default block, one for the switch */
a0d0e21e 6133 }
fac92740 6134 else { /* (...) */
81714fb9 6135 capturing_parens:
830247a4
IZ
6136 parno = RExC_npar;
6137 RExC_npar++;
e2e6a0f1 6138
830247a4 6139 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
6140 if (!SIZE_ONLY ){
6141 if (!RExC_nestroot)
6142 RExC_nestroot = parno;
c009da3d
YO
6143 if (RExC_seen & REG_SEEN_RECURSE
6144 && !RExC_open_parens[parno-1])
6145 {
e2e6a0f1 6146 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
6147 "Setting open paren #%"IVdf" to %d\n",
6148 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
6149 RExC_open_parens[parno-1]= ret;
6150 }
6bda09f9 6151 }
fac92740
MJD
6152 Set_Node_Length(ret, 1); /* MJD */
6153 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 6154 is_open = 1;
a0d0e21e 6155 }
a0ed51b3 6156 }
fac92740 6157 else /* ! paren */
a0d0e21e 6158 ret = NULL;
cde0cee5
YO
6159
6160 parse_rest:
a0d0e21e 6161 /* Pick up the branches, linking them together. */
fac92740 6162 parse_start = RExC_parse; /* MJD */
3dab1dad 6163 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e
VP
6164
6165 if (freeze_paren) {
6166 if (RExC_npar > after_freeze)
6167 after_freeze = RExC_npar;
6168 RExC_npar = freeze_paren;
6169 }
6170
fac92740 6171 /* branch_len = (paren != 0); */
2af232bd 6172
a0d0e21e
LW
6173 if (br == NULL)
6174 return(NULL);
830247a4
IZ
6175 if (*RExC_parse == '|') {
6176 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 6177 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 6178 }
fac92740 6179 else { /* MJD */
6bda09f9 6180 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
6181 Set_Node_Length(br, paren != 0);
6182 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6183 }
c277df42
IZ
6184 have_branch = 1;
6185 if (SIZE_ONLY)
830247a4 6186 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
6187 }
6188 else if (paren == ':') {
c277df42
IZ
6189 *flagp |= flags&SIMPLE;
6190 }
6136c704 6191 if (is_open) { /* Starts with OPEN. */
3dab1dad 6192 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
6193 }
6194 else if (paren != '?') /* Not Conditional */
a0d0e21e 6195 ret = br;
8ae10a67 6196 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 6197 lastbr = br;
830247a4
IZ
6198 while (*RExC_parse == '|') {
6199 if (!SIZE_ONLY && RExC_extralen) {
6200 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 6201 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
6202 }
6203 if (SIZE_ONLY)
830247a4
IZ
6204 RExC_extralen += 2; /* Account for LONGJMP. */
6205 nextchar(pRExC_state);
594d7033
YO
6206 if (freeze_paren) {
6207 if (RExC_npar > after_freeze)
6208 after_freeze = RExC_npar;
6209 RExC_npar = freeze_paren;
6210 }
3dab1dad 6211 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 6212
a687059c 6213 if (br == NULL)
a0d0e21e 6214 return(NULL);
3dab1dad 6215 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 6216 lastbr = br;
8ae10a67 6217 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
6218 }
6219
c277df42
IZ
6220 if (have_branch || paren != ':') {
6221 /* Make a closing node, and hook it on the end. */
6222 switch (paren) {
6223 case ':':
830247a4 6224 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
6225 break;
6226 case 1:
830247a4 6227 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
6228 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6229 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6230 "Setting close paren #%"IVdf" to %d\n",
6231 (IV)parno, REG_NODE_NUM(ender)));
6232 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
6233 if (RExC_nestroot == parno)
6234 RExC_nestroot = 0;
40d049e4 6235 }
fac92740
MJD
6236 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6237 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
6238 break;
6239 case '<':
c277df42
IZ
6240 case ',':
6241 case '=':
6242 case '!':
c277df42 6243 *flagp &= ~HASWIDTH;
821b33a5
IZ
6244 /* FALL THROUGH */
6245 case '>':
830247a4 6246 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
6247 break;
6248 case 0:
830247a4 6249 ender = reg_node(pRExC_state, END);
40d049e4
YO
6250 if (!SIZE_ONLY) {
6251 assert(!RExC_opend); /* there can only be one! */
6252 RExC_opend = ender;
6253 }
c277df42
IZ
6254 break;
6255 }
eaf3ca90 6256 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 6257
9674d46a 6258 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
6259 if (depth==1)
6260 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6261
c277df42 6262 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
6263 for (br = ret; br; br = regnext(br)) {
6264 const U8 op = PL_regkind[OP(br)];
6265 if (op == BRANCH) {
07be1b83 6266 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
6267 }
6268 else if (op == BRANCHJ) {
07be1b83 6269 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 6270 }
c277df42
IZ
6271 }
6272 }
a0d0e21e 6273 }
c277df42
IZ
6274
6275 {
e1ec3a88
AL
6276 const char *p;
6277 static const char parens[] = "=!<,>";
c277df42
IZ
6278
6279 if (paren && (p = strchr(parens, paren))) {
eb160463 6280 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
6281 int flag = (p - parens) > 1;
6282
6283 if (paren == '>')
6284 node = SUSPEND, flag = 0;
6bda09f9 6285 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
6286 Set_Node_Cur_Length(ret);
6287 Set_Node_Offset(ret, parse_start + 1);
c277df42 6288 ret->flags = flag;
07be1b83 6289 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 6290 }
a0d0e21e
LW
6291 }
6292
6293 /* Check for proper termination. */
ce3e6498 6294 if (paren) {
e2509266 6295 RExC_flags = oregflags;
830247a4
IZ
6296 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6297 RExC_parse = oregcomp_parse;
380a0633 6298 vFAIL("Unmatched (");
ce3e6498 6299 }
a0ed51b3 6300 }
830247a4
IZ
6301 else if (!paren && RExC_parse < RExC_end) {
6302 if (*RExC_parse == ')') {
6303 RExC_parse++;
380a0633 6304 vFAIL("Unmatched )");
a0ed51b3
LW
6305 }
6306 else
b45f050a 6307 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
6308 /* NOTREACHED */
6309 }
594d7033
YO
6310 if (after_freeze)
6311 RExC_npar = after_freeze;
a0d0e21e 6312 return(ret);
a687059c
LW
6313}
6314
6315/*
6316 - regbranch - one alternative of an | operator
6317 *
6318 * Implements the concatenation operator.
6319 */
76e3520e 6320STATIC regnode *
3dab1dad 6321S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 6322{
97aff369 6323 dVAR;
c277df42
IZ
6324 register regnode *ret;
6325 register regnode *chain = NULL;
6326 register regnode *latest;
6327 I32 flags = 0, c = 0;
3dab1dad 6328 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6329
6330 PERL_ARGS_ASSERT_REGBRANCH;
6331
3dab1dad 6332 DEBUG_PARSE("brnc");
02daf0ab 6333
b81d288d 6334 if (first)
c277df42
IZ
6335 ret = NULL;
6336 else {
b81d288d 6337 if (!SIZE_ONLY && RExC_extralen)
830247a4 6338 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 6339 else {
830247a4 6340 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
6341 Set_Node_Length(ret, 1);
6342 }
c277df42
IZ
6343 }
6344
b81d288d 6345 if (!first && SIZE_ONLY)
830247a4 6346 RExC_extralen += 1; /* BRANCHJ */
b81d288d 6347
c277df42 6348 *flagp = WORST; /* Tentatively. */
a0d0e21e 6349
830247a4
IZ
6350 RExC_parse--;
6351 nextchar(pRExC_state);
6352 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 6353 flags &= ~TRYAGAIN;
3dab1dad 6354 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6355 if (latest == NULL) {
6356 if (flags & TRYAGAIN)
6357 continue;
6358 return(NULL);
a0ed51b3
LW
6359 }
6360 else if (ret == NULL)
c277df42 6361 ret = latest;
8ae10a67 6362 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 6363 if (chain == NULL) /* First piece. */
a0d0e21e
LW
6364 *flagp |= flags&SPSTART;
6365 else {
830247a4 6366 RExC_naughty++;
3dab1dad 6367 REGTAIL(pRExC_state, chain, latest);
a687059c 6368 }
a0d0e21e 6369 chain = latest;
c277df42
IZ
6370 c++;
6371 }
6372 if (chain == NULL) { /* Loop ran zero times. */
830247a4 6373 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
6374 if (ret == NULL)
6375 ret = chain;
6376 }
6377 if (c == 1) {
6378 *flagp |= flags&SIMPLE;
a0d0e21e 6379 }
a687059c 6380
d4c19fe8 6381 return ret;
a687059c
LW
6382}
6383
6384/*
6385 - regpiece - something followed by possible [*+?]
6386 *
6387 * Note that the branching code sequences used for ? and the general cases
6388 * of * and + are somewhat optimized: they use the same NOTHING node as
6389 * both the endmarker for their branch list and the body of the last branch.
6390 * It might seem that this node could be dispensed with entirely, but the
6391 * endmarker role is not redundant.
6392 */
76e3520e 6393STATIC regnode *
3dab1dad 6394S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6395{
97aff369 6396 dVAR;
c277df42 6397 register regnode *ret;
a0d0e21e
LW
6398 register char op;
6399 register char *next;
6400 I32 flags;
1df70142 6401 const char * const origparse = RExC_parse;
a0d0e21e 6402 I32 min;
c277df42 6403 I32 max = REG_INFTY;
fac92740 6404 char *parse_start;
10edeb5d 6405 const char *maxpos = NULL;
3dab1dad 6406 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6407
6408 PERL_ARGS_ASSERT_REGPIECE;
6409
3dab1dad 6410 DEBUG_PARSE("piec");
a0d0e21e 6411
3dab1dad 6412 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
6413 if (ret == NULL) {
6414 if (flags & TRYAGAIN)
6415 *flagp |= TRYAGAIN;
6416 return(NULL);
6417 }
6418
830247a4 6419 op = *RExC_parse;
a0d0e21e 6420
830247a4 6421 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 6422 maxpos = NULL;
fac92740 6423 parse_start = RExC_parse; /* MJD */
830247a4 6424 next = RExC_parse + 1;
a0d0e21e
LW
6425 while (isDIGIT(*next) || *next == ',') {
6426 if (*next == ',') {
6427 if (maxpos)
6428 break;
6429 else
6430 maxpos = next;
a687059c 6431 }
a0d0e21e
LW
6432 next++;
6433 }
6434 if (*next == '}') { /* got one */
6435 if (!maxpos)
6436 maxpos = next;
830247a4
IZ
6437 RExC_parse++;
6438 min = atoi(RExC_parse);
a0d0e21e
LW
6439 if (*maxpos == ',')
6440 maxpos++;
6441 else
830247a4 6442 maxpos = RExC_parse;
a0d0e21e
LW
6443 max = atoi(maxpos);
6444 if (!max && *maxpos != '0')
c277df42
IZ
6445 max = REG_INFTY; /* meaning "infinity" */
6446 else if (max >= REG_INFTY)
8615cb43 6447 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
6448 RExC_parse = next;
6449 nextchar(pRExC_state);
a0d0e21e
LW
6450
6451 do_curly:
6452 if ((flags&SIMPLE)) {
830247a4 6453 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 6454 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
6455 Set_Node_Offset(ret, parse_start+1); /* MJD */
6456 Set_Node_Cur_Length(ret);
a0d0e21e
LW
6457 }
6458 else {
3dab1dad 6459 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
6460
6461 w->flags = 0;
3dab1dad 6462 REGTAIL(pRExC_state, ret, w);
830247a4 6463 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
6464 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6465 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
6466 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6467 }
6bda09f9 6468 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
6469 /* MJD hk */
6470 Set_Node_Offset(ret, parse_start+1);
2af232bd 6471 Set_Node_Length(ret,
fac92740 6472 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 6473
830247a4 6474 if (!SIZE_ONLY && RExC_extralen)
c277df42 6475 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 6476 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 6477 if (SIZE_ONLY)
830247a4
IZ
6478 RExC_whilem_seen++, RExC_extralen += 3;
6479 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 6480 }
c277df42 6481 ret->flags = 0;
a0d0e21e
LW
6482
6483 if (min > 0)
821b33a5
IZ
6484 *flagp = WORST;
6485 if (max > 0)
6486 *flagp |= HASWIDTH;
8fa23287 6487 if (max < min)
8615cb43 6488 vFAIL("Can't do {n,m} with n > m");
c277df42 6489 if (!SIZE_ONLY) {
eb160463
GS
6490 ARG1_SET(ret, (U16)min);
6491 ARG2_SET(ret, (U16)max);
a687059c 6492 }
a687059c 6493
a0d0e21e 6494 goto nest_check;
a687059c 6495 }
a0d0e21e 6496 }
a687059c 6497
a0d0e21e
LW
6498 if (!ISMULT1(op)) {
6499 *flagp = flags;
a687059c 6500 return(ret);
a0d0e21e 6501 }
bb20fd44 6502
c277df42 6503#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
6504
6505 /* if this is reinstated, don't forget to put this back into perldiag:
6506
6507 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6508
6509 (F) The part of the regexp subject to either the * or + quantifier
6510 could match an empty string. The {#} shows in the regular
6511 expression about where the problem was discovered.
6512
6513 */
6514
bb20fd44 6515 if (!(flags&HASWIDTH) && op != '?')
b45f050a 6516 vFAIL("Regexp *+ operand could be empty");
b81d288d 6517#endif
bb20fd44 6518
fac92740 6519 parse_start = RExC_parse;
830247a4 6520 nextchar(pRExC_state);
a0d0e21e 6521
821b33a5 6522 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
6523
6524 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 6525 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 6526 ret->flags = 0;
830247a4 6527 RExC_naughty += 4;
a0d0e21e
LW
6528 }
6529 else if (op == '*') {
6530 min = 0;
6531 goto do_curly;
a0ed51b3
LW
6532 }
6533 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6534 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6535 ret->flags = 0;
830247a4 6536 RExC_naughty += 3;
a0d0e21e
LW
6537 }
6538 else if (op == '+') {
6539 min = 1;
6540 goto do_curly;
a0ed51b3
LW
6541 }
6542 else if (op == '?') {
a0d0e21e
LW
6543 min = 0; max = 1;
6544 goto do_curly;
6545 }
6546 nest_check:
668c081a
NC
6547 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6548 ckWARN3reg(RExC_parse,
6549 "%.*s matches null string many times",
6550 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6551 origparse);
a0d0e21e
LW
6552 }
6553
b9b4dddf 6554 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6555 nextchar(pRExC_state);
6bda09f9 6556 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6557 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6558 }
b9b4dddf
YO
6559#ifndef REG_ALLOW_MINMOD_SUSPEND
6560 else
6561#endif
6562 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6563 regnode *ender;
6564 nextchar(pRExC_state);
6565 ender = reg_node(pRExC_state, SUCCEED);
6566 REGTAIL(pRExC_state, ret, ender);
6567 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6568 ret->flags = 0;
6569 ender = reg_node(pRExC_state, TAIL);
6570 REGTAIL(pRExC_state, ret, ender);
6571 /*ret= ender;*/
6572 }
6573
6574 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6575 RExC_parse++;
b45f050a
JF
6576 vFAIL("Nested quantifiers");
6577 }
a0d0e21e
LW
6578
6579 return(ret);
a687059c
LW
6580}
6581
fc8cd66c
YO
6582
6583/* reg_namedseq(pRExC_state,UVp)
6584
6585 This is expected to be called by a parser routine that has
afefe6bf 6586 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
6587 expected to point at the first char following the N at the time
6588 of the call.
ff3f963a
KW
6589
6590 The \N may be inside (indicated by valuep not being NULL) or outside a
6591 character class.
6592
6593 \N may begin either a named sequence, or if outside a character class, mean
6594 to match a non-newline. For non single-quoted regexes, the tokenizer has
6595 attempted to decide which, and in the case of a named sequence converted it
6596 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6597 where c1... are the characters in the sequence. For single-quoted regexes,
6598 the tokenizer passes the \N sequence through unchanged; this code will not
6599 attempt to determine this nor expand those. The net effect is that if the
6600 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6601 signals that this \N occurrence means to match a non-newline.
6602
6603 Only the \N{U+...} form should occur in a character class, for the same
6604 reason that '.' inside a character class means to just match a period: it
6605 just doesn't make sense.
fc8cd66c
YO
6606
6607 If valuep is non-null then it is assumed that we are parsing inside
6608 of a charclass definition and the first codepoint in the resolved
6609 string is returned via *valuep and the routine will return NULL.
6610 In this mode if a multichar string is returned from the charnames
ff3f963a 6611 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
6612 sequence will be examined. If the string returned is zero length
6613 then the value of *valuep is undefined and NON-NULL will
6614 be returned to indicate failure. (This will NOT be a valid pointer
6615 to a regnode.)
6616
ff3f963a
KW
6617 If valuep is null then it is assumed that we are parsing normal text and a
6618 new EXACT node is inserted into the program containing the resolved string,
6619 and a pointer to the new node is returned. But if the string is zero length
6620 a NOTHING node is emitted instead.
afefe6bf 6621
fc8cd66c 6622 On success RExC_parse is set to the char following the endbrace.
ff3f963a 6623 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
6624 */
6625STATIC regnode *
afefe6bf 6626S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
fc8cd66c 6627{
c3c41406 6628 char * endbrace; /* '}' following the name */
fc8cd66c 6629 regnode *ret = NULL;
ff3f963a
KW
6630#ifdef DEBUGGING
6631 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6632#endif
c3c41406 6633 char* p;
ff3f963a
KW
6634
6635 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
6636
6637 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
6638
6639 GET_RE_DEBUG_FLAGS;
c3c41406
KW
6640
6641 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6642 * modifier. The other meaning does not */
6643 p = (RExC_flags & RXf_PMf_EXTENDED)
6644 ? regwhite( pRExC_state, RExC_parse )
6645 : RExC_parse;
7918f24d 6646
ff3f963a 6647 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
6648 * [^\n]. The former is assumed when it can't be the latter. */
6649 if (*p != '{' || regcurly(p)) {
6650 RExC_parse = p;
ff3f963a 6651 if (valuep) {
afefe6bf 6652 /* no bare \N in a charclass */
ff3f963a
KW
6653 vFAIL("\\N in a character class must be a named character: \\N{...}");
6654 }
afefe6bf
RGS
6655 nextchar(pRExC_state);
6656 ret = reg_node(pRExC_state, REG_ANY);
6657 *flagp |= HASWIDTH|SIMPLE;
6658 RExC_naughty++;
6659 RExC_parse--;
6660 Set_Node_Length(ret, 1); /* MJD */
6661 return ret;
fc8cd66c 6662 }
a4893424 6663
c3c41406
KW
6664 /* Here, we have decided it should be a named sequence */
6665
6666 /* The test above made sure that the next real character is a '{', but
6667 * under the /x modifier, it could be separated by space (or a comment and
6668 * \n) and this is not allowed (for consistency with \x{...} and the
6669 * tokenizer handling of \N{NAME}). */
6670 if (*RExC_parse != '{') {
6671 vFAIL("Missing braces on \\N{}");
6672 }
6673
ff3f963a 6674 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
6675
6676 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6677 || ! (endbrace == RExC_parse /* nothing between the {} */
6678 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6679 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6680 {
6681 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6682 vFAIL("\\N{NAME} must be resolved by the lexer");
6683 }
6684
ff3f963a
KW
6685 if (endbrace == RExC_parse) { /* empty: \N{} */
6686 if (! valuep) {
6687 RExC_parse = endbrace + 1;
6688 return reg_node(pRExC_state,NOTHING);
a4893424 6689 }
fc8cd66c 6690
ff3f963a
KW
6691 if (SIZE_ONLY) {
6692 ckWARNreg(RExC_parse,
6693 "Ignoring zero length \\N{} in character class"
6694 );
6695 RExC_parse = endbrace + 1;
6696 }
6697 *valuep = 0;
6698 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6699 }
ff3f963a
KW
6700
6701 RExC_utf8 = 1; /* named sequences imply Unicode semantics */
6702 RExC_parse += 2; /* Skip past the 'U+' */
6703
6704 if (valuep) { /* In a bracketed char class */
6705 /* We only pay attention to the first char of
6706 multichar strings being returned. I kinda wonder
6707 if this makes sense as it does change the behaviour
6708 from earlier versions, OTOH that behaviour was broken
6709 as well. XXX Solution is to recharacterize as
6710 [rest-of-class]|multi1|multi2... */
6711
6712 STRLEN length_of_hex;
6713 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6714 | PERL_SCAN_DISALLOW_PREFIX
6715 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6716
6717 char * endchar = strchr(RExC_parse, '.');
6718 if (endchar) {
6719 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
6720 }
6721 else endchar = endbrace;
6722
6723 length_of_hex = (STRLEN)(endchar - RExC_parse);
6724 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
6725
6726 /* The tokenizer should have guaranteed validity, but it's possible to
6727 * bypass it by using single quoting, so check */
c3c41406
KW
6728 if (length_of_hex == 0
6729 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
6730 {
6731 RExC_parse += length_of_hex; /* Includes all the valid */
6732 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6733 ? UTF8SKIP(RExC_parse)
6734 : 1;
6735 /* Guard against malformed utf8 */
6736 if (RExC_parse >= endchar) RExC_parse = endchar;
6737 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6738 }
6739
6740 RExC_parse = endbrace + 1;
6741 if (endchar == endbrace) return NULL;
6742
6743 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 6744 }
ff3f963a
KW
6745 else { /* Not a char class */
6746 char *s; /* String to put in generated EXACT node */
6747 STRLEN len = 0; /* Its current length */
6748 char *endchar; /* Points to '.' or '}' ending cur char in the input
6749 stream */
6750
6751 ret = reg_node(pRExC_state,
6752 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6753 s= STRING(ret);
6754
6755 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
6756 * the input which is of the form now 'c1.c2.c3...}' until find the
6757 * ending brace or exeed length 255. The characters that exceed this
6758 * limit are dropped. The limit could be relaxed should it become
6759 * desirable by reparsing this as (?:\N{NAME}), so could generate
6760 * multiple EXACT nodes, as is done for just regular input. But this
6761 * is primarily a named character, and not intended to be a huge long
6762 * string, so 255 bytes should be good enough */
6763 while (1) {
c3c41406 6764 STRLEN length_of_hex;
ff3f963a
KW
6765 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
6766 | PERL_SCAN_DISALLOW_PREFIX
6767 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6768 UV cp; /* Ord of current character */
6769
6770 /* Code points are separated by dots. If none, there is only one
6771 * code point, and is terminated by the brace */
6772 endchar = strchr(RExC_parse, '.');
6773 if (! endchar) endchar = endbrace;
6774
6775 /* The values are Unicode even on EBCDIC machines */
c3c41406
KW
6776 length_of_hex = (STRLEN)(endchar - RExC_parse);
6777 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
6778 if ( length_of_hex == 0
6779 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
ff3f963a 6780 {
c3c41406
KW
6781 RExC_parse += length_of_hex; /* Includes all the valid */
6782 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
6783 ? UTF8SKIP(RExC_parse)
6784 : 1;
6785 /* Guard against malformed utf8 */
6786 if (RExC_parse >= endchar) RExC_parse = endchar;
6787 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
6788 }
6789
6790 if (! FOLD) { /* Not folding, just append to the string */
6791 STRLEN unilen;
6792
6793 /* Quit before adding this character if would exceed limit */
6794 if (len + UNISKIP(cp) > U8_MAX) break;
fc8cd66c 6795
ff3f963a
KW
6796 unilen = reguni(pRExC_state, cp, s);
6797 if (unilen > 0) {
6798 s += unilen;
6799 len += unilen;
6800 }
6801 } else { /* Folding, output the folded equivalent */
6802 STRLEN foldlen,numlen;
6803 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6804 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
6805
6806 /* Quit before exceeding size limit */
6807 if (len + foldlen > U8_MAX) break;
6808
6809 for (foldbuf = tmpbuf;
6810 foldlen;
6811 foldlen -= numlen)
6812 {
6813 cp = utf8_to_uvchr(foldbuf, &numlen);
6814 if (numlen > 0) {
6815 const STRLEN unilen = reguni(pRExC_state, cp, s);
6816 s += unilen;
6817 len += unilen;
6818 /* In EBCDIC the numlen and unilen can differ. */
6819 foldbuf += numlen;
6820 if (numlen >= foldlen)
6821 break;
6822 }
6823 else
6824 break; /* "Can't happen." */
6825 }
6826 }
6827
6828 /* Point to the beginning of the next character in the sequence. */
6829 RExC_parse = endchar + 1;
6830
6831 /* Quit if no more characters */
6832 if (RExC_parse >= endbrace) break;
6833 }
6834
6835
6836 if (SIZE_ONLY) {
6837 if (RExC_parse < endbrace) {
6838 ckWARNreg(RExC_parse - 1,
6839 "Using just the first characters returned by \\N{}");
6840 }
6841
6842 RExC_size += STR_SZ(len);
6843 } else {
6844 STR_LEN(ret) = len;
6845 RExC_emit += STR_SZ(len);
6846 }
6847
6848 RExC_parse = endbrace + 1;
6849
6850 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
6851 with malformed in t/re/pat_advanced.t */
6852 RExC_parse --;
6853 Set_Node_Cur_Length(ret); /* MJD */
6854 nextchar(pRExC_state);
6855 }
6856
6857 return ret;
fc8cd66c
YO
6858}
6859
6860
9e08bc66
TS
6861/*
6862 * reg_recode
6863 *
6864 * It returns the code point in utf8 for the value in *encp.
6865 * value: a code value in the source encoding
6866 * encp: a pointer to an Encode object
6867 *
6868 * If the result from Encode is not a single character,
6869 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6870 */
6871STATIC UV
6872S_reg_recode(pTHX_ const char value, SV **encp)
6873{
6874 STRLEN numlen = 1;
59cd0e26 6875 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 6876 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
6877 const STRLEN newlen = SvCUR(sv);
6878 UV uv = UNICODE_REPLACEMENT;
6879
7918f24d
NC
6880 PERL_ARGS_ASSERT_REG_RECODE;
6881
9e08bc66
TS
6882 if (newlen)
6883 uv = SvUTF8(sv)
6884 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6885 : *(U8*)s;
6886
6887 if (!newlen || numlen != newlen) {
6888 uv = UNICODE_REPLACEMENT;
c86f7df5 6889 *encp = NULL;
9e08bc66
TS
6890 }
6891 return uv;
6892}
6893
fc8cd66c 6894
a687059c
LW
6895/*
6896 - regatom - the lowest level
ee9b8eae
YO
6897
6898 Try to identify anything special at the start of the pattern. If there
6899 is, then handle it as required. This may involve generating a single regop,
6900 such as for an assertion; or it may involve recursing, such as to
6901 handle a () structure.
6902
6903 If the string doesn't start with something special then we gobble up
6904 as much literal text as we can.
6905
6906 Once we have been able to handle whatever type of thing started the
6907 sequence, we return.
6908
6909 Note: we have to be careful with escapes, as they can be both literal
6910 and special, and in the case of \10 and friends can either, depending
6911 on context. Specifically there are two seperate switches for handling
6912 escape sequences, with the one for handling literal escapes requiring
6913 a dummy entry for all of the special escapes that are actually handled
6914 by the other.
6915*/
6916
76e3520e 6917STATIC regnode *
3dab1dad 6918S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6919{
97aff369 6920 dVAR;
cbbf8932 6921 register regnode *ret = NULL;
a0d0e21e 6922 I32 flags;
45948336 6923 char *parse_start = RExC_parse;
3dab1dad
YO
6924 GET_RE_DEBUG_FLAGS_DECL;
6925 DEBUG_PARSE("atom");
a0d0e21e
LW
6926 *flagp = WORST; /* Tentatively. */
6927
7918f24d 6928 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 6929
a0d0e21e 6930tryagain:
f9a79580 6931 switch ((U8)*RExC_parse) {
a0d0e21e 6932 case '^':
830247a4
IZ
6933 RExC_seen_zerolen++;
6934 nextchar(pRExC_state);
bbe252da 6935 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6936 ret = reg_node(pRExC_state, MBOL);
bbe252da 6937 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6938 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6939 else
830247a4 6940 ret = reg_node(pRExC_state, BOL);
fac92740 6941 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6942 break;
6943 case '$':
830247a4 6944 nextchar(pRExC_state);
b81d288d 6945 if (*RExC_parse)
830247a4 6946 RExC_seen_zerolen++;
bbe252da 6947 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6948 ret = reg_node(pRExC_state, MEOL);
bbe252da 6949 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6950 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6951 else
830247a4 6952 ret = reg_node(pRExC_state, EOL);
fac92740 6953 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6954 break;
6955 case '.':
830247a4 6956 nextchar(pRExC_state);
bbe252da 6957 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6958 ret = reg_node(pRExC_state, SANY);
6959 else
6960 ret = reg_node(pRExC_state, REG_ANY);
6961 *flagp |= HASWIDTH|SIMPLE;
830247a4 6962 RExC_naughty++;
fac92740 6963 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6964 break;
6965 case '[':
b45f050a 6966 {
3dab1dad
YO
6967 char * const oregcomp_parse = ++RExC_parse;
6968 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6969 if (*RExC_parse != ']') {
6970 RExC_parse = oregcomp_parse;
b45f050a
JF
6971 vFAIL("Unmatched [");
6972 }
830247a4 6973 nextchar(pRExC_state);
a0d0e21e 6974 *flagp |= HASWIDTH|SIMPLE;
fac92740 6975 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6976 break;
b45f050a 6977 }
a0d0e21e 6978 case '(':
830247a4 6979 nextchar(pRExC_state);
3dab1dad 6980 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6981 if (ret == NULL) {
bf93d4cc 6982 if (flags & TRYAGAIN) {
830247a4 6983 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6984 /* Make parent create an empty node if needed. */
6985 *flagp |= TRYAGAIN;
6986 return(NULL);
6987 }
a0d0e21e 6988 goto tryagain;
bf93d4cc 6989 }
a0d0e21e
LW
6990 return(NULL);
6991 }
a3b492c3 6992 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
6993 break;
6994 case '|':
6995 case ')':
6996 if (flags & TRYAGAIN) {
6997 *flagp |= TRYAGAIN;
6998 return NULL;
6999 }
b45f050a 7000 vFAIL("Internal urp");
a0d0e21e
LW
7001 /* Supposed to be caught earlier. */
7002 break;
85afd4ae 7003 case '{':
830247a4
IZ
7004 if (!regcurly(RExC_parse)) {
7005 RExC_parse++;
85afd4ae
CS
7006 goto defchar;
7007 }
7008 /* FALL THROUGH */
a0d0e21e
LW
7009 case '?':
7010 case '+':
7011 case '*':
830247a4 7012 RExC_parse++;
b45f050a 7013 vFAIL("Quantifier follows nothing");
a0d0e21e 7014 break;
f9a79580
RGS
7015 case 0xDF:
7016 case 0xC3:
7017 case 0xCE:
a0a388a1 7018 do_foldchar:
56d400ed 7019 if (!LOC && FOLD) {
e64b1bd1 7020 U32 len,cp;
7cf3a6a3 7021 len=0; /* silence a spurious compiler warning */
56d400ed 7022 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
e64b1bd1
YO
7023 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7024 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7025 ret = reganode(pRExC_state, FOLDCHAR, cp);
7026 Set_Node_Length(ret, 1); /* MJD */
7027 nextchar(pRExC_state); /* kill whitespace under /x */
7028 return ret;
7029 }
7030 }
7031 goto outer_default;
a0d0e21e 7032 case '\\':
ee9b8eae
YO
7033 /* Special Escapes
7034
7035 This switch handles escape sequences that resolve to some kind
7036 of special regop and not to literal text. Escape sequnces that
7037 resolve to literal text are handled below in the switch marked
7038 "Literal Escapes".
7039
7040 Every entry in this switch *must* have a corresponding entry
7041 in the literal escape switch. However, the opposite is not
7042 required, as the default for this switch is to jump to the
7043 literal text handling code.
7044 */
a0a388a1
YO
7045 switch ((U8)*++RExC_parse) {
7046 case 0xDF:
7047 case 0xC3:
7048 case 0xCE:
7049 goto do_foldchar;
ee9b8eae 7050 /* Special Escapes */
a0d0e21e 7051 case 'A':
830247a4
IZ
7052 RExC_seen_zerolen++;
7053 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 7054 *flagp |= SIMPLE;
ee9b8eae 7055 goto finish_meta_pat;
a0d0e21e 7056 case 'G':
830247a4
IZ
7057 ret = reg_node(pRExC_state, GPOS);
7058 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 7059 *flagp |= SIMPLE;
ee9b8eae
YO
7060 goto finish_meta_pat;
7061 case 'K':
7062 RExC_seen_zerolen++;
7063 ret = reg_node(pRExC_state, KEEPS);
7064 *flagp |= SIMPLE;
37923168
RGS
7065 /* XXX:dmq : disabling in-place substitution seems to
7066 * be necessary here to avoid cases of memory corruption, as
7067 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7068 */
7069 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 7070 goto finish_meta_pat;
a0d0e21e 7071 case 'Z':
830247a4 7072 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 7073 *flagp |= SIMPLE;
a1917ab9 7074 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7075 goto finish_meta_pat;
b85d18e9 7076 case 'z':
830247a4 7077 ret = reg_node(pRExC_state, EOS);
b85d18e9 7078 *flagp |= SIMPLE;
830247a4 7079 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 7080 goto finish_meta_pat;
4a2d328f 7081 case 'C':
f33976b4
DB
7082 ret = reg_node(pRExC_state, CANY);
7083 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 7084 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7085 goto finish_meta_pat;
a0ed51b3 7086 case 'X':
830247a4 7087 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 7088 *flagp |= HASWIDTH;
ee9b8eae 7089 goto finish_meta_pat;
a0d0e21e 7090 case 'w':
eb160463 7091 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 7092 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7093 goto finish_meta_pat;
a0d0e21e 7094 case 'W':
eb160463 7095 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 7096 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7097 goto finish_meta_pat;
a0d0e21e 7098 case 'b':
830247a4
IZ
7099 RExC_seen_zerolen++;
7100 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7101 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 7102 *flagp |= SIMPLE;
ee9b8eae 7103 goto finish_meta_pat;
a0d0e21e 7104 case 'B':
830247a4
IZ
7105 RExC_seen_zerolen++;
7106 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 7107 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 7108 *flagp |= SIMPLE;
ee9b8eae 7109 goto finish_meta_pat;
a0d0e21e 7110 case 's':
eb160463 7111 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 7112 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7113 goto finish_meta_pat;
a0d0e21e 7114 case 'S':
eb160463 7115 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 7116 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7117 goto finish_meta_pat;
a0d0e21e 7118 case 'd':
ffc61ed2 7119 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 7120 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7121 goto finish_meta_pat;
a0d0e21e 7122 case 'D':
ffc61ed2 7123 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 7124 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7125 goto finish_meta_pat;
e1d1eefb
YO
7126 case 'R':
7127 ret = reg_node(pRExC_state, LNBREAK);
7128 *flagp |= HASWIDTH|SIMPLE;
7129 goto finish_meta_pat;
7130 case 'h':
7131 ret = reg_node(pRExC_state, HORIZWS);
7132 *flagp |= HASWIDTH|SIMPLE;
7133 goto finish_meta_pat;
7134 case 'H':
7135 ret = reg_node(pRExC_state, NHORIZWS);
7136 *flagp |= HASWIDTH|SIMPLE;
7137 goto finish_meta_pat;
ee9b8eae 7138 case 'v':
e1d1eefb
YO
7139 ret = reg_node(pRExC_state, VERTWS);
7140 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
7141 goto finish_meta_pat;
7142 case 'V':
e1d1eefb
YO
7143 ret = reg_node(pRExC_state, NVERTWS);
7144 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 7145 finish_meta_pat:
830247a4 7146 nextchar(pRExC_state);
fac92740 7147 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 7148 break;
a14b48bc
LW
7149 case 'p':
7150 case 'P':
3568d838 7151 {
3dab1dad 7152 char* const oldregxend = RExC_end;
d008bc60 7153#ifdef DEBUGGING
ccb2c380 7154 char* parse_start = RExC_parse - 2;
d008bc60 7155#endif
a14b48bc 7156
830247a4 7157 if (RExC_parse[1] == '{') {
3568d838 7158 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
7159 RExC_end = strchr(RExC_parse, '}');
7160 if (!RExC_end) {
3dab1dad 7161 const U8 c = (U8)*RExC_parse;
830247a4
IZ
7162 RExC_parse += 2;
7163 RExC_end = oldregxend;
0da60cf5 7164 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 7165 }
830247a4 7166 RExC_end++;
a14b48bc 7167 }
af6f566e 7168 else {
830247a4 7169 RExC_end = RExC_parse + 2;
af6f566e
HS
7170 if (RExC_end > oldregxend)
7171 RExC_end = oldregxend;
7172 }
830247a4 7173 RExC_parse--;
a14b48bc 7174
3dab1dad 7175 ret = regclass(pRExC_state,depth+1);
a14b48bc 7176
830247a4
IZ
7177 RExC_end = oldregxend;
7178 RExC_parse--;
ccb2c380
MP
7179
7180 Set_Node_Offset(ret, parse_start + 2);
7181 Set_Node_Cur_Length(ret);
830247a4 7182 nextchar(pRExC_state);
a14b48bc
LW
7183 *flagp |= HASWIDTH|SIMPLE;
7184 }
7185 break;
fc8cd66c 7186 case 'N':
afefe6bf 7187 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
7188 multicharacter. join_exact() will join them up later on.
7189 Also this makes sure that things like /\N{BLAH}+/ and
7190 \N{BLAH} being multi char Just Happen. dmq*/
7191 ++RExC_parse;
afefe6bf 7192 ret= reg_namedseq(pRExC_state, NULL, flagp);
fc8cd66c 7193 break;
0a4db386 7194 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 7195 parse_named_seq:
81714fb9
YO
7196 {
7197 char ch= RExC_parse[1];
1f1031fe
YO
7198 if (ch != '<' && ch != '\'' && ch != '{') {
7199 RExC_parse++;
7200 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 7201 } else {
1f1031fe
YO
7202 /* this pretty much dupes the code for (?P=...) in reg(), if
7203 you change this make sure you change that */
81714fb9 7204 char* name_start = (RExC_parse += 2);
2eccd3b2 7205 U32 num = 0;
0a4db386
YO
7206 SV *sv_dat = reg_scan_name(pRExC_state,
7207 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 7208 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 7209 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
7210 vFAIL2("Sequence %.3s... not terminated",parse_start);
7211
7212 if (!SIZE_ONLY) {
7213 num = add_data( pRExC_state, 1, "S" );
7214 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7215 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7216 }
7217
81714fb9
YO
7218 RExC_sawback = 1;
7219 ret = reganode(pRExC_state,
7220 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7221 num);
7222 *flagp |= HASWIDTH;
1f1031fe 7223
81714fb9
YO
7224 /* override incorrect value set in reganode MJD */
7225 Set_Node_Offset(ret, parse_start+1);
7226 Set_Node_Cur_Length(ret); /* MJD */
7227 nextchar(pRExC_state);
1f1031fe 7228
81714fb9
YO
7229 }
7230 break;
1f1031fe 7231 }
2bf803e2 7232 case 'g':
a0d0e21e
LW
7233 case '1': case '2': case '3': case '4':
7234 case '5': case '6': case '7': case '8': case '9':
7235 {
c74340f9 7236 I32 num;
2bf803e2
YO
7237 bool isg = *RExC_parse == 'g';
7238 bool isrel = 0;
7239 bool hasbrace = 0;
7240 if (isg) {
c74340f9 7241 RExC_parse++;
2bf803e2
YO
7242 if (*RExC_parse == '{') {
7243 RExC_parse++;
7244 hasbrace = 1;
7245 }
7246 if (*RExC_parse == '-') {
7247 RExC_parse++;
7248 isrel = 1;
7249 }
1f1031fe
YO
7250 if (hasbrace && !isDIGIT(*RExC_parse)) {
7251 if (isrel) RExC_parse--;
7252 RExC_parse -= 2;
7253 goto parse_named_seq;
7254 } }
c74340f9 7255 num = atoi(RExC_parse);
b72d83b2
RGS
7256 if (isg && num == 0)
7257 vFAIL("Reference to invalid group 0");
c74340f9 7258 if (isrel) {
5624f11d 7259 num = RExC_npar - num;
c74340f9
YO
7260 if (num < 1)
7261 vFAIL("Reference to nonexistent or unclosed group");
7262 }
2bf803e2 7263 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
7264 goto defchar;
7265 else {
3dab1dad 7266 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
7267 while (isDIGIT(*RExC_parse))
7268 RExC_parse++;
1f1031fe
YO
7269 if (parse_start == RExC_parse - 1)
7270 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
7271 if (hasbrace) {
7272 if (*RExC_parse != '}')
7273 vFAIL("Unterminated \\g{...} pattern");
7274 RExC_parse++;
7275 }
c74340f9
YO
7276 if (!SIZE_ONLY) {
7277 if (num > (I32)RExC_rx->nparens)
7278 vFAIL("Reference to nonexistent group");
c74340f9 7279 }
830247a4 7280 RExC_sawback = 1;
eb160463
GS
7281 ret = reganode(pRExC_state,
7282 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7283 num);
a0d0e21e 7284 *flagp |= HASWIDTH;
2af232bd 7285
fac92740 7286 /* override incorrect value set in reganode MJD */
2af232bd 7287 Set_Node_Offset(ret, parse_start+1);
fac92740 7288 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
7289 RExC_parse--;
7290 nextchar(pRExC_state);
a0d0e21e
LW
7291 }
7292 }
7293 break;
7294 case '\0':
830247a4 7295 if (RExC_parse >= RExC_end)
b45f050a 7296 FAIL("Trailing \\");
a0d0e21e
LW
7297 /* FALL THROUGH */
7298 default:
a0288114 7299 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 7300 back into the quick-grab loop below */
45948336 7301 parse_start--;
a0d0e21e
LW
7302 goto defchar;
7303 }
7304 break;
4633a7c4
LW
7305
7306 case '#':
bbe252da 7307 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 7308 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
7309 goto tryagain;
7310 }
7311 /* FALL THROUGH */
7312
f9a79580
RGS
7313 default:
7314 outer_default:{
ba210ebe 7315 register STRLEN len;
58ae7d3f 7316 register UV ender;
a0d0e21e 7317 register char *p;
3dab1dad 7318 char *s;
80aecb99 7319 STRLEN foldlen;
89ebb4a3 7320 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
7321
7322 parse_start = RExC_parse - 1;
a0d0e21e 7323
830247a4 7324 RExC_parse++;
a0d0e21e
LW
7325
7326 defchar:
58ae7d3f 7327 ender = 0;
eb160463
GS
7328 ret = reg_node(pRExC_state,
7329 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 7330 s = STRING(ret);
830247a4
IZ
7331 for (len = 0, p = RExC_parse - 1;
7332 len < 127 && p < RExC_end;
a0d0e21e
LW
7333 len++)
7334 {
3dab1dad 7335 char * const oldp = p;
5b5a24f7 7336
bbe252da 7337 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 7338 p = regwhite( pRExC_state, p );
f9a79580
RGS
7339 switch ((U8)*p) {
7340 case 0xDF:
7341 case 0xC3:
7342 case 0xCE:
56d400ed 7343 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
f9a79580 7344 goto normal_default;
a0d0e21e
LW
7345 case '^':
7346 case '$':
7347 case '.':
7348 case '[':
7349 case '(':
7350 case ')':
7351 case '|':
7352 goto loopdone;
7353 case '\\':
ee9b8eae
YO
7354 /* Literal Escapes Switch
7355
7356 This switch is meant to handle escape sequences that
7357 resolve to a literal character.
7358
7359 Every escape sequence that represents something
7360 else, like an assertion or a char class, is handled
7361 in the switch marked 'Special Escapes' above in this
7362 routine, but also has an entry here as anything that
7363 isn't explicitly mentioned here will be treated as
7364 an unescaped equivalent literal.
7365 */
7366
a0a388a1 7367 switch ((U8)*++p) {
ee9b8eae 7368 /* These are all the special escapes. */
a0a388a1
YO
7369 case 0xDF:
7370 case 0xC3:
7371 case 0xCE:
7372 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7373 goto normal_default;
ee9b8eae
YO
7374 case 'A': /* Start assertion */
7375 case 'b': case 'B': /* Word-boundary assertion*/
7376 case 'C': /* Single char !DANGEROUS! */
7377 case 'd': case 'D': /* digit class */
7378 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 7379 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
7380 case 'k': case 'K': /* named backref, keep marker */
7381 case 'N': /* named char sequence */
38a44b82 7382 case 'p': case 'P': /* Unicode property */
e1d1eefb 7383 case 'R': /* LNBREAK */
ee9b8eae 7384 case 's': case 'S': /* space class */
e1d1eefb 7385 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
7386 case 'w': case 'W': /* word class */
7387 case 'X': /* eXtended Unicode "combining character sequence" */
7388 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
7389 --p;
7390 goto loopdone;
ee9b8eae
YO
7391
7392 /* Anything after here is an escape that resolves to a
7393 literal. (Except digits, which may or may not)
7394 */
a0d0e21e
LW
7395 case 'n':
7396 ender = '\n';
7397 p++;
a687059c 7398 break;
a0d0e21e
LW
7399 case 'r':
7400 ender = '\r';
7401 p++;
a687059c 7402 break;
a0d0e21e
LW
7403 case 't':
7404 ender = '\t';
7405 p++;
a687059c 7406 break;
a0d0e21e
LW
7407 case 'f':
7408 ender = '\f';
7409 p++;
a687059c 7410 break;
a0d0e21e 7411 case 'e':
c7f1f016 7412 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 7413 p++;
a687059c 7414 break;
a0d0e21e 7415 case 'a':
c7f1f016 7416 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 7417 p++;
a687059c 7418 break;
a0d0e21e 7419 case 'x':
a0ed51b3 7420 if (*++p == '{') {
1df70142 7421 char* const e = strchr(p, '}');
b81d288d 7422
b45f050a 7423 if (!e) {
830247a4 7424 RExC_parse = p + 1;
b45f050a
JF
7425 vFAIL("Missing right brace on \\x{}");
7426 }
de5f0749 7427 else {
a4c04bdc
NC
7428 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7429 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 7430 STRLEN numlen = e - p - 1;
53305cf1 7431 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
7432 if (ender > 0xff)
7433 RExC_utf8 = 1;
a0ed51b3
LW
7434 p = e + 1;
7435 }
a0ed51b3
LW
7436 }
7437 else {
a4c04bdc 7438 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 7439 STRLEN numlen = 2;
53305cf1 7440 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
7441 p += numlen;
7442 }
9e08bc66
TS
7443 if (PL_encoding && ender < 0x100)
7444 goto recode_encoding;
a687059c 7445 break;
a0d0e21e
LW
7446 case 'c':
7447 p++;
bbce6d69 7448 ender = UCHARAT(p++);
7449 ender = toCTRL(ender);
a687059c 7450 break;
a0d0e21e
LW
7451 case '0': case '1': case '2': case '3':case '4':
7452 case '5': case '6': case '7': case '8':case '9':
7453 if (*p == '0' ||
830247a4 7454 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 7455 I32 flags = 0;
1df70142 7456 STRLEN numlen = 3;
53305cf1 7457 ender = grok_oct(p, &numlen, &flags, NULL);
609122bd
KW
7458
7459 /* An octal above 0xff is interpreted differently
7460 * depending on if the re is in utf8 or not. If it
7461 * is in utf8, the value will be itself, otherwise
7462 * it is interpreted as modulo 0x100. It has been
7463 * decided to discourage the use of octal above the
7464 * single-byte range. For now, warn only when
7465 * it ends up modulo */
7466 if (SIZE_ONLY && ender >= 0x100
668c081a
NC
7467 && ! UTF && ! PL_encoding) {
7468 ckWARNregdep(p, "Use of octal value above 377 is deprecated");
609122bd 7469 }
a0d0e21e
LW
7470 p += numlen;
7471 }
7472 else {
7473 --p;
7474 goto loopdone;
a687059c 7475 }
9e08bc66
TS
7476 if (PL_encoding && ender < 0x100)
7477 goto recode_encoding;
7478 break;
7479 recode_encoding:
7480 {
7481 SV* enc = PL_encoding;
7482 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
7483 if (!enc && SIZE_ONLY)
7484 ckWARNreg(p, "Invalid escape in the specified encoding");
9e08bc66
TS
7485 RExC_utf8 = 1;
7486 }
a687059c 7487 break;
a0d0e21e 7488 case '\0':
830247a4 7489 if (p >= RExC_end)
b45f050a 7490 FAIL("Trailing \\");
a687059c 7491 /* FALL THROUGH */
a0d0e21e 7492 default:
668c081a
NC
7493 if (!SIZE_ONLY&& isALPHA(*p))
7494 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 7495 goto normal_default;
a0d0e21e
LW
7496 }
7497 break;
a687059c 7498 default:
a0ed51b3 7499 normal_default:
fd400ab9 7500 if (UTF8_IS_START(*p) && UTF) {
1df70142 7501 STRLEN numlen;
5e12f4fb 7502 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 7503 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
7504 p += numlen;
7505 }
7506 else
7507 ender = *p++;
a0d0e21e 7508 break;
a687059c 7509 }
bcdf7404
YO
7510 if ( RExC_flags & RXf_PMf_EXTENDED)
7511 p = regwhite( pRExC_state, p );
60a8b682
JH
7512 if (UTF && FOLD) {
7513 /* Prime the casefolded buffer. */
ac7e0132 7514 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 7515 }
bcdf7404 7516 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
7517 if (len)
7518 p = oldp;
16ea2a2e 7519 else if (UTF) {
80aecb99 7520 if (FOLD) {
60a8b682 7521 /* Emit all the Unicode characters. */
1df70142 7522 STRLEN numlen;
80aecb99
JH
7523 for (foldbuf = tmpbuf;
7524 foldlen;
7525 foldlen -= numlen) {
7526 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7527 if (numlen > 0) {
71207a34 7528 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7529 s += unilen;
7530 len += unilen;
7531 /* In EBCDIC the numlen
7532 * and unilen can differ. */
9dc45d57 7533 foldbuf += numlen;
47654450
JH
7534 if (numlen >= foldlen)
7535 break;
9dc45d57
JH
7536 }
7537 else
7538 break; /* "Can't happen." */
80aecb99
JH
7539 }
7540 }
7541 else {
71207a34 7542 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7543 if (unilen > 0) {
0ebc6274
JH
7544 s += unilen;
7545 len += unilen;
9dc45d57 7546 }
80aecb99 7547 }
a0ed51b3 7548 }
a0d0e21e
LW
7549 else {
7550 len++;
eb160463 7551 REGC((char)ender, s++);
a0d0e21e
LW
7552 }
7553 break;
a687059c 7554 }
16ea2a2e 7555 if (UTF) {
80aecb99 7556 if (FOLD) {
60a8b682 7557 /* Emit all the Unicode characters. */
1df70142 7558 STRLEN numlen;
80aecb99
JH
7559 for (foldbuf = tmpbuf;
7560 foldlen;
7561 foldlen -= numlen) {
7562 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 7563 if (numlen > 0) {
71207a34 7564 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
7565 len += unilen;
7566 s += unilen;
7567 /* In EBCDIC the numlen
7568 * and unilen can differ. */
9dc45d57 7569 foldbuf += numlen;
47654450
JH
7570 if (numlen >= foldlen)
7571 break;
9dc45d57
JH
7572 }
7573 else
7574 break;
80aecb99
JH
7575 }
7576 }
7577 else {
71207a34 7578 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 7579 if (unilen > 0) {
0ebc6274
JH
7580 s += unilen;
7581 len += unilen;
9dc45d57 7582 }
80aecb99
JH
7583 }
7584 len--;
a0ed51b3
LW
7585 }
7586 else
eb160463 7587 REGC((char)ender, s++);
a0d0e21e
LW
7588 }
7589 loopdone:
830247a4 7590 RExC_parse = p - 1;
fac92740 7591 Set_Node_Cur_Length(ret); /* MJD */
830247a4 7592 nextchar(pRExC_state);
793db0cb
JH
7593 {
7594 /* len is STRLEN which is unsigned, need to copy to signed */
7595 IV iv = len;
7596 if (iv < 0)
7597 vFAIL("Internal disaster");
7598 }
a0d0e21e
LW
7599 if (len > 0)
7600 *flagp |= HASWIDTH;
090f7165 7601 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 7602 *flagp |= SIMPLE;
3dab1dad 7603
cd439c50 7604 if (SIZE_ONLY)
830247a4 7605 RExC_size += STR_SZ(len);
3dab1dad
YO
7606 else {
7607 STR_LEN(ret) = len;
830247a4 7608 RExC_emit += STR_SZ(len);
07be1b83 7609 }
3dab1dad 7610 }
a0d0e21e
LW
7611 break;
7612 }
a687059c 7613
a0d0e21e 7614 return(ret);
a687059c
LW
7615}
7616
873ef191 7617STATIC char *
bcdf7404 7618S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7619{
bcdf7404 7620 const char *e = RExC_end;
7918f24d
NC
7621
7622 PERL_ARGS_ASSERT_REGWHITE;
7623
5b5a24f7
CS
7624 while (p < e) {
7625 if (isSPACE(*p))
7626 ++p;
7627 else if (*p == '#') {
bcdf7404 7628 bool ended = 0;
5b5a24f7 7629 do {
bcdf7404
YO
7630 if (*p++ == '\n') {
7631 ended = 1;
7632 break;
7633 }
7634 } while (p < e);
7635 if (!ended)
7636 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7637 }
7638 else
7639 break;
7640 }
7641 return p;
7642}
7643
b8c5462f
JH
7644/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7645 Character classes ([:foo:]) can also be negated ([:^foo:]).
7646 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7647 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7648 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7649
7650#define POSIXCC_DONE(c) ((c) == ':')
7651#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7652#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7653
b8c5462f 7654STATIC I32
830247a4 7655S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7656{
97aff369 7657 dVAR;
936ed897 7658 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7659
7918f24d
NC
7660 PERL_ARGS_ASSERT_REGPPOSIXCC;
7661
830247a4 7662 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7663 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7664 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7665 const char c = UCHARAT(RExC_parse);
097eb12c 7666 char* const s = RExC_parse++;
b81d288d 7667
9a86a77b 7668 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7669 RExC_parse++;
7670 if (RExC_parse == RExC_end)
620e46c5 7671 /* Grandfather lone [:, [=, [. */
830247a4 7672 RExC_parse = s;
620e46c5 7673 else {
3dab1dad 7674 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7675 assert(*t == c);
7676
9a86a77b 7677 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7678 const char *posixcc = s + 1;
830247a4 7679 RExC_parse++; /* skip over the ending ] */
3dab1dad 7680
b8c5462f 7681 if (*s == ':') {
1df70142
AL
7682 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7683 const I32 skip = t - posixcc;
80916619
NC
7684
7685 /* Initially switch on the length of the name. */
7686 switch (skip) {
7687 case 4:
3dab1dad
YO
7688 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7689 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7690 break;
80916619
NC
7691 case 5:
7692 /* Names all of length 5. */
7693 /* alnum alpha ascii blank cntrl digit graph lower
7694 print punct space upper */
7695 /* Offset 4 gives the best switch position. */
7696 switch (posixcc[4]) {
7697 case 'a':
3dab1dad
YO
7698 if (memEQ(posixcc, "alph", 4)) /* alpha */
7699 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7700 break;
7701 case 'e':
3dab1dad
YO
7702 if (memEQ(posixcc, "spac", 4)) /* space */
7703 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7704 break;
7705 case 'h':
3dab1dad
YO
7706 if (memEQ(posixcc, "grap", 4)) /* graph */
7707 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7708 break;
7709 case 'i':
3dab1dad
YO
7710 if (memEQ(posixcc, "asci", 4)) /* ascii */
7711 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7712 break;
7713 case 'k':
3dab1dad
YO
7714 if (memEQ(posixcc, "blan", 4)) /* blank */
7715 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7716 break;
7717 case 'l':
3dab1dad
YO
7718 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7719 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7720 break;
7721 case 'm':
3dab1dad
YO
7722 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7723 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7724 break;
7725 case 'r':
3dab1dad
YO
7726 if (memEQ(posixcc, "lowe", 4)) /* lower */
7727 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7728 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7729 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7730 break;
7731 case 't':
3dab1dad
YO
7732 if (memEQ(posixcc, "digi", 4)) /* digit */
7733 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7734 else if (memEQ(posixcc, "prin", 4)) /* print */
7735 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7736 else if (memEQ(posixcc, "punc", 4)) /* punct */
7737 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7738 break;
b8c5462f
JH
7739 }
7740 break;
80916619 7741 case 6:
3dab1dad
YO
7742 if (memEQ(posixcc, "xdigit", 6))
7743 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7744 break;
7745 }
80916619
NC
7746
7747 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7748 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7749 t - s - 1, s + 1);
80916619
NC
7750 assert (posixcc[skip] == ':');
7751 assert (posixcc[skip+1] == ']');
b45f050a 7752 } else if (!SIZE_ONLY) {
b8c5462f 7753 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7754
830247a4 7755 /* adjust RExC_parse so the warning shows after
b45f050a 7756 the class closes */
9a86a77b 7757 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7758 RExC_parse++;
b45f050a
JF
7759 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7760 }
b8c5462f
JH
7761 } else {
7762 /* Maternal grandfather:
7763 * "[:" ending in ":" but not in ":]" */
830247a4 7764 RExC_parse = s;
767d463e 7765 }
620e46c5
JH
7766 }
7767 }
7768
b8c5462f
JH
7769 return namedclass;
7770}
7771
7772STATIC void
830247a4 7773S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7774{
97aff369 7775 dVAR;
7918f24d
NC
7776
7777 PERL_ARGS_ASSERT_CHECKPOSIXCC;
7778
3dab1dad 7779 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7780 const char *s = RExC_parse;
7781 const char c = *s++;
b8c5462f 7782
3dab1dad 7783 while (isALNUM(*s))
b8c5462f
JH
7784 s++;
7785 if (*s && c == *s && s[1] == ']') {
668c081a
NC
7786 ckWARN3reg(s+2,
7787 "POSIX syntax [%c %c] belongs inside character classes",
7788 c, c);
b45f050a
JF
7789
7790 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7791 if (POSIXCC_NOTYET(c)) {
830247a4 7792 /* adjust RExC_parse so the error shows after
b45f050a 7793 the class closes */
9a86a77b 7794 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7795 NOOP;
b45f050a
JF
7796 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7797 }
b8c5462f
JH
7798 }
7799 }
620e46c5
JH
7800}
7801
7f6f358c 7802
89836f1f
YO
7803#define _C_C_T_(NAME,TEST,WORD) \
7804ANYOF_##NAME: \
7805 if (LOC) \
7806 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7807 else { \
7808 for (value = 0; value < 256; value++) \
7809 if (TEST) \
7810 ANYOF_BITMAP_SET(ret, value); \
7811 } \
7812 yesno = '+'; \
7813 what = WORD; \
7814 break; \
7815case ANYOF_N##NAME: \
7816 if (LOC) \
7817 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7818 else { \
7819 for (value = 0; value < 256; value++) \
7820 if (!TEST) \
7821 ANYOF_BITMAP_SET(ret, value); \
7822 } \
7823 yesno = '!'; \
7824 what = WORD; \
7825 break
7826
e1d1eefb
YO
7827#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
7828ANYOF_##NAME: \
7829 for (value = 0; value < 256; value++) \
7830 if (TEST) \
7831 ANYOF_BITMAP_SET(ret, value); \
7832 yesno = '+'; \
7833 what = WORD; \
7834 break; \
7835case ANYOF_N##NAME: \
7836 for (value = 0; value < 256; value++) \
7837 if (!TEST) \
7838 ANYOF_BITMAP_SET(ret, value); \
7839 yesno = '!'; \
7840 what = WORD; \
7841 break
89836f1f 7842
da7fcca4
YO
7843/*
7844 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
7845 so that it is possible to override the option here without having to
7846 rebuild the entire core. as we are required to do if we change regcomp.h
7847 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
7848*/
7849#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
7850#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
7851#endif
7852
7853#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
7854#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
7855#else
7856#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
7857#endif
7858
7f6f358c
YO
7859/*
7860 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7861 matches the pattern or if the pattern matches a single char only and
7862 that char is < 256 and we are case insensitive then we produce an
7863 EXACT node instead.
7f6f358c 7864*/
89836f1f 7865
76e3520e 7866STATIC regnode *
3dab1dad 7867S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7868{
97aff369 7869 dVAR;
9a86a77b 7870 register UV nextvalue;
3568d838 7871 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7872 register IV range = 0;
e1d1eefb 7873 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 7874 register regnode *ret;
ba210ebe 7875 STRLEN numlen;
ffc61ed2 7876 IV namedclass;
cbbf8932 7877 char *rangebegin = NULL;
936ed897 7878 bool need_class = 0;
c445ea15 7879 SV *listsv = NULL;
ffc61ed2 7880 UV n;
9e55ce06 7881 bool optimize_invert = TRUE;
cbbf8932 7882 AV* unicode_alternate = NULL;
1b2d223b
JH
7883#ifdef EBCDIC
7884 UV literal_endpoint = 0;
7885#endif
7f6f358c 7886 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7887
3dab1dad 7888 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7889 case we need to change the emitted regop to an EXACT. */
07be1b83 7890 const char * orig_parse = RExC_parse;
72f13be8 7891 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7892
7893 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
7894#ifndef DEBUGGING
7895 PERL_UNUSED_ARG(depth);
7896#endif
72f13be8 7897
3dab1dad 7898 DEBUG_PARSE("clas");
7f6f358c
YO
7899
7900 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7901 ret = reganode(pRExC_state, ANYOF, 0);
7902
7903 if (!SIZE_ONLY)
7904 ANYOF_FLAGS(ret) = 0;
7905
9a86a77b 7906 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7907 RExC_naughty++;
7908 RExC_parse++;
7909 if (!SIZE_ONLY)
7910 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7911 }
a0d0e21e 7912
73060fc4 7913 if (SIZE_ONLY) {
830247a4 7914 RExC_size += ANYOF_SKIP;
73060fc4
JH
7915 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7916 }
936ed897 7917 else {
830247a4 7918 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7919 if (FOLD)
7920 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7921 if (LOC)
7922 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7923 ANYOF_BITMAP_ZERO(ret);
396482e1 7924 listsv = newSVpvs("# comment\n");
a0d0e21e 7925 }
b8c5462f 7926
9a86a77b
JH
7927 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7928
b938889d 7929 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7930 checkposixcc(pRExC_state);
b8c5462f 7931
f064b6ad
HS
7932 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7933 if (UCHARAT(RExC_parse) == ']')
7934 goto charclassloop;
ffc61ed2 7935
fc8cd66c 7936parseit:
9a86a77b 7937 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7938
7939 charclassloop:
7940
7941 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7942
73b437c8 7943 if (!range)
830247a4 7944 rangebegin = RExC_parse;
ffc61ed2 7945 if (UTF) {
5e12f4fb 7946 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7947 RExC_end - RExC_parse,
9f7f3913 7948 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7949 RExC_parse += numlen;
7950 }
7951 else
7952 value = UCHARAT(RExC_parse++);
7f6f358c 7953
9a86a77b
JH
7954 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7955 if (value == '[' && POSIXCC(nextvalue))
830247a4 7956 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7957 else if (value == '\\') {
ffc61ed2 7958 if (UTF) {
5e12f4fb 7959 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7960 RExC_end - RExC_parse,
9f7f3913 7961 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7962 RExC_parse += numlen;
7963 }
7964 else
7965 value = UCHARAT(RExC_parse++);
470c3474 7966 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7967 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7968 * be a problem later if we want switch on Unicode.
7969 * A similar issue a little bit later when switching on
7970 * namedclass. --jhi */
ffc61ed2 7971 switch ((I32)value) {
b8c5462f
JH
7972 case 'w': namedclass = ANYOF_ALNUM; break;
7973 case 'W': namedclass = ANYOF_NALNUM; break;
7974 case 's': namedclass = ANYOF_SPACE; break;
7975 case 'S': namedclass = ANYOF_NSPACE; break;
7976 case 'd': namedclass = ANYOF_DIGIT; break;
7977 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
7978 case 'v': namedclass = ANYOF_VERTWS; break;
7979 case 'V': namedclass = ANYOF_NVERTWS; break;
7980 case 'h': namedclass = ANYOF_HORIZWS; break;
7981 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
7982 case 'N': /* Handle \N{NAME} in class */
7983 {
7984 /* We only pay attention to the first char of
7985 multichar strings being returned. I kinda wonder
7986 if this makes sense as it does change the behaviour
7987 from earlier versions, OTOH that behaviour was broken
7988 as well. */
7989 UV v; /* value is register so we cant & it /grrr */
afefe6bf 7990 if (reg_namedseq(pRExC_state, &v, NULL)) {
fc8cd66c
YO
7991 goto parseit;
7992 }
7993 value= v;
7994 }
7995 break;
ffc61ed2
JH
7996 case 'p':
7997 case 'P':
3dab1dad
YO
7998 {
7999 char *e;
af6f566e 8000 if (RExC_parse >= RExC_end)
2a4859cd 8001 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 8002 if (*RExC_parse == '{') {
1df70142 8003 const U8 c = (U8)value;
ffc61ed2
JH
8004 e = strchr(RExC_parse++, '}');
8005 if (!e)
0da60cf5 8006 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
8007 while (isSPACE(UCHARAT(RExC_parse)))
8008 RExC_parse++;
8009 if (e == RExC_parse)
0da60cf5 8010 vFAIL2("Empty \\%c{}", c);
ffc61ed2 8011 n = e - RExC_parse;
ab13f0c7
JH
8012 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8013 n--;
ffc61ed2
JH
8014 }
8015 else {
8016 e = RExC_parse;
8017 n = 1;
8018 }
8019 if (!SIZE_ONLY) {
ab13f0c7
JH
8020 if (UCHARAT(RExC_parse) == '^') {
8021 RExC_parse++;
8022 n--;
8023 value = value == 'p' ? 'P' : 'p'; /* toggle */
8024 while (isSPACE(UCHARAT(RExC_parse))) {
8025 RExC_parse++;
8026 n--;
8027 }
8028 }
097eb12c
AL
8029 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8030 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
8031 }
8032 RExC_parse = e + 1;
8033 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 8034 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 8035 }
f81125e2 8036 break;
b8c5462f
JH
8037 case 'n': value = '\n'; break;
8038 case 'r': value = '\r'; break;
8039 case 't': value = '\t'; break;
8040 case 'f': value = '\f'; break;
8041 case 'b': value = '\b'; break;
c7f1f016
NIS
8042 case 'e': value = ASCII_TO_NATIVE('\033');break;
8043 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 8044 case 'x':
ffc61ed2 8045 if (*RExC_parse == '{') {
a4c04bdc
NC
8046 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8047 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 8048 char * const e = strchr(RExC_parse++, '}');
b81d288d 8049 if (!e)
ffc61ed2 8050 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
8051
8052 numlen = e - RExC_parse;
8053 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8054 RExC_parse = e + 1;
8055 }
8056 else {
a4c04bdc 8057 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
8058 numlen = 2;
8059 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
8060 RExC_parse += numlen;
8061 }
9e08bc66
TS
8062 if (PL_encoding && value < 0x100)
8063 goto recode_encoding;
b8c5462f
JH
8064 break;
8065 case 'c':
830247a4 8066 value = UCHARAT(RExC_parse++);
b8c5462f
JH
8067 value = toCTRL(value);
8068 break;
8069 case '0': case '1': case '2': case '3': case '4':
8070 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
8071 {
8072 I32 flags = 0;
8073 numlen = 3;
8074 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8075 RExC_parse += numlen;
8076 if (PL_encoding && value < 0x100)
8077 goto recode_encoding;
8078 break;
8079 }
8080 recode_encoding:
8081 {
8082 SV* enc = PL_encoding;
8083 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
8084 if (!enc && SIZE_ONLY)
8085 ckWARNreg(RExC_parse,
8086 "Invalid escape in the specified encoding");
9e08bc66
TS
8087 break;
8088 }
1028017a 8089 default:
668c081a
NC
8090 if (!SIZE_ONLY && isALPHA(value))
8091 ckWARN2reg(RExC_parse,
8092 "Unrecognized escape \\%c in character class passed through",
8093 (int)value);
1028017a 8094 break;
b8c5462f 8095 }
ffc61ed2 8096 } /* end of \blah */
1b2d223b
JH
8097#ifdef EBCDIC
8098 else
8099 literal_endpoint++;
8100#endif
ffc61ed2
JH
8101
8102 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8103
8104 if (!SIZE_ONLY && !need_class)
936ed897 8105 ANYOF_CLASS_ZERO(ret);
ffc61ed2 8106
936ed897 8107 need_class = 1;
ffc61ed2
JH
8108
8109 /* a bad range like a-\d, a-[:digit:] ? */
8110 if (range) {
73b437c8 8111 if (!SIZE_ONLY) {
668c081a
NC
8112 const int w =
8113 RExC_parse >= rangebegin ?
8114 RExC_parse - rangebegin : 0;
8115 ckWARN4reg(RExC_parse,
b45f050a 8116 "False [] range \"%*.*s\"",
097eb12c 8117 w, w, rangebegin);
668c081a 8118
3568d838
JH
8119 if (prevvalue < 256) {
8120 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
8121 ANYOF_BITMAP_SET(ret, '-');
8122 }
8123 else {
8124 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
8125 Perl_sv_catpvf(aTHX_ listsv,
3568d838 8126 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 8127 }
b8c5462f 8128 }
ffc61ed2
JH
8129
8130 range = 0; /* this was not a true range */
73b437c8 8131 }
ffc61ed2 8132
89836f1f
YO
8133
8134
73b437c8 8135 if (!SIZE_ONLY) {
c49a72a9
NC
8136 const char *what = NULL;
8137 char yesno = 0;
8138
3568d838
JH
8139 if (namedclass > OOB_NAMEDCLASS)
8140 optimize_invert = FALSE;
e2962f66
JH
8141 /* Possible truncation here but in some 64-bit environments
8142 * the compiler gets heartburn about switch on 64-bit values.
8143 * A similar issue a little earlier when switching on value.
98f323fa 8144 * --jhi */
e2962f66 8145 switch ((I32)namedclass) {
da7fcca4
YO
8146
8147 case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum"));
8148 case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha"));
8149 case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank"));
8150 case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl"));
8151 case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph"));
8152 case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower"));
8153 case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print"));
8154 case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space"));
8155 case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct"));
8156 case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper"));
8157#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
89836f1f 8158 case _C_C_T_(ALNUM, isALNUM(value), "Word");
89836f1f 8159 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
da7fcca4
YO
8160#else
8161 case _C_C_T_(SPACE, isSPACE(value), "PerlSpace");
8162 case _C_C_T_(ALNUM, isALNUM(value), "PerlWord");
8163#endif
89836f1f 8164 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
e1d1eefb
YO
8165 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8166 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
73b437c8
JH
8167 case ANYOF_ASCII:
8168 if (LOC)
936ed897 8169 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 8170 else {
c7f1f016 8171#ifndef EBCDIC
1ba5c669
JH
8172 for (value = 0; value < 128; value++)
8173 ANYOF_BITMAP_SET(ret, value);
8174#else /* EBCDIC */
ffbc6a93 8175 for (value = 0; value < 256; value++) {
3a3c4447
JH
8176 if (isASCII(value))
8177 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8178 }
1ba5c669 8179#endif /* EBCDIC */
73b437c8 8180 }
c49a72a9
NC
8181 yesno = '+';
8182 what = "ASCII";
73b437c8
JH
8183 break;
8184 case ANYOF_NASCII:
8185 if (LOC)
936ed897 8186 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 8187 else {
c7f1f016 8188#ifndef EBCDIC
1ba5c669
JH
8189 for (value = 128; value < 256; value++)
8190 ANYOF_BITMAP_SET(ret, value);
8191#else /* EBCDIC */
ffbc6a93 8192 for (value = 0; value < 256; value++) {
3a3c4447
JH
8193 if (!isASCII(value))
8194 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 8195 }
1ba5c669 8196#endif /* EBCDIC */
73b437c8 8197 }
c49a72a9
NC
8198 yesno = '!';
8199 what = "ASCII";
89836f1f 8200 break;
ffc61ed2
JH
8201 case ANYOF_DIGIT:
8202 if (LOC)
8203 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8204 else {
8205 /* consecutive digits assumed */
8206 for (value = '0'; value <= '9'; value++)
8207 ANYOF_BITMAP_SET(ret, value);
8208 }
c49a72a9 8209 yesno = '+';
da7fcca4 8210 what = POSIX_CC_UNI_NAME("Digit");
ffc61ed2
JH
8211 break;
8212 case ANYOF_NDIGIT:
8213 if (LOC)
8214 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8215 else {
8216 /* consecutive digits assumed */
8217 for (value = 0; value < '0'; value++)
8218 ANYOF_BITMAP_SET(ret, value);
8219 for (value = '9' + 1; value < 256; value++)
8220 ANYOF_BITMAP_SET(ret, value);
8221 }
c49a72a9 8222 yesno = '!';
da7fcca4 8223 what = POSIX_CC_UNI_NAME("Digit");
89836f1f 8224 break;
f81125e2
JP
8225 case ANYOF_MAX:
8226 /* this is to handle \p and \P */
8227 break;
73b437c8 8228 default:
b45f050a 8229 vFAIL("Invalid [::] class");
73b437c8 8230 break;
b8c5462f 8231 }
c49a72a9
NC
8232 if (what) {
8233 /* Strings such as "+utf8::isWord\n" */
8234 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8235 }
b8c5462f 8236 if (LOC)
936ed897 8237 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 8238 continue;
a0d0e21e 8239 }
ffc61ed2
JH
8240 } /* end of namedclass \blah */
8241
a0d0e21e 8242 if (range) {
eb160463 8243 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
8244 const int w = RExC_parse - rangebegin;
8245 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 8246 range = 0; /* not a valid range */
73b437c8 8247 }
a0d0e21e
LW
8248 }
8249 else {
3568d838 8250 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
8251 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8252 RExC_parse[1] != ']') {
8253 RExC_parse++;
ffc61ed2
JH
8254
8255 /* a bad range like \w-, [:word:]- ? */
8256 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 8257 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 8258 const int w =
afd78fd5
JH
8259 RExC_parse >= rangebegin ?
8260 RExC_parse - rangebegin : 0;
830247a4 8261 vWARN4(RExC_parse,
b45f050a 8262 "False [] range \"%*.*s\"",
097eb12c 8263 w, w, rangebegin);
afd78fd5 8264 }
73b437c8 8265 if (!SIZE_ONLY)
936ed897 8266 ANYOF_BITMAP_SET(ret, '-');
73b437c8 8267 } else
ffc61ed2
JH
8268 range = 1; /* yeah, it's a range! */
8269 continue; /* but do it the next time */
a0d0e21e 8270 }
a687059c 8271 }
ffc61ed2 8272
93733859 8273 /* now is the next time */
07be1b83 8274 /*stored += (value - prevvalue + 1);*/
ae5c130c 8275 if (!SIZE_ONLY) {
3568d838 8276 if (prevvalue < 256) {
1df70142 8277 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 8278 IV i;
3568d838 8279#ifdef EBCDIC
1b2d223b
JH
8280 /* In EBCDIC [\x89-\x91] should include
8281 * the \x8e but [i-j] should not. */
8282 if (literal_endpoint == 2 &&
8283 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8284 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 8285 {
3568d838
JH
8286 if (isLOWER(prevvalue)) {
8287 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8288 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8289 stored++;
ffc61ed2 8290 ANYOF_BITMAP_SET(ret, i);
2670d666 8291 }
ffc61ed2 8292 } else {
3568d838 8293 for (i = prevvalue; i <= ceilvalue; i++)
2670d666
BC
8294 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8295 stored++;
ffc61ed2 8296 ANYOF_BITMAP_SET(ret, i);
2670d666 8297 }
ffc61ed2 8298 }
8ada0baa 8299 }
ffc61ed2 8300 else
8ada0baa 8301#endif
07be1b83
YO
8302 for (i = prevvalue; i <= ceilvalue; i++) {
8303 if (!ANYOF_BITMAP_TEST(ret,i)) {
8304 stored++;
8305 ANYOF_BITMAP_SET(ret, i);
8306 }
8307 }
3568d838 8308 }
a5961de5 8309 if (value > 255 || UTF) {
1df70142
AL
8310 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8311 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 8312 stored+=2; /* can't optimize this class */
ffc61ed2 8313 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 8314 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 8315 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
8316 prevnatvalue, natvalue);
8317 }
8318 else if (prevnatvalue == natvalue) {
8319 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 8320 if (FOLD) {
89ebb4a3 8321 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 8322 STRLEN foldlen;
1df70142 8323 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 8324
e294cc5d
JH
8325#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8326 if (RExC_precomp[0] == ':' &&
8327 RExC_precomp[1] == '[' &&
8328 (f == 0xDF || f == 0x92)) {
8329 f = NATIVE_TO_UNI(f);
8330 }
8331#endif
c840d2a2
JH
8332 /* If folding and foldable and a single
8333 * character, insert also the folded version
8334 * to the charclass. */
9e55ce06 8335 if (f != value) {
e294cc5d
JH
8336#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8337 if ((RExC_precomp[0] == ':' &&
8338 RExC_precomp[1] == '[' &&
8339 (f == 0xA2 &&
8340 (value == 0xFB05 || value == 0xFB06))) ?
8341 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8342 foldlen == (STRLEN)UNISKIP(f) )
8343#else
eb160463 8344 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 8345#endif
9e55ce06
JH
8346 Perl_sv_catpvf(aTHX_ listsv,
8347 "%04"UVxf"\n", f);
8348 else {
8349 /* Any multicharacter foldings
8350 * require the following transform:
8351 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8352 * where E folds into "pq" and F folds
8353 * into "rst", all other characters
8354 * fold to single characters. We save
8355 * away these multicharacter foldings,
8356 * to be later saved as part of the
8357 * additional "s" data. */
8358 SV *sv;
8359
8360 if (!unicode_alternate)
8361 unicode_alternate = newAV();
740cce10
NC
8362 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8363 TRUE);
9e55ce06
JH
8364 av_push(unicode_alternate, sv);
8365 }
8366 }
254ba52a 8367
60a8b682
JH
8368 /* If folding and the value is one of the Greek
8369 * sigmas insert a few more sigmas to make the
8370 * folding rules of the sigmas to work right.
8371 * Note that not all the possible combinations
8372 * are handled here: some of them are handled
9e55ce06
JH
8373 * by the standard folding rules, and some of
8374 * them (literal or EXACTF cases) are handled
8375 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
8376 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8377 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8378 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 8379 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8380 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8381 }
8382 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8383 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 8384 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
8385 }
8386 }
ffc61ed2 8387 }
1b2d223b
JH
8388#ifdef EBCDIC
8389 literal_endpoint = 0;
8390#endif
8ada0baa 8391 }
ffc61ed2
JH
8392
8393 range = 0; /* this range (if it was one) is done now */
a0d0e21e 8394 }
ffc61ed2 8395
936ed897 8396 if (need_class) {
4f66b38d 8397 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 8398 if (SIZE_ONLY)
830247a4 8399 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 8400 else
830247a4 8401 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 8402 }
ffc61ed2 8403
7f6f358c
YO
8404
8405 if (SIZE_ONLY)
8406 return ret;
8407 /****** !SIZE_ONLY AFTER HERE *********/
8408
3b57cd43 8409 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
7f6f358c
YO
8410 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8411 ) {
8412 /* optimize single char class to an EXACT node
8413 but *only* when its not a UTF/high char */
07be1b83
YO
8414 const char * cur_parse= RExC_parse;
8415 RExC_emit = (regnode *)orig_emit;
8416 RExC_parse = (char *)orig_parse;
7f6f358c
YO
8417 ret = reg_node(pRExC_state,
8418 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 8419 RExC_parse = (char *)cur_parse;
7f6f358c
YO
8420 *STRING(ret)= (char)value;
8421 STR_LEN(ret)= 1;
8422 RExC_emit += STR_SZ(1);
ef8d46e8 8423 SvREFCNT_dec(listsv);
7f6f358c
YO
8424 return ret;
8425 }
ae5c130c 8426 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 8427 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
8428 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8429 ) {
a0ed51b3 8430 for (value = 0; value < 256; ++value) {
936ed897 8431 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 8432 UV fold = PL_fold[value];
ffc61ed2
JH
8433
8434 if (fold != value)
8435 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
8436 }
8437 }
936ed897 8438 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 8439 }
ffc61ed2 8440
ae5c130c 8441 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 8442 if (optimize_invert &&
ffc61ed2
JH
8443 /* If the only flag is inversion. */
8444 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 8445 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 8446 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 8447 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 8448 }
7f6f358c 8449 {
097eb12c 8450 AV * const av = newAV();
ffc61ed2 8451 SV *rv;
9e55ce06 8452 /* The 0th element stores the character class description
6a0407ee 8453 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
8454 * to initialize the appropriate swash (which gets stored in
8455 * the 1st element), and also useful for dumping the regnode.
8456 * The 2nd element stores the multicharacter foldings,
6a0407ee 8457 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
8458 av_store(av, 0, listsv);
8459 av_store(av, 1, NULL);
ad64d0ec
NC
8460 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8461 rv = newRV_noinc(MUTABLE_SV(av));
19860706 8462 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 8463 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 8464 ARG_SET(ret, n);
a0ed51b3 8465 }
a0ed51b3
LW
8466 return ret;
8467}
89836f1f
YO
8468#undef _C_C_T_
8469
a0ed51b3 8470
bcdf7404
YO
8471/* reg_skipcomment()
8472
8473 Absorbs an /x style # comments from the input stream.
8474 Returns true if there is more text remaining in the stream.
8475 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8476 terminates the pattern without including a newline.
8477
8478 Note its the callers responsibility to ensure that we are
8479 actually in /x mode
8480
8481*/
8482
8483STATIC bool
8484S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8485{
8486 bool ended = 0;
7918f24d
NC
8487
8488 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8489
bcdf7404
YO
8490 while (RExC_parse < RExC_end)
8491 if (*RExC_parse++ == '\n') {
8492 ended = 1;
8493 break;
8494 }
8495 if (!ended) {
8496 /* we ran off the end of the pattern without ending
8497 the comment, so we have to add an \n when wrapping */
8498 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8499 return 0;
8500 } else
8501 return 1;
8502}
8503
8504/* nextchar()
8505
8506 Advance that parse position, and optionall absorbs
8507 "whitespace" from the inputstream.
8508
8509 Without /x "whitespace" means (?#...) style comments only,
8510 with /x this means (?#...) and # comments and whitespace proper.
8511
8512 Returns the RExC_parse point from BEFORE the scan occurs.
8513
8514 This is the /x friendly way of saying RExC_parse++.
8515*/
8516
76e3520e 8517STATIC char*
830247a4 8518S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 8519{
097eb12c 8520 char* const retval = RExC_parse++;
a0d0e21e 8521
7918f24d
NC
8522 PERL_ARGS_ASSERT_NEXTCHAR;
8523
4633a7c4 8524 for (;;) {
830247a4
IZ
8525 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
8526 RExC_parse[2] == '#') {
e994fd66
AE
8527 while (*RExC_parse != ')') {
8528 if (RExC_parse == RExC_end)
8529 FAIL("Sequence (?#... not terminated");
830247a4 8530 RExC_parse++;
e994fd66 8531 }
830247a4 8532 RExC_parse++;
4633a7c4
LW
8533 continue;
8534 }
bbe252da 8535 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
8536 if (isSPACE(*RExC_parse)) {
8537 RExC_parse++;
748a9306
LW
8538 continue;
8539 }
830247a4 8540 else if (*RExC_parse == '#') {
bcdf7404
YO
8541 if ( reg_skipcomment( pRExC_state ) )
8542 continue;
748a9306 8543 }
748a9306 8544 }
4633a7c4 8545 return retval;
a0d0e21e 8546 }
a687059c
LW
8547}
8548
8549/*
c277df42 8550- reg_node - emit a node
a0d0e21e 8551*/
76e3520e 8552STATIC regnode * /* Location. */
830247a4 8553S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 8554{
97aff369 8555 dVAR;
c277df42 8556 register regnode *ptr;
504618e9 8557 regnode * const ret = RExC_emit;
07be1b83 8558 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8559
7918f24d
NC
8560 PERL_ARGS_ASSERT_REG_NODE;
8561
c277df42 8562 if (SIZE_ONLY) {
830247a4
IZ
8563 SIZE_ALIGN(RExC_size);
8564 RExC_size += 1;
a0d0e21e
LW
8565 return(ret);
8566 }
3b57cd43
YO
8567 if (RExC_emit >= RExC_emit_bound)
8568 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8569
c277df42 8570 NODE_ALIGN_FILL(ret);
a0d0e21e 8571 ptr = ret;
c277df42 8572 FILL_ADVANCE_NODE(ptr, op);
7122b237 8573#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8574 if (RExC_offsets) { /* MJD */
07be1b83 8575 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 8576 "reg_node", __LINE__,
13d6edb4 8577 PL_reg_name[op],
07be1b83
YO
8578 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
8579 ? "Overwriting end of array!\n" : "OK",
8580 (UV)(RExC_emit - RExC_emit_start),
8581 (UV)(RExC_parse - RExC_start),
8582 (UV)RExC_offsets[0]));
ccb2c380 8583 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 8584 }
7122b237 8585#endif
830247a4 8586 RExC_emit = ptr;
a0d0e21e 8587 return(ret);
a687059c
LW
8588}
8589
8590/*
a0d0e21e
LW
8591- reganode - emit a node with an argument
8592*/
76e3520e 8593STATIC regnode * /* Location. */
830247a4 8594S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 8595{
97aff369 8596 dVAR;
c277df42 8597 register regnode *ptr;
504618e9 8598 regnode * const ret = RExC_emit;
07be1b83 8599 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 8600
7918f24d
NC
8601 PERL_ARGS_ASSERT_REGANODE;
8602
c277df42 8603 if (SIZE_ONLY) {
830247a4
IZ
8604 SIZE_ALIGN(RExC_size);
8605 RExC_size += 2;
6bda09f9
YO
8606 /*
8607 We can't do this:
8608
8609 assert(2==regarglen[op]+1);
8610
8611 Anything larger than this has to allocate the extra amount.
8612 If we changed this to be:
8613
8614 RExC_size += (1 + regarglen[op]);
8615
8616 then it wouldn't matter. Its not clear what side effect
8617 might come from that so its not done so far.
8618 -- dmq
8619 */
a0d0e21e
LW
8620 return(ret);
8621 }
3b57cd43
YO
8622 if (RExC_emit >= RExC_emit_bound)
8623 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
8624
c277df42 8625 NODE_ALIGN_FILL(ret);
a0d0e21e 8626 ptr = ret;
c277df42 8627 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 8628#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8629 if (RExC_offsets) { /* MJD */
07be1b83 8630 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8631 "reganode",
ccb2c380 8632 __LINE__,
13d6edb4 8633 PL_reg_name[op],
07be1b83 8634 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 8635 "Overwriting end of array!\n" : "OK",
07be1b83
YO
8636 (UV)(RExC_emit - RExC_emit_start),
8637 (UV)(RExC_parse - RExC_start),
8638 (UV)RExC_offsets[0]));
ccb2c380 8639 Set_Cur_Node_Offset;
fac92740 8640 }
7122b237 8641#endif
830247a4 8642 RExC_emit = ptr;
a0d0e21e 8643 return(ret);
fe14fcc3
LW
8644}
8645
8646/*
cd439c50 8647- reguni - emit (if appropriate) a Unicode character
a0ed51b3 8648*/
71207a34
AL
8649STATIC STRLEN
8650S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 8651{
97aff369 8652 dVAR;
7918f24d
NC
8653
8654 PERL_ARGS_ASSERT_REGUNI;
8655
71207a34 8656 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
8657}
8658
8659/*
a0d0e21e
LW
8660- reginsert - insert an operator in front of already-emitted operand
8661*
8662* Means relocating the operand.
8663*/
76e3520e 8664STATIC void
6bda09f9 8665S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 8666{
97aff369 8667 dVAR;
c277df42
IZ
8668 register regnode *src;
8669 register regnode *dst;
8670 register regnode *place;
504618e9 8671 const int offset = regarglen[(U8)op];
6bda09f9 8672 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8673 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8674
8675 PERL_ARGS_ASSERT_REGINSERT;
def51078 8676 PERL_UNUSED_ARG(depth);
22c35a8c 8677/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 8678 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 8679 if (SIZE_ONLY) {
6bda09f9 8680 RExC_size += size;
a0d0e21e
LW
8681 return;
8682 }
a687059c 8683
830247a4 8684 src = RExC_emit;
6bda09f9 8685 RExC_emit += size;
830247a4 8686 dst = RExC_emit;
40d049e4 8687 if (RExC_open_parens) {
6bda09f9 8688 int paren;
3b57cd43 8689 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 8690 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 8691 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 8692 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
8693 RExC_open_parens[paren] += size;
8694 } else {
3b57cd43 8695 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
8696 }
8697 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 8698 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
8699 RExC_close_parens[paren] += size;
8700 } else {
3b57cd43 8701 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
8702 }
8703 }
6bda09f9 8704 }
40d049e4 8705
fac92740 8706 while (src > opnd) {
c277df42 8707 StructCopy(--src, --dst, regnode);
7122b237 8708#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8709 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8710 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8711 "reg_insert",
ccb2c380 8712 __LINE__,
13d6edb4 8713 PL_reg_name[op],
07be1b83
YO
8714 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8715 ? "Overwriting end of array!\n" : "OK",
8716 (UV)(src - RExC_emit_start),
8717 (UV)(dst - RExC_emit_start),
8718 (UV)RExC_offsets[0]));
ccb2c380
MP
8719 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8720 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8721 }
7122b237 8722#endif
fac92740
MJD
8723 }
8724
a0d0e21e
LW
8725
8726 place = opnd; /* Op node, where operand used to be. */
7122b237 8727#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8728 if (RExC_offsets) { /* MJD */
07be1b83 8729 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8730 "reginsert",
ccb2c380 8731 __LINE__,
13d6edb4 8732 PL_reg_name[op],
07be1b83 8733 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8734 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8735 (UV)(place - RExC_emit_start),
8736 (UV)(RExC_parse - RExC_start),
786e8c11 8737 (UV)RExC_offsets[0]));
ccb2c380 8738 Set_Node_Offset(place, RExC_parse);
45948336 8739 Set_Node_Length(place, 1);
fac92740 8740 }
7122b237 8741#endif
c277df42
IZ
8742 src = NEXTOPER(place);
8743 FILL_ADVANCE_NODE(place, op);
8744 Zero(src, offset, regnode);
a687059c
LW
8745}
8746
8747/*
c277df42 8748- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8749- SEE ALSO: regtail_study
a0d0e21e 8750*/
097eb12c 8751/* TODO: All three parms should be const */
76e3520e 8752STATIC void
3dab1dad 8753S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8754{
97aff369 8755 dVAR;
c277df42 8756 register regnode *scan;
72f13be8 8757 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8758
8759 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
8760#ifndef DEBUGGING
8761 PERL_UNUSED_ARG(depth);
8762#endif
a0d0e21e 8763
c277df42 8764 if (SIZE_ONLY)
a0d0e21e
LW
8765 return;
8766
8767 /* Find last node. */
8768 scan = p;
8769 for (;;) {
504618e9 8770 regnode * const temp = regnext(scan);
3dab1dad
YO
8771 DEBUG_PARSE_r({
8772 SV * const mysv=sv_newmortal();
8773 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8774 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8775 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8776 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8777 (temp == NULL ? "->" : ""),
13d6edb4 8778 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 8779 );
3dab1dad
YO
8780 });
8781 if (temp == NULL)
8782 break;
8783 scan = temp;
8784 }
8785
8786 if (reg_off_by_arg[OP(scan)]) {
8787 ARG_SET(scan, val - scan);
8788 }
8789 else {
8790 NEXT_OFF(scan) = val - scan;
8791 }
8792}
8793
07be1b83 8794#ifdef DEBUGGING
3dab1dad
YO
8795/*
8796- regtail_study - set the next-pointer at the end of a node chain of p to val.
8797- Look for optimizable sequences at the same time.
8798- currently only looks for EXACT chains.
07be1b83
YO
8799
8800This is expermental code. The idea is to use this routine to perform
8801in place optimizations on branches and groups as they are constructed,
8802with the long term intention of removing optimization from study_chunk so
8803that it is purely analytical.
8804
8805Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8806to control which is which.
8807
3dab1dad
YO
8808*/
8809/* TODO: All four parms should be const */
07be1b83 8810
3dab1dad
YO
8811STATIC U8
8812S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8813{
8814 dVAR;
8815 register regnode *scan;
07be1b83
YO
8816 U8 exact = PSEUDO;
8817#ifdef EXPERIMENTAL_INPLACESCAN
8818 I32 min = 0;
8819#endif
3dab1dad
YO
8820 GET_RE_DEBUG_FLAGS_DECL;
8821
7918f24d
NC
8822 PERL_ARGS_ASSERT_REGTAIL_STUDY;
8823
07be1b83 8824
3dab1dad
YO
8825 if (SIZE_ONLY)
8826 return exact;
8827
8828 /* Find last node. */
8829
8830 scan = p;
8831 for (;;) {
8832 regnode * const temp = regnext(scan);
07be1b83
YO
8833#ifdef EXPERIMENTAL_INPLACESCAN
8834 if (PL_regkind[OP(scan)] == EXACT)
8835 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8836 return EXACT;
8837#endif
3dab1dad
YO
8838 if ( exact ) {
8839 switch (OP(scan)) {
8840 case EXACT:
8841 case EXACTF:
8842 case EXACTFL:
8843 if( exact == PSEUDO )
8844 exact= OP(scan);
07be1b83
YO
8845 else if ( exact != OP(scan) )
8846 exact= 0;
3dab1dad
YO
8847 case NOTHING:
8848 break;
8849 default:
8850 exact= 0;
8851 }
8852 }
8853 DEBUG_PARSE_r({
8854 SV * const mysv=sv_newmortal();
8855 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8856 regprop(RExC_rx, mysv, scan);
eaf3ca90 8857 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8858 SvPV_nolen_const(mysv),
eaf3ca90 8859 REG_NODE_NUM(scan),
13d6edb4 8860 PL_reg_name[exact]);
3dab1dad 8861 });
a0d0e21e
LW
8862 if (temp == NULL)
8863 break;
8864 scan = temp;
8865 }
07be1b83
YO
8866 DEBUG_PARSE_r({
8867 SV * const mysv_val=sv_newmortal();
8868 DEBUG_PARSE_MSG("");
8869 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8870 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8871 SvPV_nolen_const(mysv_val),
8872 (IV)REG_NODE_NUM(val),
8873 (IV)(val - scan)
07be1b83
YO
8874 );
8875 });
c277df42
IZ
8876 if (reg_off_by_arg[OP(scan)]) {
8877 ARG_SET(scan, val - scan);
a0ed51b3
LW
8878 }
8879 else {
c277df42
IZ
8880 NEXT_OFF(scan) = val - scan;
8881 }
3dab1dad
YO
8882
8883 return exact;
a687059c 8884}
07be1b83 8885#endif
a687059c
LW
8886
8887/*
a687059c
LW
8888 - regcurly - a little FSA that accepts {\d+,?\d*}
8889 */
d3b0eb15 8890#ifndef PERL_IN_XSUB_RE
ff3f963a
KW
8891I32
8892Perl_regcurly(register const char *s)
a687059c 8893{
7918f24d
NC
8894 PERL_ARGS_ASSERT_REGCURLY;
8895
a687059c
LW
8896 if (*s++ != '{')
8897 return FALSE;
f0fcb552 8898 if (!isDIGIT(*s))
a687059c 8899 return FALSE;
f0fcb552 8900 while (isDIGIT(*s))
a687059c
LW
8901 s++;
8902 if (*s == ',')
8903 s++;
f0fcb552 8904 while (isDIGIT(*s))
a687059c
LW
8905 s++;
8906 if (*s != '}')
8907 return FALSE;
8908 return TRUE;
8909}
d3b0eb15 8910#endif
a687059c
LW
8911
8912/*
fd181c75 8913 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 8914 */
f7819f85 8915#ifdef DEBUGGING
c33269f7 8916static void
7918f24d
NC
8917S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
8918{
f7819f85
A
8919 int bit;
8920 int set=0;
7918f24d 8921
f7819f85
A
8922 for (bit=0; bit<32; bit++) {
8923 if (flags & (1<<bit)) {
8924 if (!set++ && lead)
8925 PerlIO_printf(Perl_debug_log, "%s",lead);
8926 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
8927 }
8928 }
8929 if (lead) {
8930 if (set)
8931 PerlIO_printf(Perl_debug_log, "\n");
8932 else
8933 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
8934 }
8935}
8936#endif
8937
a687059c 8938void
097eb12c 8939Perl_regdump(pTHX_ const regexp *r)
a687059c 8940{
35ff7856 8941#ifdef DEBUGGING
97aff369 8942 dVAR;
c445ea15 8943 SV * const sv = sv_newmortal();
ab3bbdeb 8944 SV *dsv= sv_newmortal();
f8fc2ecf 8945 RXi_GET_DECL(r,ri);
f7819f85 8946 GET_RE_DEBUG_FLAGS_DECL;
a687059c 8947
7918f24d
NC
8948 PERL_ARGS_ASSERT_REGDUMP;
8949
f8fc2ecf 8950 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8951
8952 /* Header fields of interest. */
ab3bbdeb
YO
8953 if (r->anchored_substr) {
8954 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8955 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8956 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8957 "anchored %s%s at %"IVdf" ",
8958 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8959 (IV)r->anchored_offset);
ab3bbdeb
YO
8960 } else if (r->anchored_utf8) {
8961 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8962 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8963 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8964 "anchored utf8 %s%s at %"IVdf" ",
8965 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8966 (IV)r->anchored_offset);
ab3bbdeb
YO
8967 }
8968 if (r->float_substr) {
8969 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8970 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8971 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8972 "floating %s%s at %"IVdf"..%"UVuf" ",
8973 s, RE_SV_TAIL(r->float_substr),
7b0972df 8974 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8975 } else if (r->float_utf8) {
8976 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8977 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8978 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8979 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8980 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8981 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8982 }
33b8afdf 8983 if (r->check_substr || r->check_utf8)
b81d288d 8984 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8985 (const char *)
8986 (r->check_substr == r->float_substr
8987 && r->check_utf8 == r->float_utf8
8988 ? "(checking floating" : "(checking anchored"));
bbe252da 8989 if (r->extflags & RXf_NOSCAN)
c277df42 8990 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8991 if (r->extflags & RXf_CHECK_ALL)
c277df42 8992 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8993 if (r->check_substr || r->check_utf8)
c277df42
IZ
8994 PerlIO_printf(Perl_debug_log, ") ");
8995
f8fc2ecf
YO
8996 if (ri->regstclass) {
8997 regprop(r, sv, ri->regstclass);
1de06328 8998 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8999 }
bbe252da 9000 if (r->extflags & RXf_ANCH) {
774d564b 9001 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 9002 if (r->extflags & RXf_ANCH_BOL)
774d564b 9003 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 9004 if (r->extflags & RXf_ANCH_MBOL)
c277df42 9005 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 9006 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 9007 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 9008 if (r->extflags & RXf_ANCH_GPOS)
774d564b 9009 PerlIO_printf(Perl_debug_log, "(GPOS)");
9010 PerlIO_putc(Perl_debug_log, ' ');
9011 }
bbe252da 9012 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 9013 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 9014 if (r->intflags & PREGf_SKIP)
760ac839 9015 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 9016 if (r->intflags & PREGf_IMPLICIT)
760ac839 9017 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 9018 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 9019 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 9020 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 9021 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 9022 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 9023#else
7918f24d 9024 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 9025 PERL_UNUSED_CONTEXT;
65e66c80 9026 PERL_UNUSED_ARG(r);
17c3b450 9027#endif /* DEBUGGING */
a687059c
LW
9028}
9029
9030/*
a0d0e21e
LW
9031- regprop - printable representation of opcode
9032*/
3339dfd8
YO
9033#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9034STMT_START { \
9035 if (do_sep) { \
9036 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9037 if (flags & ANYOF_INVERT) \
9038 /*make sure the invert info is in each */ \
9039 sv_catpvs(sv, "^"); \
9040 do_sep = 0; \
9041 } \
9042} STMT_END
9043
46fc3d4c 9044void
32fc9b6a 9045Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 9046{
35ff7856 9047#ifdef DEBUGGING
97aff369 9048 dVAR;
9b155405 9049 register int k;
f8fc2ecf 9050 RXi_GET_DECL(prog,progi);
1de06328 9051 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 9052
7918f24d 9053 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 9054
76f68e9b 9055 sv_setpvs(sv, "");
8aa23a47 9056
03363afd 9057 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
9058 /* It would be nice to FAIL() here, but this may be called from
9059 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 9060 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 9061 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 9062
3dab1dad 9063 k = PL_regkind[OP(o)];
9b155405 9064
2a782b5b 9065 if (k == EXACT) {
f92a2122 9066 sv_catpvs(sv, " ");
ab3bbdeb
YO
9067 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9068 * is a crude hack but it may be the best for now since
9069 * we have no flag "this EXACTish node was UTF-8"
9070 * --jhi */
f92a2122
NC
9071 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9072 PERL_PV_ESCAPE_UNI_DETECT |
9073 PERL_PV_PRETTY_ELLIPSES |
9074 PERL_PV_PRETTY_LTGT |
9075 PERL_PV_PRETTY_NOCLEAR
9076 );
bb263b4e 9077 } else if (k == TRIE) {
3dab1dad 9078 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 9079 * progi->data isn't available here */
1de06328 9080 const char op = OP(o);
647f639f 9081 const U32 n = ARG(o);
1de06328 9082 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 9083 (reg_ac_data *)progi->data->data[n] :
1de06328 9084 NULL;
3251b653
NC
9085 const reg_trie_data * const trie
9086 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 9087
13d6edb4 9088 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
9089 DEBUG_TRIE_COMPILE_r(
9090 Perl_sv_catpvf(aTHX_ sv,
9091 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9092 (UV)trie->startstate,
1e2e3d02 9093 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
9094 (UV)trie->wordcount,
9095 (UV)trie->minlen,
9096 (UV)trie->maxlen,
9097 (UV)TRIE_CHARCOUNT(trie),
9098 (UV)trie->uniquecharcount
9099 )
9100 );
9101 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9102 int i;
9103 int rangestart = -1;
f46cb337 9104 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 9105 sv_catpvs(sv, "[");
1de06328
YO
9106 for (i = 0; i <= 256; i++) {
9107 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9108 if (rangestart == -1)
9109 rangestart = i;
9110 } else if (rangestart != -1) {
9111 if (i <= rangestart + 3)
9112 for (; rangestart < i; rangestart++)
9113 put_byte(sv, rangestart);
9114 else {
9115 put_byte(sv, rangestart);
9116 sv_catpvs(sv, "-");
9117 put_byte(sv, i - 1);
9118 }
9119 rangestart = -1;
9120 }
9121 }
f3a2811a 9122 sv_catpvs(sv, "]");
1de06328
YO
9123 }
9124
a3621e74 9125 } else if (k == CURLY) {
cb434fcc 9126 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
9127 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9128 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 9129 }
2c2d71f5
JH
9130 else if (k == WHILEM && o->flags) /* Ordinal/of */
9131 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 9132 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 9133 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 9134 if ( RXp_PAREN_NAMES(prog) ) {
ee9b8eae 9135 if ( k != REF || OP(o) < NREF) {
502c6561 9136 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
9137 SV **name= av_fetch(list, ARG(o), 0 );
9138 if (name)
9139 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9140 }
9141 else {
502c6561 9142 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 9143 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
9144 I32 *nums=(I32*)SvPVX(sv_dat);
9145 SV **name= av_fetch(list, nums[0], 0 );
9146 I32 n;
9147 if (name) {
9148 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9149 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9150 (n ? "," : ""), (IV)nums[n]);
9151 }
9152 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 9153 }
1f1031fe 9154 }
ee9b8eae 9155 }
1f1031fe 9156 } else if (k == GOSUB)
6bda09f9 9157 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
9158 else if (k == VERB) {
9159 if (!o->flags)
9160 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 9161 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 9162 } else if (k == LOGICAL)
04ebc1ab 9163 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
f9a79580 9164 else if (k == FOLDCHAR)
df44d732 9165 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
653099ff
GS
9166 else if (k == ANYOF) {
9167 int i, rangestart = -1;
2d03de9c 9168 const U8 flags = ANYOF_FLAGS(o);
24d786f4 9169 int do_sep = 0;
0bd48802
AL
9170
9171 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9172 static const char * const anyofs[] = {
653099ff
GS
9173 "\\w",
9174 "\\W",
9175 "\\s",
9176 "\\S",
9177 "\\d",
9178 "\\D",
9179 "[:alnum:]",
9180 "[:^alnum:]",
9181 "[:alpha:]",
9182 "[:^alpha:]",
9183 "[:ascii:]",
9184 "[:^ascii:]",
24d786f4
YO
9185 "[:cntrl:]",
9186 "[:^cntrl:]",
653099ff
GS
9187 "[:graph:]",
9188 "[:^graph:]",
9189 "[:lower:]",
9190 "[:^lower:]",
9191 "[:print:]",
9192 "[:^print:]",
9193 "[:punct:]",
9194 "[:^punct:]",
9195 "[:upper:]",
aaa51d5e 9196 "[:^upper:]",
653099ff 9197 "[:xdigit:]",
aaa51d5e
JF
9198 "[:^xdigit:]",
9199 "[:space:]",
9200 "[:^space:]",
9201 "[:blank:]",
9202 "[:^blank:]"
653099ff
GS
9203 };
9204
19860706 9205 if (flags & ANYOF_LOCALE)
396482e1 9206 sv_catpvs(sv, "{loc}");
19860706 9207 if (flags & ANYOF_FOLD)
396482e1 9208 sv_catpvs(sv, "{i}");
653099ff 9209 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 9210 if (flags & ANYOF_INVERT)
396482e1 9211 sv_catpvs(sv, "^");
3339dfd8
YO
9212
9213 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
9214 for (i = 0; i <= 256; i++) {
9215 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9216 if (rangestart == -1)
9217 rangestart = i;
9218 } else if (rangestart != -1) {
9219 if (i <= rangestart + 3)
9220 for (; rangestart < i; rangestart++)
653099ff 9221 put_byte(sv, rangestart);
ffc61ed2
JH
9222 else {
9223 put_byte(sv, rangestart);
396482e1 9224 sv_catpvs(sv, "-");
ffc61ed2 9225 put_byte(sv, i - 1);
653099ff 9226 }
24d786f4 9227 do_sep = 1;
ffc61ed2 9228 rangestart = -1;
653099ff 9229 }
847a199f 9230 }
3339dfd8
YO
9231
9232 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9233 /* output any special charclass tests (used mostly under use locale) */
ffc61ed2 9234 if (o->flags & ANYOF_CLASS)
bb7a0f54 9235 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 9236 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 9237 sv_catpv(sv, anyofs[i]);
24d786f4
YO
9238 do_sep = 1;
9239 }
9240
3339dfd8
YO
9241 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9242
9243 /* output information about the unicode matching */
ffc61ed2 9244 if (flags & ANYOF_UNICODE)
396482e1 9245 sv_catpvs(sv, "{unicode}");
1aa99e6b 9246 else if (flags & ANYOF_UNICODE_ALL)
396482e1 9247 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
9248
9249 {
9250 SV *lv;
32fc9b6a 9251 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 9252
ffc61ed2
JH
9253 if (lv) {
9254 if (sw) {
89ebb4a3 9255 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 9256
ffc61ed2 9257 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 9258 uvchr_to_utf8(s, i);
ffc61ed2 9259
3568d838 9260 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
9261 if (rangestart == -1)
9262 rangestart = i;
9263 } else if (rangestart != -1) {
ffc61ed2
JH
9264 if (i <= rangestart + 3)
9265 for (; rangestart < i; rangestart++) {
2d03de9c
AL
9266 const U8 * const e = uvchr_to_utf8(s,rangestart);
9267 U8 *p;
9268 for(p = s; p < e; p++)
ffc61ed2
JH
9269 put_byte(sv, *p);
9270 }
9271 else {
2d03de9c
AL
9272 const U8 *e = uvchr_to_utf8(s,rangestart);
9273 U8 *p;
9274 for (p = s; p < e; p++)
ffc61ed2 9275 put_byte(sv, *p);
396482e1 9276 sv_catpvs(sv, "-");
2d03de9c
AL
9277 e = uvchr_to_utf8(s, i-1);
9278 for (p = s; p < e; p++)
1df70142 9279 put_byte(sv, *p);
ffc61ed2
JH
9280 }
9281 rangestart = -1;
9282 }
19860706 9283 }
ffc61ed2 9284
396482e1 9285 sv_catpvs(sv, "..."); /* et cetera */
19860706 9286 }
fde631ed 9287
ffc61ed2 9288 {
2e0de35c 9289 char *s = savesvpv(lv);
c445ea15 9290 char * const origs = s;
b81d288d 9291
3dab1dad
YO
9292 while (*s && *s != '\n')
9293 s++;
b81d288d 9294
ffc61ed2 9295 if (*s == '\n') {
2d03de9c 9296 const char * const t = ++s;
ffc61ed2
JH
9297
9298 while (*s) {
9299 if (*s == '\n')
9300 *s = ' ';
9301 s++;
9302 }
9303 if (s[-1] == ' ')
9304 s[-1] = 0;
9305
9306 sv_catpv(sv, t);
fde631ed 9307 }
b81d288d 9308
ffc61ed2 9309 Safefree(origs);
fde631ed
JH
9310 }
9311 }
653099ff 9312 }
ffc61ed2 9313
653099ff
GS
9314 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9315 }
9b155405 9316 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 9317 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 9318#else
96a5add6 9319 PERL_UNUSED_CONTEXT;
65e66c80
SP
9320 PERL_UNUSED_ARG(sv);
9321 PERL_UNUSED_ARG(o);
f9049ba1 9322 PERL_UNUSED_ARG(prog);
17c3b450 9323#endif /* DEBUGGING */
35ff7856 9324}
a687059c 9325
cad2e5aa 9326SV *
288b8c02 9327Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 9328{ /* Assume that RE_INTUIT is set */
97aff369 9329 dVAR;
288b8c02 9330 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 9331 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9332
9333 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
9334 PERL_UNUSED_CONTEXT;
9335
a3621e74 9336 DEBUG_COMPILE_r(
cfd0369c 9337 {
2d03de9c 9338 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 9339 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
9340
9341 if (!PL_colorset) reginitcolors();
9342 PerlIO_printf(Perl_debug_log,
a0288114 9343 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
9344 PL_colors[4],
9345 prog->check_substr ? "" : "utf8 ",
9346 PL_colors[5],PL_colors[0],
cad2e5aa
JH
9347 s,
9348 PL_colors[1],
9349 (strlen(s) > 60 ? "..." : ""));
9350 } );
9351
33b8afdf 9352 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
9353}
9354
84da74a7 9355/*
f8149455 9356 pregfree()
84da74a7 9357
f8149455
YO
9358 handles refcounting and freeing the perl core regexp structure. When
9359 it is necessary to actually free the structure the first thing it
9360 does is call the 'free' method of the regexp_engine associated to to
9361 the regexp, allowing the handling of the void *pprivate; member
9362 first. (This routine is not overridable by extensions, which is why
9363 the extensions free is called first.)
9364
9365 See regdupe and regdupe_internal if you change anything here.
84da74a7 9366*/
f8149455 9367#ifndef PERL_IN_XSUB_RE
2b69d0c2 9368void
84679df5 9369Perl_pregfree(pTHX_ REGEXP *r)
a687059c 9370{
288b8c02
NC
9371 SvREFCNT_dec(r);
9372}
9373
9374void
9375Perl_pregfree2(pTHX_ REGEXP *rx)
9376{
27da23d5 9377 dVAR;
288b8c02 9378 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 9379 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 9380
7918f24d
NC
9381 PERL_ARGS_ASSERT_PREGFREE2;
9382
28d8d7f4
YO
9383 if (r->mother_re) {
9384 ReREFCNT_dec(r->mother_re);
9385 } else {
288b8c02 9386 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 9387 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
9388 }
9389 if (r->substrs) {
ef8d46e8
VP
9390 SvREFCNT_dec(r->anchored_substr);
9391 SvREFCNT_dec(r->anchored_utf8);
9392 SvREFCNT_dec(r->float_substr);
9393 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
9394 Safefree(r->substrs);
9395 }
288b8c02 9396 RX_MATCH_COPY_FREE(rx);
f8c7b90f 9397#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 9398 SvREFCNT_dec(r->saved_copy);
ed252734 9399#endif
f0ab9afb 9400 Safefree(r->offs);
f8149455 9401}
28d8d7f4
YO
9402
9403/* reg_temp_copy()
9404
9405 This is a hacky workaround to the structural issue of match results
9406 being stored in the regexp structure which is in turn stored in
9407 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9408 could be PL_curpm in multiple contexts, and could require multiple
9409 result sets being associated with the pattern simultaneously, such
9410 as when doing a recursive match with (??{$qr})
9411
9412 The solution is to make a lightweight copy of the regexp structure
9413 when a qr// is returned from the code executed by (??{$qr}) this
9414 lightweight copy doesnt actually own any of its data except for
9415 the starp/end and the actual regexp structure itself.
9416
9417*/
9418
9419
84679df5 9420REGEXP *
f0826785 9421Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 9422{
f0826785 9423 struct regexp *ret;
288b8c02 9424 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 9425 register const I32 npar = r->nparens+1;
7918f24d
NC
9426
9427 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9428
f0826785
BM
9429 if (!ret_x)
9430 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9431 ret = (struct regexp *)SvANY(ret_x);
9432
288b8c02 9433 (void)ReREFCNT_inc(rx);
f7c278bf
NC
9434 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9435 by pointing directly at the buffer, but flagging that the allocated
9436 space in the copy is zero. As we've just done a struct copy, it's now
9437 a case of zero-ing that, rather than copying the current length. */
9438 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 9439 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
9440 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9441 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 9442 SvLEN_set(ret_x, 0);
b9ad13ac 9443 SvSTASH_set(ret_x, NULL);
703c388d 9444 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
9445 Newx(ret->offs, npar, regexp_paren_pair);
9446 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 9447 if (r->substrs) {
28d8d7f4 9448 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
9449 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9450
9451 SvREFCNT_inc_void(ret->anchored_substr);
9452 SvREFCNT_inc_void(ret->anchored_utf8);
9453 SvREFCNT_inc_void(ret->float_substr);
9454 SvREFCNT_inc_void(ret->float_utf8);
9455
9456 /* check_substr and check_utf8, if non-NULL, point to either their
9457 anchored or float namesakes, and don't hold a second reference. */
486913e4 9458 }
288b8c02 9459 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 9460#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 9461 ret->saved_copy = NULL;
28d8d7f4 9462#endif
288b8c02 9463 ret->mother_re = rx;
28d8d7f4 9464
288b8c02 9465 return ret_x;
28d8d7f4 9466}
f8149455
YO
9467#endif
9468
9469/* regfree_internal()
9470
9471 Free the private data in a regexp. This is overloadable by
9472 extensions. Perl takes care of the regexp structure in pregfree(),
9473 this covers the *pprivate pointer which technically perldoesnt
9474 know about, however of course we have to handle the
9475 regexp_internal structure when no extension is in use.
9476
9477 Note this is called before freeing anything in the regexp
9478 structure.
9479 */
9480
9481void
288b8c02 9482Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
9483{
9484 dVAR;
288b8c02 9485 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9486 RXi_GET_DECL(r,ri);
9487 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9488
9489 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9490
f8149455
YO
9491 DEBUG_COMPILE_r({
9492 if (!PL_colorset)
9493 reginitcolors();
9494 {
9495 SV *dsv= sv_newmortal();
3c8556c3 9496 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 9497 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
9498 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9499 PL_colors[4],PL_colors[5],s);
9500 }
9501 });
7122b237
YO
9502#ifdef RE_TRACK_PATTERN_OFFSETS
9503 if (ri->u.offsets)
9504 Safefree(ri->u.offsets); /* 20010421 MJD */
9505#endif
f8fc2ecf
YO
9506 if (ri->data) {
9507 int n = ri->data->count;
f3548bdc
DM
9508 PAD* new_comppad = NULL;
9509 PAD* old_comppad;
4026c95a 9510 PADOFFSET refcnt;
dfad63ad 9511
c277df42 9512 while (--n >= 0) {
261faec3 9513 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 9514 switch (ri->data->what[n]) {
c277df42 9515 case 's':
81714fb9 9516 case 'S':
55eed653 9517 case 'u':
ad64d0ec 9518 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 9519 break;
653099ff 9520 case 'f':
f8fc2ecf 9521 Safefree(ri->data->data[n]);
653099ff 9522 break;
dfad63ad 9523 case 'p':
502c6561 9524 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 9525 break;
c277df42 9526 case 'o':
dfad63ad 9527 if (new_comppad == NULL)
cea2e8a9 9528 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
9529 PAD_SAVE_LOCAL(old_comppad,
9530 /* Watch out for global destruction's random ordering. */
c445ea15 9531 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 9532 );
b34c0dd4 9533 OP_REFCNT_LOCK;
f8fc2ecf 9534 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
9535 OP_REFCNT_UNLOCK;
9536 if (!refcnt)
f8fc2ecf 9537 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 9538
f3548bdc 9539 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 9540 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 9541 new_comppad = NULL;
c277df42
IZ
9542 break;
9543 case 'n':
9e55ce06 9544 break;
07be1b83 9545 case 'T':
be8e71aa
YO
9546 { /* Aho Corasick add-on structure for a trie node.
9547 Used in stclass optimization only */
07be1b83 9548 U32 refcount;
f8fc2ecf 9549 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
9550 OP_REFCNT_LOCK;
9551 refcount = --aho->refcount;
9552 OP_REFCNT_UNLOCK;
9553 if ( !refcount ) {
446bd890
NC
9554 PerlMemShared_free(aho->states);
9555 PerlMemShared_free(aho->fail);
446bd890
NC
9556 /* do this last!!!! */
9557 PerlMemShared_free(ri->data->data[n]);
9558 PerlMemShared_free(ri->regstclass);
07be1b83
YO
9559 }
9560 }
9561 break;
a3621e74 9562 case 't':
07be1b83 9563 {
be8e71aa 9564 /* trie structure. */
07be1b83 9565 U32 refcount;
f8fc2ecf 9566 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
9567 OP_REFCNT_LOCK;
9568 refcount = --trie->refcount;
9569 OP_REFCNT_UNLOCK;
9570 if ( !refcount ) {
446bd890 9571 PerlMemShared_free(trie->charmap);
446bd890
NC
9572 PerlMemShared_free(trie->states);
9573 PerlMemShared_free(trie->trans);
07be1b83 9574 if (trie->bitmap)
446bd890 9575 PerlMemShared_free(trie->bitmap);
07be1b83 9576 if (trie->wordlen)
446bd890 9577 PerlMemShared_free(trie->wordlen);
786e8c11 9578 if (trie->jump)
446bd890 9579 PerlMemShared_free(trie->jump);
786e8c11 9580 if (trie->nextword)
446bd890 9581 PerlMemShared_free(trie->nextword);
446bd890
NC
9582 /* do this last!!!! */
9583 PerlMemShared_free(ri->data->data[n]);
a3621e74 9584 }
07be1b83
YO
9585 }
9586 break;
c277df42 9587 default:
f8fc2ecf 9588 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
9589 }
9590 }
f8fc2ecf
YO
9591 Safefree(ri->data->what);
9592 Safefree(ri->data);
a0d0e21e 9593 }
28d8d7f4 9594
f8fc2ecf 9595 Safefree(ri);
a687059c 9596}
c277df42 9597
84da74a7 9598#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
502c6561 9599#define av_dup_inc(s,t) MUTABLE_AV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
85fbaab2 9600#define hv_dup_inc(s,t) MUTABLE_HV(SvREFCNT_inc(sv_dup((const SV *)s,t)))
84da74a7
YO
9601#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
9602
9603/*
32cd70f6 9604 re_dup - duplicate a regexp.
84da74a7 9605
8233f606
DM
9606 This routine is expected to clone a given regexp structure. It is only
9607 compiled under USE_ITHREADS.
32cd70f6 9608
f8149455
YO
9609 After all of the core data stored in struct regexp is duplicated
9610 the regexp_engine.dupe method is used to copy any private data
9611 stored in the *pprivate pointer. This allows extensions to handle
9612 any duplication it needs to do.
9613
9614 See pregfree() and regfree_internal() if you change anything here.
84da74a7 9615*/
a3c0e9ca 9616#if defined(USE_ITHREADS)
f8149455 9617#ifndef PERL_IN_XSUB_RE
288b8c02
NC
9618void
9619Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 9620{
84da74a7 9621 dVAR;
a86a1ca7 9622 I32 npar;
288b8c02
NC
9623 const struct regexp *r = (const struct regexp *)SvANY(sstr);
9624 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 9625
7918f24d
NC
9626 PERL_ARGS_ASSERT_RE_DUP_GUTS;
9627
84da74a7 9628 npar = r->nparens+1;
f0ab9afb
NC
9629 Newx(ret->offs, npar, regexp_paren_pair);
9630 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 9631 if(ret->swap) {
28d8d7f4 9632 /* no need to copy these */
f0ab9afb 9633 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 9634 }
84da74a7 9635
6057429f 9636 if (ret->substrs) {
32cd70f6
NC
9637 /* Do it this way to avoid reading from *r after the StructCopy().
9638 That way, if any of the sv_dup_inc()s dislodge *r from the L1
9639 cache, it doesn't matter. */
66b1de87
NC
9640 const bool anchored = r->check_substr
9641 ? r->check_substr == r->anchored_substr
9642 : r->check_utf8 == r->anchored_utf8;
785a26d5 9643 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
9644 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9645
32cd70f6
NC
9646 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
9647 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
9648 ret->float_substr = sv_dup_inc(ret->float_substr, param);
9649 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 9650
32cd70f6
NC
9651 /* check_substr and check_utf8, if non-NULL, point to either their
9652 anchored or float namesakes, and don't hold a second reference. */
9653
9654 if (ret->check_substr) {
9655 if (anchored) {
9656 assert(r->check_utf8 == r->anchored_utf8);
9657 ret->check_substr = ret->anchored_substr;
9658 ret->check_utf8 = ret->anchored_utf8;
9659 } else {
9660 assert(r->check_substr == r->float_substr);
9661 assert(r->check_utf8 == r->float_utf8);
9662 ret->check_substr = ret->float_substr;
9663 ret->check_utf8 = ret->float_utf8;
9664 }
66b1de87
NC
9665 } else if (ret->check_utf8) {
9666 if (anchored) {
9667 ret->check_utf8 = ret->anchored_utf8;
9668 } else {
9669 ret->check_utf8 = ret->float_utf8;
9670 }
32cd70f6 9671 }
6057429f 9672 }
f8149455 9673
5daac39c 9674 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 9675
6057429f 9676 if (ret->pprivate)
288b8c02 9677 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 9678
288b8c02 9679 if (RX_MATCH_COPIED(dstr))
6057429f 9680 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
9681 else
9682 ret->subbeg = NULL;
9683#ifdef PERL_OLD_COPY_ON_WRITE
9684 ret->saved_copy = NULL;
9685#endif
6057429f 9686
c2123ae3
NC
9687 if (ret->mother_re) {
9688 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
9689 /* Our storage points directly to our mother regexp, but that's
9690 1: a buffer in a different thread
9691 2: something we no longer hold a reference on
9692 so we need to copy it locally. */
9693 /* Note we need to sue SvCUR() on our mother_re, because it, in
9694 turn, may well be pointing to its own mother_re. */
9695 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
9696 SvCUR(ret->mother_re)+1));
9697 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
9698 }
9699 ret->mother_re = NULL;
9700 }
6057429f 9701 ret->gofs = 0;
f8149455
YO
9702}
9703#endif /* PERL_IN_XSUB_RE */
9704
9705/*
9706 regdupe_internal()
9707
9708 This is the internal complement to regdupe() which is used to copy
9709 the structure pointed to by the *pprivate pointer in the regexp.
9710 This is the core version of the extension overridable cloning hook.
9711 The regexp structure being duplicated will be copied by perl prior
9712 to this and will be provided as the regexp *r argument, however
9713 with the /old/ structures pprivate pointer value. Thus this routine
9714 may override any copying normally done by perl.
9715
9716 It returns a pointer to the new regexp_internal structure.
9717*/
9718
9719void *
288b8c02 9720Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
9721{
9722 dVAR;
288b8c02 9723 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
9724 regexp_internal *reti;
9725 int len, npar;
9726 RXi_GET_DECL(r,ri);
7918f24d
NC
9727
9728 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455
YO
9729
9730 npar = r->nparens+1;
7122b237 9731 len = ProgLen(ri);
f8149455 9732
45cf4570 9733 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
9734 Copy(ri->program, reti->program, len+1, regnode);
9735
f8149455 9736
f8fc2ecf 9737 reti->regstclass = NULL;
bcdf7404 9738
f8fc2ecf 9739 if (ri->data) {
84da74a7 9740 struct reg_data *d;
f8fc2ecf 9741 const int count = ri->data->count;
84da74a7
YO
9742 int i;
9743
9744 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
9745 char, struct reg_data);
9746 Newx(d->what, count, U8);
9747
9748 d->count = count;
9749 for (i = 0; i < count; i++) {
f8fc2ecf 9750 d->what[i] = ri->data->what[i];
84da74a7 9751 switch (d->what[i]) {
55eed653 9752 /* legal options are one of: sSfpontTu
84da74a7
YO
9753 see also regcomp.h and pregfree() */
9754 case 's':
81714fb9 9755 case 'S':
0536c0a7 9756 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 9757 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 9758 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 9759 break;
84da74a7
YO
9760 case 'f':
9761 /* This is cheating. */
9762 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 9763 StructCopy(ri->data->data[i], d->data[i],
84da74a7 9764 struct regnode_charclass_class);
f8fc2ecf 9765 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
9766 break;
9767 case 'o':
bbe252da
YO
9768 /* Compiled op trees are readonly and in shared memory,
9769 and can thus be shared without duplication. */
84da74a7 9770 OP_REFCNT_LOCK;
f8fc2ecf 9771 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
9772 OP_REFCNT_UNLOCK;
9773 break;
23eab42c
NC
9774 case 'T':
9775 /* Trie stclasses are readonly and can thus be shared
9776 * without duplication. We free the stclass in pregfree
9777 * when the corresponding reg_ac_data struct is freed.
9778 */
9779 reti->regstclass= ri->regstclass;
9780 /* Fall through */
84da74a7 9781 case 't':
84da74a7 9782 OP_REFCNT_LOCK;
0536c0a7 9783 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 9784 OP_REFCNT_UNLOCK;
0536c0a7
NC
9785 /* Fall through */
9786 case 'n':
9787 d->data[i] = ri->data->data[i];
84da74a7 9788 break;
84da74a7 9789 default:
f8fc2ecf 9790 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
9791 }
9792 }
9793
f8fc2ecf 9794 reti->data = d;
84da74a7
YO
9795 }
9796 else
f8fc2ecf 9797 reti->data = NULL;
84da74a7 9798
cde0cee5
YO
9799 reti->name_list_idx = ri->name_list_idx;
9800
7122b237
YO
9801#ifdef RE_TRACK_PATTERN_OFFSETS
9802 if (ri->u.offsets) {
9803 Newx(reti->u.offsets, 2*len+1, U32);
9804 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
9805 }
9806#else
9807 SetProgLen(reti,len);
9808#endif
9809
f8149455 9810 return (void*)reti;
84da74a7 9811}
f8149455
YO
9812
9813#endif /* USE_ITHREADS */
84da74a7 9814
f8149455 9815#ifndef PERL_IN_XSUB_RE
bcdf7404 9816
c277df42
IZ
9817/*
9818 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9819 */
9820regnode *
864dbfa3 9821Perl_regnext(pTHX_ register regnode *p)
c277df42 9822{
97aff369 9823 dVAR;
c277df42
IZ
9824 register I32 offset;
9825
f8fc2ecf 9826 if (!p)
c277df42
IZ
9827 return(NULL);
9828
9829 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9830 if (offset == 0)
9831 return(NULL);
9832
c277df42 9833 return(p+offset);
c277df42 9834}
76234dfb 9835#endif
c277df42 9836
01f988be 9837STATIC void
cea2e8a9 9838S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9839{
9840 va_list args;
9841 STRLEN l1 = strlen(pat1);
9842 STRLEN l2 = strlen(pat2);
9843 char buf[512];
06bf62c7 9844 SV *msv;
73d840c0 9845 const char *message;
c277df42 9846
7918f24d
NC
9847 PERL_ARGS_ASSERT_RE_CROAK2;
9848
c277df42
IZ
9849 if (l1 > 510)
9850 l1 = 510;
9851 if (l1 + l2 > 510)
9852 l2 = 510 - l1;
9853 Copy(pat1, buf, l1 , char);
9854 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9855 buf[l1 + l2] = '\n';
9856 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9857#ifdef I_STDARG
9858 /* ANSI variant takes additional second argument */
c277df42 9859 va_start(args, pat2);
8736538c
AS
9860#else
9861 va_start(args);
9862#endif
5a844595 9863 msv = vmess(buf, &args);
c277df42 9864 va_end(args);
cfd0369c 9865 message = SvPV_const(msv,l1);
c277df42
IZ
9866 if (l1 > 512)
9867 l1 = 512;
9868 Copy(message, buf, l1 , char);
197cf9b9 9869 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9870 Perl_croak(aTHX_ "%s", buf);
c277df42 9871}
a0ed51b3
LW
9872
9873/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9874
76234dfb 9875#ifndef PERL_IN_XSUB_RE
a0ed51b3 9876void
864dbfa3 9877Perl_save_re_context(pTHX)
b81d288d 9878{
97aff369 9879 dVAR;
1ade1aa1
NC
9880
9881 struct re_save_state *state;
9882
9883 SAVEVPTR(PL_curcop);
9884 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9885
9886 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9887 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9888 SSPUSHINT(SAVEt_RE_STATE);
9889
46ab3289 9890 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9891
a0ed51b3 9892 PL_reg_start_tmp = 0;
a0ed51b3 9893 PL_reg_start_tmpl = 0;
c445ea15 9894 PL_reg_oldsaved = NULL;
a5db57d6 9895 PL_reg_oldsavedlen = 0;
a5db57d6 9896 PL_reg_maxiter = 0;
a5db57d6 9897 PL_reg_leftiter = 0;
c445ea15 9898 PL_reg_poscache = NULL;
a5db57d6 9899 PL_reg_poscache_size = 0;
1ade1aa1
NC
9900#ifdef PERL_OLD_COPY_ON_WRITE
9901 PL_nrs = NULL;
9902#endif
ada6e8a9 9903
c445ea15
AL
9904 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9905 if (PL_curpm) {
9906 const REGEXP * const rx = PM_GETRE(PL_curpm);
9907 if (rx) {
1df70142 9908 U32 i;
07bc277f 9909 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 9910 char digits[TYPE_CHARS(long)];
d9fad198 9911 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9912 GV *const *const gvp
9913 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9914
b37c2d43
AL
9915 if (gvp) {
9916 GV * const gv = *gvp;
9917 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9918 save_scalar(gv);
49f27e4b 9919 }
ada6e8a9
AMS
9920 }
9921 }
9922 }
a0ed51b3 9923}
76234dfb 9924#endif
51371543 9925
51371543 9926static void
acfe0abc 9927clear_re(pTHX_ void *r)
51371543 9928{
97aff369 9929 dVAR;
84679df5 9930 ReREFCNT_dec((REGEXP *)r);
51371543 9931}
ffbc6a93 9932
a28509cc
AL
9933#ifdef DEBUGGING
9934
9935STATIC void
9936S_put_byte(pTHX_ SV *sv, int c)
9937{
7918f24d
NC
9938 PERL_ARGS_ASSERT_PUT_BYTE;
9939
7fddd944
NC
9940 /* Our definition of isPRINT() ignores locales, so only bytes that are
9941 not part of UTF-8 are considered printable. I assume that the same
9942 holds for UTF-EBCDIC.
9943 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
9944 which Wikipedia says:
9945
9946 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
9947 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
9948 identical, to the ASCII delete (DEL) or rubout control character.
9949 ) So the old condition can be simplified to !isPRINT(c) */
9950 if (!isPRINT(c))
a28509cc 9951 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5e7aa789 9952 else {
88c9ea1e 9953 const char string = c;
5e7aa789
NC
9954 if (c == '-' || c == ']' || c == '\\' || c == '^')
9955 sv_catpvs(sv, "\\");
9956 sv_catpvn(sv, &string, 1);
9957 }
a28509cc
AL
9958}
9959
786e8c11 9960
3dab1dad
YO
9961#define CLEAR_OPTSTART \
9962 if (optstart) STMT_START { \
70685ca0 9963 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9964 optstart=NULL; \
9965 } STMT_END
9966
786e8c11 9967#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9968
b5a2f8d8
NC
9969STATIC const regnode *
9970S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9971 const regnode *last, const regnode *plast,
9972 SV* sv, I32 indent, U32 depth)
a28509cc 9973{
97aff369 9974 dVAR;
786e8c11 9975 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9976 register const regnode *next;
3dab1dad 9977 const regnode *optstart= NULL;
1f1031fe 9978
f8fc2ecf 9979 RXi_GET_DECL(r,ri);
3dab1dad 9980 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
9981
9982 PERL_ARGS_ASSERT_DUMPUNTIL;
9983
786e8c11
YO
9984#ifdef DEBUG_DUMPUNTIL
9985 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9986 last ? last-start : 0,plast ? plast-start : 0);
9987#endif
9988
9989 if (plast && plast < last)
9990 last= plast;
9991
9992 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9993 /* While that wasn't END last time... */
a28509cc
AL
9994 NODE_ALIGN(node);
9995 op = OP(node);
de734bd5 9996 if (op == CLOSE || op == WHILEM)
786e8c11 9997 indent--;
b5a2f8d8 9998 next = regnext((regnode *)node);
1f1031fe 9999
a28509cc 10000 /* Where, what. */
8e11feef 10001 if (OP(node) == OPTIMIZED) {
e68ec53f 10002 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 10003 optstart = node;
3dab1dad 10004 else
8e11feef 10005 goto after_print;
3dab1dad
YO
10006 } else
10007 CLEAR_OPTSTART;
1f1031fe 10008
32fc9b6a 10009 regprop(r, sv, node);
a28509cc 10010 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 10011 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
10012
10013 if (OP(node) != OPTIMIZED) {
10014 if (next == NULL) /* Next ptr. */
10015 PerlIO_printf(Perl_debug_log, " (0)");
10016 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10017 PerlIO_printf(Perl_debug_log, " (FAIL)");
10018 else
10019 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10020 (void)PerlIO_putc(Perl_debug_log, '\n');
10021 }
10022
a28509cc
AL
10023 after_print:
10024 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
10025 assert(next);
10026 {
10027 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
10028 ? regnext((regnode *)next)
10029 : next);
be8e71aa
YO
10030 if (last && nnode > last)
10031 nnode = last;
786e8c11 10032 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 10033 }
a28509cc
AL
10034 }
10035 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 10036 assert(next);
786e8c11 10037 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
10038 }
10039 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 10040 const regnode *this_trie = node;
1de06328 10041 const char op = OP(node);
647f639f 10042 const U32 n = ARG(node);
1de06328 10043 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 10044 (reg_ac_data *)ri->data->data[n] :
1de06328 10045 NULL;
3251b653
NC
10046 const reg_trie_data * const trie =
10047 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 10048#ifdef DEBUGGING
502c6561 10049 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 10050#endif
786e8c11 10051 const regnode *nextbranch= NULL;
a28509cc 10052 I32 word_idx;
76f68e9b 10053 sv_setpvs(sv, "");
786e8c11 10054 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 10055 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
10056
10057 PerlIO_printf(Perl_debug_log, "%*s%s ",
10058 (int)(2*(indent+3)), "",
10059 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
10060 PL_colors[0], PL_colors[1],
10061 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 10062 PERL_PV_PRETTY_ELLIPSES |
7f69552c 10063 PERL_PV_PRETTY_LTGT
786e8c11
YO
10064 )
10065 : "???"
10066 );
10067 if (trie->jump) {
40d049e4 10068 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
10069 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10070 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
10071 if (dist) {
10072 if (!nextbranch)
24b23f37 10073 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
10074 DUMPUNTIL(this_trie + dist, nextbranch);
10075 }
786e8c11
YO
10076 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10077 nextbranch= regnext((regnode *)nextbranch);
10078 } else {
10079 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 10080 }
786e8c11
YO
10081 }
10082 if (last && next > last)
10083 node= last;
10084 else
10085 node= next;
a28509cc 10086 }
786e8c11
YO
10087 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10088 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10089 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
10090 }
10091 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 10092 assert(next);
786e8c11 10093 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
10094 }
10095 else if ( op == PLUS || op == STAR) {
786e8c11 10096 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
10097 }
10098 else if (op == ANYOF) {
10099 /* arglen 1 + class block */
10100 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
10101 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10102 node = NEXTOPER(node);
10103 }
10104 else if (PL_regkind[(U8)op] == EXACT) {
10105 /* Literal string, where present. */
10106 node += NODE_SZ_STR(node) - 1;
10107 node = NEXTOPER(node);
10108 }
10109 else {
10110 node = NEXTOPER(node);
10111 node += regarglen[(U8)op];
10112 }
10113 if (op == CURLYX || op == OPEN)
786e8c11 10114 indent++;
a28509cc 10115 }
3dab1dad 10116 CLEAR_OPTSTART;
786e8c11 10117#ifdef DEBUG_DUMPUNTIL
70685ca0 10118 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 10119#endif
1de06328 10120 return node;
a28509cc
AL
10121}
10122
10123#endif /* DEBUGGING */
10124
241d1a3b
NC
10125/*
10126 * Local variables:
10127 * c-indentation-style: bsd
10128 * c-basic-offset: 4
10129 * indent-tabs-mode: t
10130 * End:
10131 *
37442d52
RGS
10132 * ex: set ts=8 sts=4 sw=4 noet:
10133 */